aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_addr.c5
-rw-r--r--fs/afs/file.c10
-rw-r--r--fs/btrfs/disk-io.c3
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--fs/btrfs/inode.c3
-rw-r--r--fs/buffer.c21
-rw-r--r--fs/cachefiles/interface.c13
-rw-r--r--fs/cachefiles/namei.c10
-rw-r--r--fs/cachefiles/xattr.c6
-rw-r--r--fs/ceph/addr.c15
-rw-r--r--fs/cifs/file.c5
-rw-r--r--fs/dcache.c11
-rw-r--r--fs/debugfs/file.c43
-rw-r--r--fs/dlm/config.c5
-rw-r--r--fs/dlm/lock.c8
-rw-r--r--fs/dlm/lockspace.c9
-rw-r--r--fs/dlm/lowcomms.c177
-rw-r--r--fs/exec.c16
-rw-r--r--fs/exofs/inode.c6
-rw-r--r--fs/ext3/inode.c9
-rw-r--r--fs/ext3/namei.c7
-rw-r--r--fs/ext4/balloc.c14
-rw-r--r--fs/ext4/ext4.h187
-rw-r--r--fs/ext4/ext4_jbd2.c58
-rw-r--r--fs/ext4/ext4_jbd2.h29
-rw-r--r--fs/ext4/extents.c193
-rw-r--r--fs/ext4/extents_status.c75
-rw-r--r--fs/ext4/extents_status.h5
-rw-r--r--fs/ext4/file.c14
-rw-r--r--fs/ext4/fsync.c52
-rw-r--r--fs/ext4/ialloc.c3
-rw-r--r--fs/ext4/indirect.c40
-rw-r--r--fs/ext4/inline.c4
-rw-r--r--fs/ext4/inode.c1751
-rw-r--r--fs/ext4/mballoc.c21
-rw-r--r--fs/ext4/move_extent.c3
-rw-r--r--fs/ext4/namei.c7
-rw-r--r--fs/ext4/page-io.c325
-rw-r--r--fs/ext4/resize.c24
-rw-r--r--fs/ext4/super.c155
-rw-r--r--fs/f2fs/Kconfig12
-rw-r--r--fs/f2fs/acl.c2
-rw-r--r--fs/f2fs/checkpoint.c99
-rw-r--r--fs/f2fs/data.c71
-rw-r--r--fs/f2fs/debug.c4
-rw-r--r--fs/f2fs/dir.c109
-rw-r--r--fs/f2fs/f2fs.h66
-rw-r--r--fs/f2fs/file.c58
-rw-r--r--fs/f2fs/gc.c42
-rw-r--r--fs/f2fs/inode.c13
-rw-r--r--fs/f2fs/namei.c17
-rw-r--r--fs/f2fs/node.c37
-rw-r--r--fs/f2fs/node.h68
-rw-r--r--fs/f2fs/recovery.c150
-rw-r--r--fs/f2fs/segment.c101
-rw-r--r--fs/f2fs/super.c253
-rw-r--r--fs/f2fs/xattr.c68
-rw-r--r--fs/f2fs/xattr.h24
-rw-r--r--fs/fs-writeback.c9
-rw-r--r--fs/fscache/cache.c34
-rw-r--r--fs/fscache/cookie.c93
-rw-r--r--fs/fscache/fsdef.c1
-rw-r--r--fs/fscache/internal.h11
-rw-r--r--fs/fscache/main.c11
-rw-r--r--fs/fscache/netfs.c1
-rw-r--r--fs/fscache/object-list.c103
-rw-r--r--fs/fscache/object.c1106
-rw-r--r--fs/fscache/operation.c37
-rw-r--r--fs/fscache/page.c65
-rw-r--r--fs/fuse/file.c12
-rw-r--r--fs/gfs2/Kconfig5
-rw-r--r--fs/gfs2/aops.c17
-rw-r--r--fs/gfs2/bmap.c4
-rw-r--r--fs/gfs2/dir.c26
-rw-r--r--fs/gfs2/dir.h3
-rw-r--r--fs/gfs2/file.c69
-rw-r--r--fs/gfs2/glops.c8
-rw-r--r--fs/gfs2/inode.c150
-rw-r--r--fs/gfs2/inode.h1
-rw-r--r--fs/gfs2/log.c78
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/lops.c22
-rw-r--r--fs/gfs2/lops.h1
-rw-r--r--fs/gfs2/meta_io.c4
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/quota.c7
-rw-r--r--fs/gfs2/rgrp.c14
-rw-r--r--fs/gfs2/trans.c9
-rw-r--r--fs/jbd/transaction.c19
-rw-r--r--fs/jbd2/Kconfig6
-rw-r--r--fs/jbd2/checkpoint.c22
-rw-r--r--fs/jbd2/commit.c184
-rw-r--r--fs/jbd2/journal.c166
-rw-r--r--fs/jbd2/recovery.c11
-rw-r--r--fs/jbd2/revoke.c49
-rw-r--r--fs/jbd2/transaction.c526
-rw-r--r--fs/jfs/jfs_metapage.c5
-rw-r--r--fs/logfs/file.c3
-rw-r--r--fs/logfs/segment.c3
-rw-r--r--fs/nfs/file.c8
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ocfs2/aops.c5
-rw-r--r--fs/reiserfs/inode.c12
-rw-r--r--fs/splice.c1
-rw-r--r--fs/sysfs/dir.c2
-rw-r--r--fs/sysfs/file.c10
-rw-r--r--fs/sysfs/inode.c2
-rw-r--r--fs/ubifs/file.c5
-rw-r--r--fs/xfs/xfs_aops.c14
-rw-r--r--fs/xfs/xfs_trace.h15
110 files changed, 4206 insertions, 3313 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 055562c580b4..9ff073f4090a 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -148,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
* @offset: offset in the page
*/
-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
/*
* If called with zero offset, we should release
* the private state assocated with the page
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
v9fs_fscache_invalidate_page(page);
}
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f6e9234d565..66d50fe2ee45 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,8 @@
#include "internal.h"
static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
static int afs_launder_page(struct page *page);
@@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page)
* - release a page and clean up its private data if offset is 0 (indicating
* the entire page)
*/
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
- _enter("{%lu},%lu", page->index, offset);
+ _enter("{%lu},%u,%u", page->index, offset, length);
BUG_ON(!PageLocked(page));
/* we clean up only if the entire page is being invalidated */
- if (offset == 0) {
+ if (offset == 0 && length == PAGE_CACHE_SIZE) {
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page)) {
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b8b60b660c8f..b0292b3ead54 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1013,7 +1013,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
return try_release_extent_buffer(page);
}
-static void btree_invalidatepage(struct page *page, unsigned long offset)
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e7e7afb4a872..6bca9472f313 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2957,7 +2957,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
return 0;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a46b656d08de..4f9d16b70d3d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7493,7 +7493,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
}
-static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *tree;
diff --git a/fs/buffer.c b/fs/buffer.c
index d2a4d1bb2d57..f93392e2df12 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1454,7 +1454,8 @@ static void discard_buffer(struct buffer_head * bh)
* block_invalidatepage - invalidate part or all of a buffer-backed page
*
* @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
*
* block_invalidatepage() is called when all or part of the page has become
* invalidated by a truncate operation.
@@ -1465,15 +1466,22 @@ static void discard_buffer(struct buffer_head * bh)
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
+ unsigned int stop = length + offset;
BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
goto out;
+ /*
+ * Check for overflow
+ */
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
head = page_buffers(page);
bh = head;
do {
@@ -1481,6 +1489,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
next = bh->b_this_page;
/*
+ * Are we still fully in range ?
+ */
+ if (next_off > stop)
+ goto out;
+
+ /*
* is this block fully invalidated?
*/
if (offset <= curr_off)
@@ -1501,6 +1515,7 @@ out:
}
EXPORT_SYMBOL(block_invalidatepage);
+
/*
* We attach and possibly dirty the buffers atomically wrt
* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
@@ -2841,7 +2856,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
* they may have been added in ext3_writepage(). Make them
* freeable here, so the page does not leak.
*/
- do_invalidatepage(page, 0);
+ do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 746ce532e130..d4c1206af9fc 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -13,8 +13,6 @@
#include <linux/mount.h>
#include "internal.h"
-#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
-
struct cachefiles_lookup_data {
struct cachefiles_xattr *auxdata; /* auxiliary data */
char *key; /* key path */
@@ -212,20 +210,29 @@ static void cachefiles_update_object(struct fscache_object *_object)
object = container_of(_object, struct cachefiles_object, fscache);
cache = container_of(object->fscache.cache, struct cachefiles_cache,
cache);
+
+ if (!fscache_use_cookie(_object)) {
+ _leave(" [relinq]");
+ return;
+ }
+
cookie = object->fscache.cookie;
if (!cookie->def->get_aux) {
+ fscache_unuse_cookie(_object);
_leave(" [no aux]");
return;
}
auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
if (!auxdata) {
+ fscache_unuse_cookie(_object);
_leave(" [nomem]");
return;
}
auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+ fscache_unuse_cookie(_object);
ASSERTCMP(auxlen, <, 511);
auxdata->len = auxlen + 1;
@@ -263,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
#endif
/* delete retired objects */
- if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
+ if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
_object != cache->cache.fsdef
) {
_debug("- retire object OBJ%x", object->fscache.debug_id);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 8c01c5fcdf75..25badd1aec5c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -38,7 +38,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
printk(KERN_ERR "%sobject: OBJ%x\n",
prefix, object->fscache.debug_id);
printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
- prefix, fscache_object_states[object->fscache.state],
+ prefix, object->fscache.state->name,
object->fscache.flags, work_busy(&object->fscache.work),
object->fscache.events, object->fscache.event_mask);
printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -127,10 +127,10 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
found_dentry:
kdebug("preemptive burial: OBJ%x [%s] %p",
object->fscache.debug_id,
- fscache_object_states[object->fscache.state],
+ object->fscache.state->name,
dentry);
- if (object->fscache.state < FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_live(&object->fscache)) {
printk(KERN_ERR "\n");
printk(KERN_ERR "CacheFiles: Error:"
" Can't preemptively bury live object\n");
@@ -192,7 +192,7 @@ try_again:
/* an old object from a previous incarnation is hogging the slot - we
* need to wait for it to be destroyed */
wait_for_old_object:
- if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_live(&object->fscache)) {
printk(KERN_ERR "\n");
printk(KERN_ERR "CacheFiles: Error:"
" Unexpected object collision\n");
@@ -836,7 +836,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
// dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
/* look up the victim */
- mutex_lock_nested(&dir->d_inode->i_mutex, 1);
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
start = jiffies;
victim = lookup_one_len(filename, dir, strlen(filename));
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 73b46288b54b..2476e5162609 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(object->fscache.cookie);
ASSERT(dentry);
_enter("%p,#%d", object, auxdata->len);
/* attempt to install the cache metadata directly */
- _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+ _debug("SET #%u", auxdata->len);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
@@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
struct dentry *dentry = object->dentry;
int ret;
- ASSERT(object->fscache.cookie);
ASSERT(dentry);
_enter("%p,#%d", object, auxdata->len);
/* attempt to install the cache metadata directly */
- _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+ _debug("SET #%u", auxdata->len);
ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
&auxdata->type, auxdata->len,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3e68ac101040..38b5c1bc6776 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page)
* dirty page counters appropriately. Only called if there is private
* data on the page.
*/
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode;
struct ceph_inode_info *ci;
@@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
if (!PageDirty(page))
pr_err("%p invalidatepage %p page not dirty\n", inode, page);
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
ci = ceph_inode(inode);
- if (offset == 0) {
- dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
- inode, page, page->index, offset);
+ if (offset == 0 && length == PAGE_CACHE_SIZE) {
+ dout("%p invalidatepage %p idx %lu full dirty page\n",
+ inode, page, page->index);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc);
page->private = 0;
ClearPagePrivate(page);
} else {
- dout("%p invalidatepage %p idx %lu partial dirty page\n",
- inode, page, page->index);
+ dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
+ inode, page, page->index, offset, length);
}
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0630710a9c3f..91d8629e69a2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3547,11 +3547,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
return cifs_fscache_release_page(page, gfp);
}
-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 3199fe6863a8..87bdb5329c3c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1612,6 +1612,10 @@ EXPORT_SYMBOL(d_obtain_alias);
* If a dentry was found and moved, then it is returned. Otherwise NULL
* is returned. This matches the expected return value of ->lookup.
*
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
*/
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
@@ -1636,8 +1640,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
security_d_instantiate(dentry, inode);
d_rehash(dentry);
}
- } else
- d_add(dentry, inode);
+ } else {
+ d_instantiate(dentry, inode);
+ if (d_unhashed(dentry))
+ d_rehash(dentry);
+ }
return new;
}
EXPORT_SYMBOL(d_splice_alias);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index c5ca6ae5a30c..63146295153b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -21,6 +21,7 @@
#include <linux/debugfs.h>
#include <linux/io.h>
#include <linux/slab.h>
+#include <linux/atomic.h>
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -403,6 +404,47 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
+static int debugfs_atomic_t_set(void *data, u64 val)
+{
+ atomic_set((atomic_t *)data, val);
+ return 0;
+}
+static int debugfs_atomic_t_get(void *data, u64 *val)
+{
+ *val = atomic_read((atomic_t *)data);
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+ debugfs_atomic_t_set, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
+
+/**
+ * debugfs_create_atomic_t - create a debugfs file that is used to read and
+ * write an atomic_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ * from.
+ */
+struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
+ struct dentry *parent, atomic_t *value)
+{
+ /* if there are no write bits set, make read only */
+ if (!(mode & S_IWUGO))
+ return debugfs_create_file(name, mode, parent, value,
+ &fops_atomic_t_ro);
+ /* if there are no read bits set, make write only */
+ if (!(mode & S_IRUGO))
+ return debugfs_create_file(name, mode, parent, value,
+ &fops_atomic_t_wo);
+
+ return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
static ssize_t read_file_bool(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
@@ -431,6 +473,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
+ buf[buf_size] = '\0';
if (strtobool(buf, &bv) == 0)
*val = bv;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 7d58d5b112b5..76feb4b60fa6 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
const char *buf, size_t len)
{
- strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
- strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
+ strlcpy(dlm_config.ci_cluster_name, buf,
+ sizeof(dlm_config.ci_cluster_name));
+ strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
return len;
}
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 1b1146670c4b..e223a911a834 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2038,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
if (b == 1) {
int len = receive_extralen(ms);
- if (len > DLM_RESNAME_MAXLEN)
- len = DLM_RESNAME_MAXLEN;
+ if (len > r->res_ls->ls_lvblen)
+ len = r->res_ls->ls_lvblen;
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
lkb->lkb_lvbseq = ms->m_lvbseq;
}
@@ -3893,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (!lkb->lkb_lvbptr)
return -ENOMEM;
len = receive_extralen(ms);
- if (len > DLM_RESNAME_MAXLEN)
- len = DLM_RESNAME_MAXLEN;
+ if (len > ls->ls_lvblen)
+ len = ls->ls_lvblen;
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
}
return 0;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 3ca79d3253b9..88556dc0458e 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -883,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force)
void dlm_stop_lockspaces(void)
{
struct dlm_ls *ls;
+ int count;
restart:
+ count = 0;
spin_lock(&lslist_lock);
list_for_each_entry(ls, &lslist, ls_list) {
- if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
+ if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) {
+ count++;
continue;
+ }
spin_unlock(&lslist_lock);
log_error(ls, "no userland control daemon, stopping lockspace");
dlm_ls_stop(ls);
goto restart;
}
spin_unlock(&lslist_lock);
+
+ if (count)
+ log_print("dlm user daemon left %d lockspaces", count);
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d0ccd2fd79eb..d90909ec6aa6 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -52,7 +52,6 @@
#include <linux/mutex.h>
#include <linux/sctp.h>
#include <linux/slab.h>
-#include <linux/sctp.h>
#include <net/sctp/sctp.h>
#include <net/ipv6.h>
@@ -126,6 +125,7 @@ struct connection {
struct connection *othercon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
+ bool try_new_addr;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -144,6 +144,7 @@ struct dlm_node_addr {
struct list_head list;
int nodeid;
int addr_count;
+ int curr_addr_index;
struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
};
@@ -310,7 +311,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
}
static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
- struct sockaddr *sa_out)
+ struct sockaddr *sa_out, bool try_new_addr)
{
struct sockaddr_storage sas;
struct dlm_node_addr *na;
@@ -320,8 +321,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
spin_lock(&dlm_node_addrs_spin);
na = find_node_addr(nodeid);
- if (na && na->addr_count)
- memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage));
+ if (na && na->addr_count) {
+ if (try_new_addr) {
+ na->curr_addr_index++;
+ if (na->curr_addr_index == na->addr_count)
+ na->curr_addr_index = 0;
+ }
+
+ memcpy(&sas, na->addr[na->curr_addr_index ],
+ sizeof(struct sockaddr_storage));
+ }
spin_unlock(&dlm_node_addrs_spin);
if (!na)
@@ -353,19 +362,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
{
struct dlm_node_addr *na;
int rv = -EEXIST;
+ int addr_i;
spin_lock(&dlm_node_addrs_spin);
list_for_each_entry(na, &dlm_node_addrs, list) {
if (!na->addr_count)
continue;
- if (!addr_compare(na->addr[0], addr))
- continue;
-
- *nodeid = na->nodeid;
- rv = 0;
- break;
+ for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
+ if (addr_compare(na->addr[addr_i], addr)) {
+ *nodeid = na->nodeid;
+ rv = 0;
+ goto unlock;
+ }
+ }
}
+unlock:
spin_unlock(&dlm_node_addrs_spin);
return rv;
}
@@ -561,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
static void sctp_init_failed_foreach(struct connection *con)
{
+
+ /*
+ * Don't try to recover base con and handle race where the
+ * other node's assoc init creates a assoc and we get that
+ * notification, then we get a notification that our attempt
+ * failed due. This happens when we are still trying the primary
+ * address, but the other node has already tried secondary addrs
+ * and found one that worked.
+ */
+ if (!con->nodeid || con->sctp_assoc)
+ return;
+
+ log_print("Retrying SCTP association init for node %d\n", con->nodeid);
+
+ con->try_new_addr = true;
con->sctp_assoc = 0;
- if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+ if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
@@ -579,15 +606,56 @@ static void sctp_init_failed(void)
mutex_unlock(&connections_lock);
}
+static void retry_failed_sctp_send(struct connection *recv_con,
+ struct sctp_send_failed *sn_send_failed,
+ char *buf)
+{
+ int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
+ struct dlm_mhandle *mh;
+ struct connection *con;
+ char *retry_buf;
+ int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
+
+ log_print("Retry sending %d bytes to node id %d", len, nodeid);
+
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ log_print("Could not look up con for nodeid %d\n",
+ nodeid);
+ return;
+ }
+
+ mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
+ if (!mh) {
+ log_print("Could not allocate buf for retry.");
+ return;
+ }
+ memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
+ dlm_lowcomms_commit_buffer(mh);
+
+ /*
+ * If we got a assoc changed event before the send failed event then
+ * we only need to retry the send.
+ */
+ if (con->sctp_assoc) {
+ if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+ queue_work(send_workqueue, &con->swork);
+ } else
+ sctp_init_failed_foreach(con);
+}
+
/* Something happened to an association */
static void process_sctp_notification(struct connection *con,
struct msghdr *msg, char *buf)
{
union sctp_notification *sn = (union sctp_notification *)buf;
- if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+ switch (sn->sn_header.sn_type) {
+ case SCTP_SEND_FAILED:
+ retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
+ break;
+ case SCTP_ASSOC_CHANGE:
switch (sn->sn_assoc_change.sac_state) {
-
case SCTP_COMM_UP:
case SCTP_RESTART:
{
@@ -662,9 +730,11 @@ static void process_sctp_notification(struct connection *con,
log_print("connecting to %d sctp association %d",
nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
+ new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
+ new_con->try_new_addr = false;
/* Send any pending writes */
clear_bit(CF_CONNECT_PENDING, &new_con->flags);
- clear_bit(CF_INIT_PENDING, &con->flags);
+ clear_bit(CF_INIT_PENDING, &new_con->flags);
if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
queue_work(send_workqueue, &new_con->swork);
}
@@ -683,14 +753,10 @@ static void process_sctp_notification(struct connection *con,
}
break;
- /* We don't know which INIT failed, so clear the PENDING flags
- * on them all. if assoc_id is zero then it will then try
- * again */
-
case SCTP_CANT_STR_ASSOC:
{
+ /* Will retry init when we get the send failed notification */
log_print("Can't start SCTP association - retrying");
- sctp_init_failed();
}
break;
@@ -699,6 +765,8 @@ static void process_sctp_notification(struct connection *con,
(int)sn->sn_assoc_change.sac_assoc_id,
sn->sn_assoc_change.sac_state);
}
+ default:
+ ; /* fall through */
}
}
@@ -958,6 +1026,24 @@ static void free_entry(struct writequeue_entry *e)
kfree(e);
}
+/*
+ * writequeue_entry_complete - try to delete and free write queue entry
+ * @e: write queue entry to try to delete
+ * @completed: bytes completed
+ *
+ * writequeue_lock must be held.
+ */
+static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
+{
+ e->offset += completed;
+ e->len -= completed;
+
+ if (e->len == 0 && e->users == 0) {
+ list_del(&e->list);
+ free_entry(e);
+ }
+}
+
/* Initiate an SCTP association.
This is a special case of send_to_sock() in that we don't yet have a
peeled-off socket for this association, so we use the listening socket
@@ -977,15 +1063,14 @@ static void sctp_init_assoc(struct connection *con)
int addrlen;
struct kvec iov[1];
+ mutex_lock(&con->sock_mutex);
if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
- return;
-
- if (con->retries++ > MAX_CONNECT_RETRIES)
- return;
+ goto unlock;
- if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) {
+ if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
+ con->try_new_addr)) {
log_print("no address for nodeid %d", con->nodeid);
- return;
+ goto unlock;
}
base_con = nodeid2con(0, 0);
BUG_ON(base_con == NULL);
@@ -1003,17 +1088,25 @@ static void sctp_init_assoc(struct connection *con)
if (list_empty(&con->writequeue)) {
spin_unlock(&con->writequeue_lock);
log_print("writequeue empty for nodeid %d", con->nodeid);
- return;
+ goto unlock;
}
e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
len = e->len;
offset = e->offset;
- spin_unlock(&con->writequeue_lock);
/* Send the first block off the write queue */
iov[0].iov_base = page_address(e->page)+offset;
iov[0].iov_len = len;
+ spin_unlock(&con->writequeue_lock);
+
+ if (rem_addr.ss_family == AF_INET) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
+ log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
+ } else {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
+ log_print("Trying to connect to %pI6", &sin6->sin6_addr);
+ }
cmsg = CMSG_FIRSTHDR(&outmessage);
cmsg->cmsg_level = IPPROTO_SCTP;
@@ -1021,8 +1114,9 @@ static void sctp_init_assoc(struct connection *con)
cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
sinfo = CMSG_DATA(cmsg);
memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
- sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
+ sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
outmessage.msg_controllen = cmsg->cmsg_len;
+ sinfo->sinfo_flags |= SCTP_ADDR_OVER;
ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
if (ret < 0) {
@@ -1035,15 +1129,12 @@ static void sctp_init_assoc(struct connection *con)
}
else {
spin_lock(&con->writequeue_lock);
- e->offset += ret;
- e->len -= ret;
-
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
- free_entry(e);
- }
+ writequeue_entry_complete(e, ret);
spin_unlock(&con->writequeue_lock);
}
+
+unlock:
+ mutex_unlock(&con->sock_mutex);
}
/* Connect a new socket to its peer */
@@ -1075,7 +1166,7 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
memset(&saddr, 0, sizeof(saddr));
- result = nodeid_to_addr(con->nodeid, &saddr, NULL);
+ result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
if (result < 0) {
log_print("no address for nodeid %d", con->nodeid);
goto out_err;
@@ -1254,6 +1345,7 @@ static int sctp_listen_for_all(void)
int result = -EINVAL, num = 1, i, addr_len;
struct connection *con = nodeid2con(0, GFP_NOFS);
int bufsize = NEEDED_RMEM;
+ int one = 1;
if (!con)
return -ENOMEM;
@@ -1288,6 +1380,11 @@ static int sctp_listen_for_all(void)
goto create_delsock;
}
+ result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
+ sizeof(one));
+ if (result < 0)
+ log_print("Could not set SCTP NODELAY error %d\n", result);
+
/* Init con struct */
sock->sk->sk_user_data = con;
con->sock = sock;
@@ -1493,13 +1590,7 @@ static void send_to_sock(struct connection *con)
}
spin_lock(&con->writequeue_lock);
- e->offset += ret;
- e->len -= ret;
-
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
- free_entry(e);
- }
+ writequeue_entry_complete(e, ret);
}
spin_unlock(&con->writequeue_lock);
out:
diff --git a/fs/exec.c b/fs/exec.c
index 0f6c96c57b2f..03b907cfd765 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1137,13 +1137,6 @@ void setup_new_exec(struct linux_binprm * bprm)
set_dumpable(current->mm, suid_dumpable);
}
- /*
- * Flush performance counters when crossing a
- * security domain:
- */
- if (!get_dumpable(current->mm))
- perf_event_exit_task(current);
-
/* An exec changes our domain. We are no longer part of the thread
group */
@@ -1207,6 +1200,15 @@ void install_exec_creds(struct linux_binprm *bprm)
commit_creds(bprm->cred);
bprm->cred = NULL;
+
+ /*
+ * Disable monitoring for regular users
+ * when executing setuid binaries. Must
+ * wait until new credentials are committed
+ * by commit_creds() above
+ */
+ if (get_dumpable(current->mm) != SUID_DUMP_USER)
+ perf_event_exit_task(current);
/*
* cred_guard_mutex must be held at least to this point to prevent
* ptrace_attach() from altering our determination of the task's
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1f80abd8828..2ec8eb1ab269 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
return 0;
}
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+ EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+ page->index, offset, length);
WARN_ON(1);
}
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 23c712825640..f67668f724ba 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1825,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
}
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT3_JOURNAL(page->mapping->host);
- trace_ext3_invalidatepage(page, offset);
+ trace_ext3_invalidatepage(page, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- journal_invalidatepage(journal, page, offset);
+ journal_invalidatepage(journal, page, offset, length);
}
static int ext3_releasepage(struct page *page, gfp_t wait)
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 7523c61f796c..998ea111e537 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
+((char *)de - bh->b_data))) {
- /* On error, skip the f_pos to the next block. */
- dir_file->f_pos = (dir_file->f_pos |
- (dir->i_sb->s_blocksize - 1)) + 1;
- brelse (bh);
- return count;
+ /* silently ignore the rest of the block */
+ break;
}
ext3fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d0f13eada0ed..58339393fa6e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
static inline int test_root(ext4_group_t a, int b)
{
- int num = b;
-
- while (a > num)
- num *= b;
- return num == a;
+ while (1) {
+ if (a < b)
+ return 0;
+ if (a == b)
+ return 1;
+ if ((a % b) != 0)
+ return 0;
+ a = a / b;
+ }
}
static int ext4_group_sparse(ext4_group_t group)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4af03ea84aa3..b577e45425b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,38 +177,28 @@ struct ext4_map_blocks {
};
/*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
- struct inode *inode;
- sector_t b_blocknr; /* start block number of extent */
- size_t b_size; /* size of extent */
- unsigned long b_state; /* state of the extent */
- unsigned long first_page, next_page; /* extent of pages */
- struct writeback_control *wbc;
- int io_done;
- int pages_written;
- int retval;
-};
-
-/*
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
-#define EXT4_IO_END_ERROR 0x0002
-#define EXT4_IO_END_DIRECT 0x0004
+#define EXT4_IO_END_DIRECT 0x0002
/*
- * For converting uninitialized extents on a work queue.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
*/
typedef struct ext4_io_end {
struct list_head list; /* per-file finished IO list */
+ handle_t *handle; /* handle reserved for extent
+ * conversion */
struct inode *inode; /* file being written to */
+ struct bio *bio; /* Linked list of completed
+ * bios covering the extent */
unsigned int flag; /* unwritten or not */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
struct kiocb *iocb; /* iocb struct for AIO */
int result; /* error value for AIO */
+ atomic_t count; /* reference counter */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -581,11 +571,6 @@ enum {
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
/*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001
-
-/*
* ioctl commands
*/
#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
@@ -879,6 +864,7 @@ struct ext4_inode_info {
rwlock_t i_es_lock;
struct list_head i_es_lru;
unsigned int i_es_lru_nr; /* protected by i_es_lock */
+ unsigned long i_touch_when; /* jiffies of last accessing */
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -903,12 +889,22 @@ struct ext4_inode_info {
qsize_t i_reserved_quota;
#endif
- /* completed IOs that might need unwritten extents handling */
- struct list_head i_completed_io_list;
+ /* Lock protecting lists below */
spinlock_t i_completed_io_lock;
+ /*
+ * Completed IOs that need unwritten extents handling and have
+ * transaction reserved
+ */
+ struct list_head i_rsv_conversion_list;
+ /*
+ * Completed IOs that need unwritten extents handling and don't have
+ * transaction reserved
+ */
+ struct list_head i_unrsv_conversion_list;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
- struct work_struct i_unwritten_work; /* deferred extent conversion */
+ struct work_struct i_rsv_conversion_work;
+ struct work_struct i_unrsv_conversion_work;
spinlock_t i_block_reservation_lock;
@@ -1245,7 +1241,6 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
- unsigned int s_max_writeback_mb_bump;
unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
@@ -1281,8 +1276,10 @@ struct ext4_sb_info {
struct flex_groups *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
- /* workqueue for dio unwritten */
- struct workqueue_struct *dio_unwritten_wq;
+ /* workqueue for unreserved extent convertions (dio) */
+ struct workqueue_struct *unrsv_conversion_wq;
+ /* workqueue for reserved extent conversions (buffered io) */
+ struct workqueue_struct *rsv_conversion_wq;
/* timer for periodic error stats printing */
struct timer_list s_err_report;
@@ -1307,6 +1304,7 @@ struct ext4_sb_info {
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
struct list_head s_es_lru;
+ unsigned long s_es_last_sorted;
struct percpu_counter s_extent_cache_cnt;
spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
};
@@ -1342,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ /* Writeback has to have coversion transaction reserved */
+ WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+ !(io_end->flag & EXT4_IO_END_DIRECT));
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
@@ -1999,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2088,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2096,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
- struct address_space *mapping, loff_t from,
- loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2111,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t first, ext4_lblk_t stop);
@@ -2166,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
char nbuf[16]);
+
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
- __LINE__, ## message)
extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
extern __printf(4, 5)
void __ext4_abort(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
- __LINE__, ## message)
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
const char *, ...);
-#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
- __LINE__, ## message)
extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
- __LINE__, msg)
extern __printf(7, 8)
void __ext4_grp_locked_error(const char *, unsigned int,
struct super_block *, ext4_group_t,
unsigned long, ext4_fsblk_t,
const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
- __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+ __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...) \
+ __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...) \
+ __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...) \
+ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...) \
+ __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+ __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+ fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_inode(inode, "", 0, block, " "); \
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_file(file, "", 0, block, " "); \
+} while (0)
+#define ext4_error(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error(sb, "", 0, " "); \
+} while (0)
+#define ext4_abort(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_abort(sb, "", 0, " "); \
+} while (0)
+#define ext4_warning(sb, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_warning(sb, "", 0, " "); \
+} while (0)
+#define ext4_msg(sb, level, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_msg(sb, "", " "); \
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg) \
+ __dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \
+} while (0)
+
+#endif
+
extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
@@ -2312,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
{
struct ext4_group_info ***grp_info;
long indexv, indexh;
+ BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
grp_info = EXT4_SB(sb)->s_group_info;
indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2598,8 +2656,7 @@ struct ext4_extent;
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
- int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2609,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2650,12 +2707,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern void ext4_end_io_work(struct work_struct *work);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
@@ -2668,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
extern int ext4_mmp_csum_verify(struct super_block *sb,
struct mmp_struct *mmp);
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
enum ext4_state_bits {
BH_Uninit /* blocks are allocated but uninitialized on disk */
- = BH_JBDPrivateStart,
+ = BH_JBDPrivateStart,
BH_AllocFromCluster, /* allocated blocks were part of already
- * allocated cluster. Note that this flag will
- * never, ever appear in a buffer_head's state
- * flag. See EXT4_MAP_FROM_CLUSTER to see where
- * this is used. */
+ * allocated cluster. */
};
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 451eb4045330..72a3600aedbd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
/*
* Wrappers for jbd2_journal_start/end.
*/
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
{
journal_t *journal;
might_sleep();
-
- trace_ext4_journal_start(sb, nblocks, _RET_IP_);
if (sb->s_flags & MS_RDONLY)
- return ERR_PTR(-EROFS);
-
+ return -EROFS;
WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
journal = EXT4_SB(sb)->s_journal;
- if (!journal)
- return ext4_get_nojournal();
/*
* Special case here: if the journal has aborted behind our
* backs (eg. EIO in the commit thread), then we still need to
* take the FS itself readonly cleanly.
*/
- if (is_journal_aborted(journal)) {
+ if (journal && is_journal_aborted(journal)) {
ext4_abort(sb, "Detected aborted journal");
- return ERR_PTR(-EROFS);
+ return -EROFS;
}
- return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+ return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+ int type, int blocks, int rsv_blocks)
+{
+ journal_t *journal;
+ int err;
+
+ trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ journal = EXT4_SB(sb)->s_journal;
+ if (!journal)
+ return ext4_get_nojournal();
+ return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+ type, line);
}
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
return err;
}
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type)
+{
+ struct super_block *sb;
+ int err;
+
+ if (!ext4_handle_valid(handle))
+ return ext4_get_nojournal();
+
+ sb = handle->h_journal->j_private;
+ trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+ _RET_IP_);
+ err = ext4_journal_check_start(sb);
+ if (err < 0) {
+ jbd2_journal_free_reserved(handle);
+ return ERR_PTR(err);
+ }
+
+ err = jbd2_journal_start_reserved(handle, type, line);
+ if (err < 0)
+ return ERR_PTR(err);
+ return handle;
+}
+
void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn, struct buffer_head *bh,
handle_t *handle, int err)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index c8c6885406db..2877258d9497 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
#define EXT4_HT_MIGRATE 8
#define EXT4_HT_MOVE_EXTENTS 9
#define EXT4_HT_XATTR 10
-#define EXT4_HT_MAX 11
+#define EXT4_HT_EXT_CONVERT 11
+#define EXT4_HT_MAX 12
/**
* struct ext4_journal_cb_entry - Base structure for callback information.
@@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int nblocks);
+ int type, int blocks, int rsv_blocks);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
#define ext4_journal_start(inode, type, nblocks) \
- __ext4_journal_start((inode), __LINE__, (type), (nblocks))
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
static inline handle_t *__ext4_journal_start(struct inode *inode,
unsigned int line, int type,
- int nblocks)
+ int blocks, int rsv_blocks)
{
- return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+ return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+ rsv_blocks);
}
#define ext4_journal_stop(handle) \
__ext4_journal_stop(__func__, __LINE__, (handle))
+#define ext4_journal_start_reserved(handle, type) \
+ __ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+ int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+ if (ext4_handle_valid(handle))
+ jbd2_journal_free_reserved(handle);
+}
+
static inline handle_t *ext4_journal_current_handle(void)
{
return journal_current_handle();
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bc0f1910b9cf..7097b0f680e6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
next_del = ext4_find_delayed_extent(inode, &es);
if (!exists && next_del) {
exists = 1;
- flags |= FIEMAP_EXTENT_DELALLOC;
+ flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
}
up_read(&EXT4_I(inode)->i_data_sem);
@@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
}
/*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
*
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
*
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
*/
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
int depth;
@@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
depth = ext_depth(inode);
- if (chunk)
+ if (extents <= 1)
index = depth * 2;
else
index = depth * 3;
@@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
return index;
}
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+ else if (ext4_should_journal_data(inode))
+ return EXT4_FREE_BLOCKS_FORGET;
+ return 0;
+}
+
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent *ex,
- ext4_fsblk_t *partial_cluster,
+ long long *partial_cluster,
ext4_lblk_t from, ext4_lblk_t to)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
ext4_fsblk_t pblk;
- int flags = 0;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
- else if (ext4_should_journal_data(inode))
- flags |= EXT4_FREE_BLOCKS_FORGET;
+ int flags = get_default_free_blocks_flags(inode);
/*
* For bigalloc file systems, we never free a partial cluster
@@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* partial cluster here.
*/
pblk = ext4_ext_pblock(ex) + ee_len - 1;
- if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+ if ((*partial_cluster > 0) &&
+ (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
sbi->s_cluster_ratio, flags);
@@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
&& to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
ext4_lblk_t num;
+ unsigned int unaligned;
num = le32_to_cpu(ex->ee_block) + ee_len - from;
pblk = ext4_ext_pblock(ex) + ee_len - num;
- ext_debug("free last %u blocks starting %llu\n", num, pblk);
+ /*
+ * Usually we want to free partial cluster at the end of the
+ * extent, except for the situation when the cluster is still
+ * used by any other extent (partial_cluster is negative).
+ */
+ if (*partial_cluster < 0 &&
+ -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+ flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+ ext_debug("free last %u blocks starting %llu partial %lld\n",
+ num, pblk, *partial_cluster);
ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
/*
* If the block range to be freed didn't start at the
* beginning of a cluster, and we removed the entire
- * extent, save the partial cluster here, since we
- * might need to delete if we determine that the
- * truncate operation has removed all of the blocks in
- * the cluster.
+ * extent and the cluster is not used by any other extent,
+ * save the partial cluster here, since we might need to
+ * delete if we determine that the truncate operation has
+ * removed all of the blocks in the cluster.
+ *
+ * On the other hand, if we did not manage to free the whole
+ * extent, we have to mark the cluster as used (store negative
+ * cluster number in partial_cluster).
*/
- if (pblk & (sbi->s_cluster_ratio - 1) &&
- (ee_len == num))
+ unaligned = pblk & (sbi->s_cluster_ratio - 1);
+ if (unaligned && (ee_len == num) &&
+ (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
*partial_cluster = EXT4_B2C(sbi, pblk);
- else
+ else if (unaligned)
+ *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+ else if (*partial_cluster > 0)
*partial_cluster = 0;
- } else if (from == le32_to_cpu(ex->ee_block)
- && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
- /* head removal */
- ext4_lblk_t num;
- ext4_fsblk_t start;
-
- num = to - from;
- start = ext4_ext_pblock(ex);
-
- ext_debug("free first %u blocks starting %llu\n", num, start);
- ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
- } else {
- printk(KERN_INFO "strange request: removal(2) "
- "%u-%u from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
- }
+ } else
+ ext4_error(sbi->s_sb, "strange request: removal(2) "
+ "%u-%u from %u:%u\n",
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
return 0;
}
@@ -2461,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* @handle: The journal handle
* @inode: The files inode
* @path: The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ * has been released from it. It gets negative in case
+ * that the cluster is still used.
* @start: The first block to remove
* @end: The last block to remove
*/
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+ struct ext4_ext_path *path,
+ long long *partial_cluster,
ext4_lblk_t start, ext4_lblk_t end)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unsigned short ex_ee_len;
unsigned uninitialized = 0;
struct ext4_extent *ex;
+ ext4_fsblk_t pblk;
/* the header must be checked already in ext4_ext_remove_space() */
ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
return -EIO;
}
/* find where to start removing */
- ex = EXT_LAST_EXTENT(eh);
+ ex = path[depth].p_ext;
+ if (!ex)
+ ex = EXT_LAST_EXTENT(eh);
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
/* If this extent is beyond the end of the hole, skip it */
if (end < ex_ee_block) {
+ /*
+ * We're going to skip this extent and move to another,
+ * so if this extent is not cluster aligned we have
+ * to mark the current cluster as used to avoid
+ * accidentally freeing it later on
+ */
+ pblk = ext4_ext_pblock(ex);
+ if (pblk & (sbi->s_cluster_ratio - 1))
+ *partial_cluster =
+ -((long long)EXT4_B2C(sbi, pblk));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
sizeof(struct ext4_extent));
}
le16_add_cpu(&eh->eh_entries, -1);
- } else
+ } else if (*partial_cluster > 0)
*partial_cluster = 0;
err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
err = ext4_ext_correct_indexes(handle, inode, path);
/*
- * If there is still a entry in the leaf node, check to see if
- * it references the partial cluster. This is the only place
- * where it could; if it doesn't, we can free the cluster.
+ * Free the partial cluster only if the current extent does not
+ * reference it. Otherwise we might free used cluster.
*/
- if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+ if (*partial_cluster > 0 &&
(EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
*partial_cluster)) {
- int flags = EXT4_FREE_BLOCKS_FORGET;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ int flags = get_default_free_blocks_flags(inode);
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
@@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
struct ext4_ext_path *path = NULL;
- ext4_fsblk_t partial_cluster = 0;
+ long long partial_cluster = 0;
handle_t *handle;
int i = 0, err = 0;
@@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
return PTR_ERR(handle);
again:
- trace_ext4_ext_remove_space(inode, start, depth);
+ trace_ext4_ext_remove_space(inode, start, end, depth);
/*
* Check if we are removing extents inside the extent tree. If that
@@ -2844,17 +2866,14 @@ again:
}
}
- trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
- path->p_hdr->eh_entries);
+ trace_ext4_ext_remove_space_done(inode, start, end, depth,
+ partial_cluster, path->p_hdr->eh_entries);
/* If we still have something in the partial cluster and we have removed
* even the first extent, then we should free the blocks in the partial
* cluster as well. */
- if (partial_cluster && path->p_hdr->eh_entries == 0) {
- int flags = EXT4_FREE_BLOCKS_FORGET;
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+ int flags = get_default_free_blocks_flags(inode);
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -4363,7 +4382,7 @@ out2:
}
out3:
- trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
+ trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated);
return err ? err : allocated;
}
@@ -4446,7 +4465,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
- return ext4_punch_hole(file, offset, len);
+ return ext4_punch_hole(inode, offset, len);
ret = ext4_convert_inline_data(inode);
if (ret)
@@ -4548,10 +4567,9 @@ retry:
* function, to convert the fallocated extents after IO is completed.
* Returns 0 on success.
*/
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
- ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
{
- handle_t *handle;
unsigned int max_blocks;
int ret = 0;
int ret2 = 0;
@@ -4566,16 +4584,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
map.m_lblk);
/*
- * credits to insert 1 extent into extent tree
+ * This is somewhat ugly but the idea is clear: When transaction is
+ * reserved, everything goes into it. Otherwise we rather start several
+ * smaller transactions for conversion of each extent separately.
*/
- credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ credits = 0;
+ } else {
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ }
while (ret >= 0 && ret < max_blocks) {
map.m_lblk += ret;
map.m_len = (max_blocks -= ret);
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- break;
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
}
ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4586,10 +4620,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
inode->i_ino, map.m_lblk,
map.m_len, ret);
ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
- if (ret <= 0 || ret2 )
+ if (credits)
+ ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2)
break;
}
+ if (!credits)
+ ret2 = ext4_journal_stop(handle);
return ret > 0 ? ret2 : ret;
}
@@ -4659,7 +4696,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
error = ext4_get_inode_loc(inode, &iloc);
if (error)
return error;
- physical = iloc.bh->b_blocknr << blockbits;
+ physical = (__u64)iloc.bh->b_blocknr << blockbits;
offset = EXT4_GOOD_OLD_INODE_SIZE +
EXT4_I(inode)->i_extra_isize;
physical += offset;
@@ -4667,7 +4704,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
flags |= FIEMAP_EXTENT_DATA_INLINE;
brelse(iloc.bh);
} else { /* external block */
- physical = EXT4_I(inode)->i_file_acl << blockbits;
+ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
length = inode->i_sb->s_blocksize;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e6941e622d31..ee018d5f397e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
* Ext4 extents status tree core functions.
*/
#include <linux/rbtree.h>
+#include <linux/list_sort.h>
#include "ext4.h"
#include "extents_status.h"
#include "ext4_extents.h"
@@ -291,7 +292,6 @@ out:
read_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}
@@ -672,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
ext4_es_print_tree(inode);
return err;
@@ -734,7 +733,6 @@ out:
read_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_lru_add(inode);
trace_ext4_es_lookup_extent_exit(inode, es, found);
return found;
}
@@ -878,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
EXTENT_STATUS_WRITTEN);
}
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
+{
+ struct ext4_inode_info *eia, *eib;
+ eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+ eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+ if (eia->i_touch_when == eib->i_touch_when)
+ return 0;
+ if (time_after(eia->i_touch_when, eib->i_touch_when))
+ return 1;
+ else
+ return -1;
+}
+
static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
struct ext4_sb_info *sbi = container_of(shrink,
struct ext4_sb_info, s_es_shrinker);
struct ext4_inode_info *ei;
- struct list_head *cur, *tmp, scanned;
+ struct list_head *cur, *tmp;
+ LIST_HEAD(skiped);
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk = 0;
@@ -893,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
if (!nr_to_scan)
return ret;
- INIT_LIST_HEAD(&scanned);
-
spin_lock(&sbi->s_es_lru_lock);
+
+ /*
+ * If the inode that is at the head of LRU list is newer than
+ * last_sorted time, that means that we need to sort this list.
+ */
+ ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
+ if (sbi->s_es_last_sorted < ei->i_touch_when) {
+ list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+ sbi->s_es_last_sorted = jiffies;
+ }
+
list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
- list_move_tail(cur, &scanned);
+ /*
+ * If we have already reclaimed all extents from extent
+ * status tree, just stop the loop immediately.
+ */
+ if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+ break;
ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
- read_lock(&ei->i_es_lock);
- if (ei->i_es_lru_nr == 0) {
- read_unlock(&ei->i_es_lock);
+ /* Skip the inode that is newer than the last_sorted time */
+ if (sbi->s_es_last_sorted < ei->i_touch_when) {
+ list_move_tail(cur, &skiped);
continue;
}
- read_unlock(&ei->i_es_lock);
+
+ if (ei->i_es_lru_nr == 0)
+ continue;
write_lock(&ei->i_es_lock);
ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+ if (ei->i_es_lru_nr == 0)
+ list_del_init(&ei->i_es_lru);
write_unlock(&ei->i_es_lock);
nr_shrunk += ret;
@@ -917,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
if (nr_to_scan == 0)
break;
}
- list_splice_tail(&scanned, &sbi->s_es_lru);
+
+ /* Move the newer inodes into the tail of the LRU list. */
+ list_splice_tail(&skiped, &sbi->s_es_lru);
spin_unlock(&sbi->s_es_lru_lock);
ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
@@ -925,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
return ret;
}
-void ext4_es_register_shrinker(struct super_block *sb)
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
- struct ext4_sb_info *sbi;
-
- sbi = EXT4_SB(sb);
INIT_LIST_HEAD(&sbi->s_es_lru);
spin_lock_init(&sbi->s_es_lru_lock);
+ sbi->s_es_last_sorted = 0;
sbi->s_es_shrinker.shrink = ext4_es_shrink;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
register_shrinker(&sbi->s_es_shrinker);
}
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
- unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+ unregister_shrinker(&sbi->s_es_shrinker);
}
void ext4_es_lru_add(struct inode *inode)
@@ -947,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode)
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ei->i_touch_when = jiffies;
+
+ if (!list_empty(&ei->i_es_lru))
+ return;
+
spin_lock(&sbi->s_es_lru_lock);
if (list_empty(&ei->i_es_lru))
list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
- else
- list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
spin_unlock(&sbi->s_es_lru_lock);
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f740eb03b707..e936730cc5b0 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -39,6 +39,7 @@
EXTENT_STATUS_DELAYED | \
EXTENT_STATUS_HOLE)
+struct ext4_sb_info;
struct ext4_extent;
struct extent_status {
@@ -119,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es,
es->es_pblk = block;
}
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_lru_add(struct inode *inode);
extern void ext4_es_lru_del(struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 469361dbe619..6f4cc567c382 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
- endoff = (map->m_lblk + map->m_len) << blkbits;
+ endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
index = startoff >> PAGE_CACHE_SHIFT;
end = endoff >> PAGE_CACHE_SHIFT;
@@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
if (last != start)
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
break;
}
@@ -468,7 +468,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
if (last != start)
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
break;
}
@@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
}
last++;
- dataoff = last << blkbits;
+ dataoff = (loff_t)last << blkbits;
} while (last <= end);
mutex_unlock(&inode->i_mutex);
@@ -530,7 +530,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
last += ret;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
@@ -541,7 +541,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
last = es.es_lblk + es.es_len;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
@@ -556,7 +556,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
&map, &holeoff);
if (!unwritten) {
last += ret;
- holeoff = last << blkbits;
+ holeoff = (loff_t)last << blkbits;
continue;
}
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e0ba8a408def..a8bc47f75fa0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)
return ret;
}
-/**
- * __sync_file - generic_file_fsync without the locking and filemap_write
- * @inode: inode to sync
- * @datasync: only sync essential metadata if true
- *
- * This is just generic_file_fsync without the locking. This is needed for
- * nojournal mode to make sure this inodes data/metadata makes it to disk
- * properly. The i_mutex should be held already.
- */
-static int __sync_inode(struct inode *inode, int datasync)
-{
- int err;
- int ret;
-
- ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY))
- return ret;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- return ret;
-
- err = sync_inode_metadata(inode, 1);
- if (ret == 0)
- ret = err;
- return ret;
-}
-
/*
* akpm: A new design for ext4_sync_file().
*
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret, err;
+ int ret = 0, err;
tid_t commit_tid;
bool needs_barrier = false;
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_ext4_sync_file_enter(file, datasync);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
- mutex_lock(&inode->i_mutex);
-
- if (inode->i_sb->s_flags & MS_RDONLY)
- goto out;
-
- ret = ext4_flush_unwritten_io(inode);
- if (ret < 0)
+ if (inode->i_sb->s_flags & MS_RDONLY) {
+ /* Make sure that we read updated s_mount_flags value */
+ smp_rmb();
+ if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ ret = -EROFS;
goto out;
+ }
if (!journal) {
- ret = __sync_inode(inode, datasync);
+ ret = generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
}
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
@@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (!ret)
ret = err;
}
- out:
- mutex_unlock(&inode->i_mutex);
+out:
trace_ext4_sync_file_exit(inode, ret);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00a818d67b54..f03598c6ffd3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -747,7 +747,8 @@ repeat_in_this_group:
if (!handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
- handle_type, nblocks);
+ handle_type, nblocks,
+ 0);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_std_error(sb, err);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8d5d351e24f..87b30cd357e7 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -624,7 +624,7 @@ cleanup:
partial--;
}
out:
- trace_ext4_ind_map_blocks_exit(inode, map, err);
+ trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
return err;
}
@@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
- if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
- mutex_lock(&inode->i_mutex);
- ext4_flush_unwritten_io(inode);
- mutex_unlock(&inode->i_mutex);
- }
/*
* Nolock dioread optimization may be dynamically disabled
* via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
}
-int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
{
- int indirects;
-
- /* if nrblocks are contiguous */
- if (chunk) {
- /*
- * With N contiguous data blocks, we need at most
- * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
- * 2 dindirect blocks, and 1 tindirect block
- */
- return DIV_ROUND_UP(nrblocks,
- EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
- }
/*
- * if nrblocks are not contiguous, worse case, each block touch
- * a indirect block, and each indirect block touch a double indirect
- * block, plus a triple indirect block
+ * With N contiguous data blocks, we need at most
+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+ * 2 dindirect blocks, and 1 tindirect block
*/
- indirects = nrblocks * 2 + 1;
- return indirects;
+ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
}
/*
@@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
__le32 *last)
{
__le32 *p;
- int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+ int flags = EXT4_FREE_BLOCKS_VALIDATED;
int err;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+ else if (ext4_should_journal_data(inode))
+ flags |= EXT4_FREE_BLOCKS_FORGET;
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
count)) {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 1a346a6bdc8f..d9ecbf1113a7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
entry = (struct ext4_xattr_entry *)
((void *)raw_inode + EXT4_I(inode)->i_inline_off);
- free += le32_to_cpu(entry->e_value_size);
+ free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
goto out;
}
@@ -1810,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,
if (error)
goto out;
- physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+ physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
physical += offsetof(struct ext4_inode, i_block);
length = i_size_read(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b89ecbd..0188e65e1f58 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
- struct inode *inode, struct page *page, loff_t from,
- loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
/*
* Test whether an inode is a fast symlink.
@@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode)
filemap_write_and_wait(&inode->i_data);
}
truncate_inode_pages(&inode->i_data, 0);
- ext4_ioend_shutdown(inode);
+
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
}
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode)
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
- ext4_ioend_shutdown(inode);
+ WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
if (is_bad_inode(inode))
goto no_delete;
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func,
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
- unsigned int max_pages)
-{
- struct address_space *mapping = inode->i_mapping;
- pgoff_t index;
- struct pagevec pvec;
- pgoff_t num = 0;
- int i, nr_pages, done = 0;
-
- if (max_pages == 0)
- return 0;
- pagevec_init(&pvec, 0);
- while (!done) {
- index = idx;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- (pgoff_t)PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
-
- lock_page(page);
- if (unlikely(page->mapping != mapping) ||
- !PageDirty(page) ||
- PageWriteback(page) ||
- page->index != idx) {
- done = 1;
- unlock_page(page);
- break;
- }
- if (page_has_buffers(page)) {
- bh = head = page_buffers(page);
- do {
- if (!buffer_delay(bh) &&
- !buffer_unwritten(bh))
- done = 1;
- bh = bh->b_this_page;
- } while (!done && (bh != head));
- }
- unlock_page(page);
- if (done)
- break;
- idx++;
- num++;
- if (num >= max_pages) {
- done = 1;
- break;
- }
- }
- pagevec_release(&pvec);
- }
- return num;
-}
-
#ifdef ES_AGGRESSIVE_TEST
static void ext4_map_blocks_es_recheck(handle_t *handle,
struct inode *inode,
@@ -573,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
+ ext4_es_lru_add(inode);
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
@@ -1118,10 +1061,13 @@ static int ext4_write_end(struct file *file,
}
}
- if (ext4_has_inline_data(inode))
- copied = ext4_write_inline_data_end(inode, pos, len,
- copied, page);
- else
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ if (ret < 0)
+ goto errout;
+ copied = ret;
+ } else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
@@ -1157,8 +1103,6 @@ static int ext4_write_end(struct file *file,
if (i_size_changed)
ext4_mark_inode_dirty(handle, inode);
- if (copied < 0)
- ret = copied;
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
@@ -1415,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
}
static void ext4_da_page_release_reservation(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
int to_release = 0;
struct buffer_head *head, *bh;
unsigned int curr_off = 0;
struct inode *inode = page->mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int stop = offset + length;
int num_clusters;
ext4_fsblk_t lblk;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
head = page_buffers(page);
bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
+ if (next_off > stop)
+ break;
+
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
clear_buffer_delay(bh);
@@ -1460,140 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page,
* Delayed allocation stuff
*/
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
- struct ext4_map_blocks *map)
-{
- struct pagevec pvec;
- unsigned long index, end;
- int ret = 0, err, nr_pages, i;
- struct inode *inode = mpd->inode;
- struct address_space *mapping = inode->i_mapping;
- loff_t size = i_size_read(inode);
- unsigned int len, block_start;
- struct buffer_head *bh, *page_bufs = NULL;
- sector_t pblock = 0, cur_logical = 0;
- struct ext4_io_submit io_submit;
+struct mpage_da_data {
+ struct inode *inode;
+ struct writeback_control *wbc;
- BUG_ON(mpd->next_page <= mpd->first_page);
- memset(&io_submit, 0, sizeof(io_submit));
+ pgoff_t first_page; /* The first page to write */
+ pgoff_t next_page; /* Current page to examine */
+ pgoff_t last_page; /* Last page to examine */
/*
- * We need to start from the first_page to the next_page - 1
- * to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->b_blocknr we would only be looking
- * at the currently mapped buffer_heads.
+ * Extent to map - this can be after first_page because that can be
+ * fully mapped. We somewhat abuse m_flags to store whether the extent
+ * is delalloc or unwritten.
*/
- index = mpd->first_page;
- end = mpd->next_page - 1;
-
- pagevec_init(&pvec, 0);
- while (index <= end) {
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- int skip_page = 0;
- struct page *page = pvec.pages[i];
-
- index = page->index;
- if (index > end)
- break;
-
- if (index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
- else
- len = PAGE_CACHE_SIZE;
- if (map) {
- cur_logical = index << (PAGE_CACHE_SHIFT -
- inode->i_blkbits);
- pblock = map->m_pblk + (cur_logical -
- map->m_lblk);
- }
- index++;
-
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
-
- bh = page_bufs = page_buffers(page);
- block_start = 0;
- do {
- if (map && (cur_logical >= map->m_lblk) &&
- (cur_logical <= (map->m_lblk +
- (map->m_len - 1)))) {
- if (buffer_delay(bh)) {
- clear_buffer_delay(bh);
- bh->b_blocknr = pblock;
- }
- if (buffer_unwritten(bh) ||
- buffer_mapped(bh))
- BUG_ON(bh->b_blocknr != pblock);
- if (map->m_flags & EXT4_MAP_UNINIT)
- set_buffer_uninit(bh);
- clear_buffer_unwritten(bh);
- }
-
- /*
- * skip page if block allocation undone and
- * block is dirty
- */
- if (ext4_bh_delay_or_unwritten(NULL, bh))
- skip_page = 1;
- bh = bh->b_this_page;
- block_start += bh->b_size;
- cur_logical++;
- pblock++;
- } while (bh != page_bufs);
-
- if (skip_page) {
- unlock_page(page);
- continue;
- }
-
- clear_page_dirty_for_io(page);
- err = ext4_bio_write_page(&io_submit, page, len,
- mpd->wbc);
- if (!err)
- mpd->pages_written++;
- /*
- * In error case, we have to continue because
- * remaining pages are still locked
- */
- if (ret == 0)
- ret = err;
- }
- pagevec_release(&pvec);
- }
- ext4_io_submit(&io_submit);
- return ret;
-}
+ struct ext4_map_blocks map;
+ struct ext4_io_submit io_submit; /* IO submission data */
+};
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+ bool invalidate)
{
int nr_pages, i;
pgoff_t index, end;
struct pagevec pvec;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t start, last;
+
+ /* This is necessary when next_page == 0. */
+ if (mpd->first_page >= mpd->next_page)
+ return;
index = mpd->first_page;
end = mpd->next_page - 1;
-
- start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- ext4_es_remove_extent(inode, start, last - start + 1);
+ if (invalidate) {
+ ext4_lblk_t start, last;
+ start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, start, last - start + 1);
+ }
pagevec_init(&pvec, 0);
while (index <= end) {
@@ -1606,14 +1460,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
break;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- block_invalidatepage(page, 0);
- ClearPageUptodate(page);
+ if (invalidate) {
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ ClearPageUptodate(page);
+ }
unlock_page(page);
}
index = pvec.pages[nr_pages - 1]->index + 1;
pagevec_release(&pvec);
}
- return;
}
static void ext4_print_free_blocks(struct inode *inode)
@@ -1642,215 +1497,6 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
-/*
- * mpage_da_map_and_submit - go through given space, map them
- * if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
- */
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
-{
- int err, blks, get_blocks_flags;
- struct ext4_map_blocks map, *mapp = NULL;
- sector_t next = mpd->b_blocknr;
- unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
- loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
- handle_t *handle = NULL;
-
- /*
- * If the blocks are mapped already, or we couldn't accumulate
- * any blocks, then proceed immediately to the submission stage.
- */
- if ((mpd->b_size == 0) ||
- ((mpd->b_state & (1 << BH_Mapped)) &&
- !(mpd->b_state & (1 << BH_Delay)) &&
- !(mpd->b_state & (1 << BH_Unwritten))))
- goto submit_io;
-
- handle = ext4_journal_current_handle();
- BUG_ON(!handle);
-
- /*
- * Call ext4_map_blocks() to allocate any delayed allocation
- * blocks, or to convert an uninitialized extent to be
- * initialized (in the case where we have written into
- * one or more preallocated blocks).
- *
- * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
- * indicate that we are on the delayed allocation path. This
- * affects functions in many different parts of the allocation
- * call path. This flag exists primarily because we don't
- * want to change *many* call functions, so ext4_map_blocks()
- * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
- * inode's allocation semaphore is taken.
- *
- * If the blocks in questions were delalloc blocks, set
- * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
- * variables are updated after the blocks have been allocated.
- */
- map.m_lblk = next;
- map.m_len = max_blocks;
- /*
- * We're in delalloc path and it is possible that we're going to
- * need more metadata blocks than previously reserved. However
- * we must not fail because we're in writeback and there is
- * nothing we can do about it so it might result in data loss.
- * So use reserved blocks to allocate metadata if possible.
- */
- get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
- EXT4_GET_BLOCKS_METADATA_NOFAIL;
- if (ext4_should_dioread_nolock(mpd->inode))
- get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (mpd->b_state & (1 << BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-
-
- blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
- if (blks < 0) {
- struct super_block *sb = mpd->inode->i_sb;
-
- err = blks;
- /*
- * If get block returns EAGAIN or ENOSPC and there
- * appears to be free blocks we will just let
- * mpage_da_submit_io() unlock all of the pages.
- */
- if (err == -EAGAIN)
- goto submit_io;
-
- if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
- mpd->retval = err;
- goto submit_io;
- }
-
- /*
- * get block failure will cause us to loop in
- * writepages, because a_ops->writepage won't be able
- * to make progress. The page will be redirtied by
- * writepage and writepages will again try to write
- * the same.
- */
- if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
- ext4_msg(sb, KERN_CRIT,
- "delayed block allocation failed for inode %lu "
- "at logical offset %llu with max blocks %zd "
- "with error %d", mpd->inode->i_ino,
- (unsigned long long) next,
- mpd->b_size >> mpd->inode->i_blkbits, err);
- ext4_msg(sb, KERN_CRIT,
- "This should not happen!! Data will be lost");
- if (err == -ENOSPC)
- ext4_print_free_blocks(mpd->inode);
- }
- /* invalidate all the pages */
- ext4_da_block_invalidatepages(mpd);
-
- /* Mark this page range as having been completed */
- mpd->io_done = 1;
- return;
- }
- BUG_ON(blks == 0);
-
- mapp = &map;
- if (map.m_flags & EXT4_MAP_NEW) {
- struct block_device *bdev = mpd->inode->i_sb->s_bdev;
- int i;
-
- for (i = 0; i < map.m_len; i++)
- unmap_underlying_metadata(bdev, map.m_pblk + i);
- }
-
- /*
- * Update on-disk size along with block allocation.
- */
- disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
- if (disksize > i_size_read(mpd->inode))
- disksize = i_size_read(mpd->inode);
- if (disksize > EXT4_I(mpd->inode)->i_disksize) {
- ext4_update_i_disksize(mpd->inode, disksize);
- err = ext4_mark_inode_dirty(handle, mpd->inode);
- if (err)
- ext4_error(mpd->inode->i_sb,
- "Failed to mark inode %lu dirty",
- mpd->inode->i_ino);
- }
-
-submit_io:
- mpage_da_submit_io(mpd, mapp);
- mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
- (1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @b_state - b_state of the buffer head added
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
- unsigned long b_state)
-{
- sector_t next;
- int blkbits = mpd->inode->i_blkbits;
- int nrblocks = mpd->b_size >> blkbits;
-
- /*
- * XXX Don't go larger than mballoc is willing to allocate
- * This is a stopgap solution. We eventually need to fold
- * mpage_da_submit_io() into this function and then call
- * ext4_map_blocks() multiple times in a loop
- */
- if (nrblocks >= (8*1024*1024 >> blkbits))
- goto flush_it;
-
- /* check if the reserved journal credits might overflow */
- if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
- if (nrblocks >= EXT4_MAX_TRANS_DATA) {
- /*
- * With non-extent format we are limited by the journal
- * credit available. Total credit needed to insert
- * nrblocks contiguous blocks is dependent on the
- * nrblocks. So limit nrblocks.
- */
- goto flush_it;
- }
- }
- /*
- * First block in the extent
- */
- if (mpd->b_size == 0) {
- mpd->b_blocknr = logical;
- mpd->b_size = 1 << blkbits;
- mpd->b_state = b_state & BH_FLAGS;
- return;
- }
-
- next = mpd->b_blocknr + nrblocks;
- /*
- * Can we merge the block to our big extent?
- */
- if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
- mpd->b_size += 1 << blkbits;
- return;
- }
-
-flush_it:
- /*
- * We couldn't merge the block to our extent, so we
- * need to flush current extent and start new one
- */
- mpage_da_map_and_submit(mpd);
- return;
-}
-
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
{
return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -1883,6 +1529,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
"logical block %lu\n", inode->i_ino, map->m_len,
(unsigned long) map->m_lblk);
+ ext4_es_lru_add(inode);
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, &es)) {
@@ -2156,7 +1804,7 @@ out:
* lock so we have to do some magic.
*
* This function can get called via...
- * - ext4_da_writepages after taking page lock (have journal handle)
+ * - ext4_writepages after taking page lock (have journal handle)
* - journal_submit_inode_data_buffers (no journal handle)
* - shrink_page_list via the kswapd/direct reclaim (no journal handle)
* - grab_page_cache when doing write_begin (have journal handle)
@@ -2234,76 +1882,405 @@ static int ext4_writepage(struct page *page,
*/
return __ext4_journalled_writepage(page, len);
- memset(&io_submit, 0, sizeof(io_submit));
+ ext4_io_submit_init(&io_submit, wbc);
+ io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_submit.io_end) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return -ENOMEM;
+ }
ret = ext4_bio_write_page(&io_submit, page, len, wbc);
ext4_io_submit(&io_submit);
+ /* Drop io_end reference we got from init */
+ ext4_put_io_end_defer(io_submit.io_end);
return ret;
}
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
/*
- * This is called via ext4_da_writepages() to
- * calculate the total number of credits to reserve to fit
- * a single extent allocation into a single transaction,
- * ext4_da_writpeages() will loop calling this before
- * the block allocation.
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks upto full group size.
*/
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @b_state - b_state of the buffer head added
+ *
+ * the function is used to collect contig. blocks in same state
+ */
+static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+ unsigned long b_state)
+{
+ struct ext4_map_blocks *map = &mpd->map;
+
+ /* Don't go larger than mballoc is willing to allocate */
+ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+ return 0;
+
+ /* First block in the extent? */
+ if (map->m_len == 0) {
+ map->m_lblk = lblk;
+ map->m_len = 1;
+ map->m_flags = b_state & BH_FLAGS;
+ return 1;
+ }
+
+ /* Can we merge the block to our big extent? */
+ if (lblk == map->m_lblk + map->m_len &&
+ (b_state & BH_FLAGS) == map->m_flags) {
+ map->m_len++;
+ return 1;
+ }
+ return 0;
+}
+
+static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
+ struct buffer_head *head,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct inode *inode = mpd->inode;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+
+ do {
+ BUG_ON(buffer_locked(bh));
+
+ if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+ (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
+ lblk >= blocks) {
+ /* Found extent to map? */
+ if (mpd->map.m_len)
+ return false;
+ if (lblk >= blocks)
+ return true;
+ continue;
+ }
+ if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
+ return false;
+ } while (lblk++, (bh = bh->b_this_page) != head);
+ return true;
+}
+
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
{
- int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ int len;
+ loff_t size = i_size_read(mpd->inode);
+ int err;
+
+ BUG_ON(page->index != mpd->first_page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ clear_page_dirty_for_io(page);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+ if (!err)
+ mpd->wbc->nr_to_write--;
+ mpd->first_page++;
+ return err;
+}
+
+/*
+ * mpage_map_buffers - update buffers corresponding to changed extent and
+ * submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to uninitialized extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update @map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+ struct pagevec pvec;
+ int nr_pages, i;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *head, *bh;
+ int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+ ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+ >> inode->i_blkbits;
+ pgoff_t start, end;
+ ext4_lblk_t lblk;
+ sector_t pblock;
+ int err;
+
+ start = mpd->map.m_lblk >> bpp_bits;
+ end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+ lblk = start << bpp_bits;
+ pblock = mpd->map.m_pblk;
+
+ pagevec_init(&pvec, 0);
+ while (start <= end) {
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+ PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page->index > end)
+ break;
+ /* Upto 'end' pages must be contiguous */
+ BUG_ON(page->index != start);
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ add_page_bufs_to_extent(mpd, head, bh,
+ lblk);
+ pagevec_release(&pvec);
+ return 0;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ } while (++lblk < blocks &&
+ (bh = bh->b_this_page) != head);
+
+ /*
+ * FIXME: This is going to break if dioread_nolock
+ * supports blocksize < pagesize as we will try to
+ * convert potentially unmapped parts of inode.
+ */
+ mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+ /* Page fully mapped - let IO run! */
+ err = mpage_submit_page(mpd, page);
+ if (err < 0) {
+ pagevec_release(&pvec);
+ return err;
+ }
+ start++;
+ }
+ pagevec_release(&pvec);
+ }
+ /* Extent fully mapped and matches with page boundary. We are done. */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int get_blocks_flags;
+ int err;
+
+ trace_ext4_da_write_pages_extent(inode, map);
/*
- * With non-extent format the journal credit needed to
- * insert nrblocks contiguous block is dependent on
- * number of contiguous block. So we will limit
- * number of contiguous block to a sane value
+ * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+ * to convert an uninitialized extent to be initialized (in the case
+ * where we have written into one or more preallocated blocks). It is
+ * possible that we're going to need more metadata blocks than
+ * previously reserved. However we must not fail because we're in
+ * writeback and there is nothing we can do about it so it might result
+ * in data loss. So use reserved blocks to allocate metadata if
+ * possible.
+ *
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+ * in question are delalloc blocks. This affects functions in many
+ * different parts of the allocation call path. This flag exists
+ * primarily because we don't want to change *many* call functions, so
+ * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+ * once the inode's allocation semaphore is taken.
*/
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
- (max_blocks > EXT4_MAX_TRANS_DATA))
- max_blocks = EXT4_MAX_TRANS_DATA;
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL;
+ if (ext4_should_dioread_nolock(inode))
+ get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+ if (map->m_flags & (1 << BH_Delay))
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
- return ext4_chunk_trans_blocks(inode, max_blocks);
+ err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+ if (err < 0)
+ return err;
+ if (map->m_flags & EXT4_MAP_UNINIT) {
+ if (!mpd->io_submit.io_end->handle &&
+ ext4_handle_valid(handle)) {
+ mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+ handle->h_rsv_handle = NULL;
+ }
+ ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ }
+
+ BUG_ON(map->m_len == 0);
+ if (map->m_flags & EXT4_MAP_NEW) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int i;
+
+ for (i = 0; i < map->m_len; i++)
+ unmap_underlying_metadata(bdev, map->m_pblk + i);
+ }
+ return 0;
}
/*
- * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and accumulate pages that need writing, and call
- * mpage_da_map_and_submit to map a single contiguous memory region
- * and then write them.
+ * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
+ * mpd->len and submit pages underlying it for IO
+ *
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ *
+ * The function maps extent starting at mpd->lblk of length mpd->len. If it is
+ * delayed, blocks are allocated, if it is unwritten, we may need to convert
+ * them to initialized or split the described range from larger unwritten
+ * extent. Note that we need not map all the described range since allocation
+ * can return less blocks or the range is covered by more unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
*/
-static int write_cache_pages_da(handle_t *handle,
- struct address_space *mapping,
- struct writeback_control *wbc,
- struct mpage_da_data *mpd,
- pgoff_t *done_index)
+static int mpage_map_and_submit_extent(handle_t *handle,
+ struct mpage_da_data *mpd,
+ bool *give_up_on_write)
{
- struct buffer_head *bh, *head;
- struct inode *inode = mapping->host;
- struct pagevec pvec;
- unsigned int nr_pages;
- sector_t logical;
- pgoff_t index, end;
- long nr_to_write = wbc->nr_to_write;
- int i, tag, ret = 0;
-
- memset(mpd, 0, sizeof(struct mpage_da_data));
- mpd->wbc = wbc;
- mpd->inode = inode;
- pagevec_init(&pvec, 0);
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ struct inode *inode = mpd->inode;
+ struct ext4_map_blocks *map = &mpd->map;
+ int err;
+ loff_t disksize;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ mpd->io_submit.io_end->offset =
+ ((loff_t)map->m_lblk) << inode->i_blkbits;
+ while (map->m_len) {
+ err = mpage_map_one_extent(handle, mpd);
+ if (err < 0) {
+ struct super_block *sb = inode->i_sb;
+
+ if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ goto invalidate_dirty_pages;
+ /*
+ * Let the uper layers retry transient errors.
+ * In the case of ENOSPC, if ext4_count_free_blocks()
+ * is non-zero, a commit should free up blocks.
+ */
+ if ((err == -ENOMEM) ||
+ (err == -ENOSPC && ext4_count_free_clusters(sb)))
+ return err;
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with"
+ " max blocks %u with error %d",
+ inode->i_ino,
+ (unsigned long long)map->m_lblk,
+ (unsigned)map->m_len, -err);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (err == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ invalidate_dirty_pages:
+ *give_up_on_write = true;
+ return err;
+ }
+ /*
+ * Update buffer state, submit mapped pages, and get us new
+ * extent to map
+ */
+ err = mpage_map_and_submit_buffers(mpd);
+ if (err < 0)
+ return err;
+ }
+
+ /* Update on-disk size after IO is submitted */
+ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ int err2;
+
+ ext4_update_i_disksize(inode, disksize);
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ if (err2)
+ ext4_error(inode->i_sb,
+ "Failed to mark inode %lu dirty",
+ inode->i_ino);
+ if (!err)
+ err = err2;
+ }
+ return err;
+}
+
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_writepages(). We map an extent of
+ * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+ int bpp = ext4_journal_blocks_per_page(inode);
+
+ return ext4_meta_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * and underlying extent to map
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't mapped we start accumulating
+ * extent of buffers underlying these pages that needs mapping (formed by
+ * either delayed or unwritten buffers). We also lock the pages containing
+ * these buffers. The extent found is returned in @mpd structure (starting at
+ * mpd->lblk with length mpd->len blocks).
+ *
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem as an
+ * unnecessary complication, it is actually inevitable in blocksize < pagesize
+ * case as we need to track IO to all buffers underlying a page in one io_end.
+ */
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct pagevec pvec;
+ unsigned int nr_pages;
+ pgoff_t index = mpd->first_page;
+ pgoff_t end = mpd->last_page;
+ int tag;
+ int i, err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ext4_lblk_t lblk;
+ struct buffer_head *head;
+
+ if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
- *done_index = index;
+ pagevec_init(&pvec, 0);
+ mpd->map.m_len = 0;
+ mpd->next_page = index;
while (index <= end) {
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
- return 0;
+ goto out;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -2318,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle,
if (page->index > end)
goto out;
- *done_index = page->index + 1;
-
- /*
- * If we can't merge this page, and we have
- * accumulated an contiguous region, write it
- */
- if ((mpd->next_page != page->index) &&
- (mpd->next_page != mpd->first_page)) {
- mpage_da_map_and_submit(mpd);
- goto ret_extent_tail;
- }
+ /* If we can't merge this page, we are done. */
+ if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+ goto out;
lock_page(page);
-
/*
- * If the page is no longer dirty, or its
- * mapping no longer corresponds to inode we
- * are writing (which means it has been
- * truncated or invalidated), or the page is
- * already under writeback and we are not
- * doing a data integrity writeback, skip the page
+ * If the page is no longer dirty, or its mapping no
+ * longer corresponds to inode we are writing (which
+ * means it has been truncated or invalidated), or the
+ * page is already under writeback and we are not doing
+ * a data integrity writeback, skip the page
*/
if (!PageDirty(page) ||
(PageWriteback(page) &&
- (wbc->sync_mode == WB_SYNC_NONE)) ||
+ (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
unlikely(page->mapping != mapping)) {
unlock_page(page);
continue;
@@ -2351,106 +2318,70 @@ static int write_cache_pages_da(handle_t *handle,
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
- /*
- * If we have inline data and arrive here, it means that
- * we will soon create the block for the 1st page, so
- * we'd better clear the inline data here.
- */
- if (ext4_has_inline_data(inode)) {
- BUG_ON(ext4_test_inode_state(inode,
- EXT4_STATE_MAY_INLINE_DATA));
- ext4_destroy_inline_data(handle, inode);
- }
-
- if (mpd->next_page != page->index)
+ if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
- logical = (sector_t) page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
/* Add all dirty buffers to mpd */
+ lblk = ((ext4_lblk_t)page->index) <<
+ (PAGE_CACHE_SHIFT - blkbits);
head = page_buffers(page);
- bh = head;
- do {
- BUG_ON(buffer_locked(bh));
- /*
- * We need to try to allocate unmapped blocks
- * in the same page. Otherwise we won't make
- * progress with the page in ext4_writepage
- */
- if (ext4_bh_delay_or_unwritten(NULL, bh)) {
- mpage_add_bh_to_extent(mpd, logical,
- bh->b_state);
- if (mpd->io_done)
- goto ret_extent_tail;
- } else if (buffer_dirty(bh) &&
- buffer_mapped(bh)) {
- /*
- * mapped dirty buffer. We need to
- * update the b_state because we look
- * at b_state in mpage_da_map_blocks.
- * We don't update b_size because if we
- * find an unmapped buffer_head later
- * we need to use the b_state flag of
- * that buffer_head.
- */
- if (mpd->b_size == 0)
- mpd->b_state =
- bh->b_state & BH_FLAGS;
- }
- logical++;
- } while ((bh = bh->b_this_page) != head);
-
- if (nr_to_write > 0) {
- nr_to_write--;
- if (nr_to_write == 0 &&
- wbc->sync_mode == WB_SYNC_NONE)
- /*
- * We stop writing back only if we are
- * not doing integrity sync. In case of
- * integrity sync we have to keep going
- * because someone may be concurrently
- * dirtying pages, and we might have
- * synced a lot of newly appeared dirty
- * pages, but have not synced all of the
- * old dirty pages.
- */
+ if (!add_page_bufs_to_extent(mpd, head, head, lblk))
+ goto out;
+ /* So far everything mapped? Submit the page for IO. */
+ if (mpd->map.m_len == 0) {
+ err = mpage_submit_page(mpd, page);
+ if (err < 0)
goto out;
}
+
+ /*
+ * Accumulated enough dirty pages? This doesn't apply
+ * to WB_SYNC_ALL mode. For integrity sync we have to
+ * keep going because someone may be concurrently
+ * dirtying pages, and we might have synced a lot of
+ * newly appeared dirty pages, but have not synced all
+ * of the old dirty pages.
+ */
+ if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
+ mpd->next_page - mpd->first_page >=
+ mpd->wbc->nr_to_write)
+ goto out;
}
pagevec_release(&pvec);
cond_resched();
}
return 0;
-ret_extent_tail:
- ret = MPAGE_DA_EXTENT_TAIL;
out:
pagevec_release(&pvec);
- cond_resched();
- return ret;
+ return err;
}
+static int __writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = ext4_writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
-static int ext4_da_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- pgoff_t index;
+ pgoff_t writeback_index = 0;
+ long nr_to_write = wbc->nr_to_write;
int range_whole = 0;
+ int cycled = 1;
handle_t *handle = NULL;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
- int pages_written = 0;
- unsigned int max_pages;
- int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0;
- long desired_nr_to_write, nr_to_writebump = 0;
- loff_t range_start = wbc->range_start;
+ int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
- pgoff_t done_index = 0;
- pgoff_t end;
+ bool done;
struct blk_plug plug;
+ bool give_up_on_write = false;
- trace_ext4_da_writepages(inode, wbc);
+ trace_ext4_writepages(inode, wbc);
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2460,164 +2391,165 @@ static int ext4_da_writepages(struct address_space *mapping,
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
+ if (ext4_should_journal_data(inode)) {
+ struct blk_plug plug;
+ int ret;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ return ret;
+ }
+
/*
* If the filesystem has aborted, it is read-only, so return
* right away instead of dumping stack traces later on that
* will obscure the real source of the problem. We test
* EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
* the latter could be true if the filesystem is mounted
- * read-only, and in that case, ext4_da_writepages should
+ * read-only, and in that case, ext4_writepages should
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * We may need to convert upto one extent per block in
+ * the page and we may dirty the inode.
+ */
+ rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+ }
+
+ /*
+ * If we have inline data and arrive here, it means that
+ * we will soon create the block for the 1st page, so
+ * we'd better clear the inline data here.
+ */
+ if (ext4_has_inline_data(inode)) {
+ /* Just inode will be modified... */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ BUG_ON(ext4_test_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- range_cyclic = wbc->range_cyclic;
if (wbc->range_cyclic) {
- index = mapping->writeback_index;
- if (index)
+ writeback_index = mapping->writeback_index;
+ if (writeback_index)
cycled = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = LLONG_MAX;
- wbc->range_cyclic = 0;
- end = -1;
+ mpd.first_page = writeback_index;
+ mpd.last_page = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
- }
-
- /*
- * This works around two forms of stupidity. The first is in
- * the writeback code, which caps the maximum number of pages
- * written to be 1024 pages. This is wrong on multiple
- * levels; different architectues have a different page size,
- * which changes the maximum amount of data which gets
- * written. Secondly, 4 megabytes is way too small. XFS
- * forces this value to be 16 megabytes by multiplying
- * nr_to_write parameter by four, and then relies on its
- * allocator to allocate larger extents to make them
- * contiguous. Unfortunately this brings us to the second
- * stupidity, which is that ext4's mballoc code only allocates
- * at most 2048 blocks. So we force contiguous writes up to
- * the number of dirty blocks in the inode, or
- * sbi->max_writeback_mb_bump whichever is smaller.
- */
- max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
- if (!range_cyclic && range_whole) {
- if (wbc->nr_to_write == LONG_MAX)
- desired_nr_to_write = wbc->nr_to_write;
- else
- desired_nr_to_write = wbc->nr_to_write * 8;
- } else
- desired_nr_to_write = ext4_num_dirty_pages(inode, index,
- max_pages);
- if (desired_nr_to_write > max_pages)
- desired_nr_to_write = max_pages;
-
- if (wbc->nr_to_write < desired_nr_to_write) {
- nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
- wbc->nr_to_write = desired_nr_to_write;
+ mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+ mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
}
+ mpd.inode = inode;
+ mpd.wbc = wbc;
+ ext4_io_submit_init(&mpd.io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, index, end);
-
+ tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ done = false;
blk_start_plug(&plug);
- while (!ret && wbc->nr_to_write > 0) {
+ while (!done && mpd.first_page <= mpd.last_page) {
+ /* For each extent of pages we use new io_end */
+ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd.io_submit.io_end) {
+ ret = -ENOMEM;
+ break;
+ }
/*
- * we insert one extent at a time. So we need
- * credit needed for single extent allocation.
- * journalled mode is currently not supported
- * by delalloc
+ * We have two constraints: We find one extent to map and we
+ * must always write out whole page (makes a difference when
+ * blocksize < pagesize) so that we don't block on IO when we
+ * try to write out the rest of the page. Journalled mode is
+ * not supported by delalloc.
*/
BUG_ON(ext4_should_journal_data(inode));
needed_blocks = ext4_da_writepages_trans_blocks(inode);
- /* start a new transaction*/
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- needed_blocks);
+ /* start a new transaction */
+ handle = ext4_journal_start_with_reserve(inode,
+ EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
- blk_finish_plug(&plug);
- goto out_writepages;
+ /* Release allocated io_end */
+ ext4_put_io_end(mpd.io_submit.io_end);
+ break;
}
- /*
- * Now call write_cache_pages_da() to find the next
- * contiguous region of logical blocks that need
- * blocks to be allocated by ext4 and submit them.
- */
- ret = write_cache_pages_da(handle, mapping,
- wbc, &mpd, &done_index);
- /*
- * If we have a contiguous extent of pages and we
- * haven't done the I/O yet, map the blocks and submit
- * them for I/O.
- */
- if (!mpd.io_done && mpd.next_page != mpd.first_page) {
- mpage_da_map_and_submit(&mpd);
- ret = MPAGE_DA_EXTENT_TAIL;
+ trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+ ret = mpage_prepare_extent_to_map(&mpd);
+ if (!ret) {
+ if (mpd.map.m_len)
+ ret = mpage_map_and_submit_extent(handle, &mpd,
+ &give_up_on_write);
+ else {
+ /*
+ * We scanned the whole range (or exhausted
+ * nr_to_write), submitted what was mapped and
+ * didn't find anything needing mapping. We are
+ * done.
+ */
+ done = true;
+ }
}
- trace_ext4_da_write_pages(inode, &mpd);
- wbc->nr_to_write -= mpd.pages_written;
-
ext4_journal_stop(handle);
-
- if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
- /* commit the transaction which would
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Drop our io_end reference we got from init */
+ ext4_put_io_end(mpd.io_submit.io_end);
+
+ if (ret == -ENOSPC && sbi->s_journal) {
+ /*
+ * Commit the transaction which would
* free blocks released in the transaction
* and try again
*/
jbd2_journal_force_commit_nested(sbi->s_journal);
ret = 0;
- } else if (ret == MPAGE_DA_EXTENT_TAIL) {
- /*
- * Got one extent now try with rest of the pages.
- * If mpd.retval is set -EIO, journal is aborted.
- * So we don't need to write any more.
- */
- pages_written += mpd.pages_written;
- ret = mpd.retval;
- io_done = 1;
- } else if (wbc->nr_to_write)
- /*
- * There is no more writeout needed
- * or we requested for a noblocking writeout
- * and we found the device congested
- */
+ continue;
+ }
+ /* Fatal error - ENOMEM, EIO... */
+ if (ret)
break;
}
blk_finish_plug(&plug);
- if (!io_done && !cycled) {
+ if (!ret && !cycled) {
cycled = 1;
- index = 0;
- wbc->range_start = index << PAGE_CACHE_SHIFT;
- wbc->range_end = mapping->writeback_index - 1;
+ mpd.last_page = writeback_index - 1;
+ mpd.first_page = 0;
goto retry;
}
/* Update index */
- wbc->range_cyclic = range_cyclic;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
- * set the writeback_index so that range_cyclic
+ * Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = done_index;
+ mapping->writeback_index = mpd.first_page;
out_writepages:
- wbc->nr_to_write -= nr_to_writebump;
- wbc->range_start = range_start;
- trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+ trace_ext4_writepages_result(inode, wbc, ret,
+ nr_to_write - wbc->nr_to_write);
return ret;
}
@@ -2829,7 +2761,8 @@ static int ext4_da_write_end(struct file *file,
return ret ? ret : copied;
}
-static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
/*
* Drop reserved blocks
@@ -2838,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
if (!page_has_buffers(page))
goto out;
- ext4_da_page_release_reservation(page, offset);
+ ext4_da_page_release_reservation(page, offset, length);
out:
- ext4_invalidatepage(page, offset);
+ ext4_invalidatepage(page, offset, length);
return;
}
@@ -2864,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
* laptop_mode, not even desirable). However, to do otherwise
* would require replicating code paths in:
*
- * ext4_da_writepages() ->
+ * ext4_writepages() ->
* write_cache_pages() ---> (via passed in callback function)
* __mpage_da_writepage() -->
* mpage_add_bh_to_extent()
@@ -2989,37 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- trace_ext4_invalidatepage(page, offset);
+ trace_ext4_invalidatepage(page, offset, length);
/* No journalling happens on data buffers when this function is used */
WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
- block_invalidatepage(page, offset);
+ block_invalidatepage(page, offset, length);
}
static int __ext4_journalled_invalidatepage(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
- trace_ext4_journalled_invalidatepage(page, offset);
+ trace_ext4_journalled_invalidatepage(page, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0)
+ if (offset == 0 && length == PAGE_CACHE_SIZE)
ClearPageChecked(page);
- return jbd2_journal_invalidatepage(journal, page, offset);
+ return jbd2_journal_invalidatepage(journal, page, offset, length);
}
/* Wrapper for aops... */
static void ext4_journalled_invalidatepage(struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
- WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
+ WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3067,9 +3003,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
struct inode *inode = file_inode(iocb->ki_filp);
ext4_io_end_t *io_end = iocb->private;
- /* if not async direct IO or dio with 0 bytes write, just return */
- if (!io_end || !size)
- goto out;
+ /* if not async direct IO just return */
+ if (!io_end) {
+ inode_dio_done(inode);
+ if (is_async)
+ aio_complete(iocb, ret, 0);
+ return;
+ }
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3077,25 +3017,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
size);
iocb->private = NULL;
-
- /* if not aio dio with unwritten extents, just free io and return */
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- ext4_free_io_end(io_end);
-out:
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
- return;
- }
-
io_end->offset = offset;
io_end->size = size;
if (is_async) {
io_end->iocb = iocb;
io_end->result = ret;
}
-
- ext4_add_complete_io(io_end);
+ ext4_put_io_end_defer(io_end);
}
/*
@@ -3129,6 +3057,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
get_block_t *get_block_func = NULL;
int dio_flags = 0;
loff_t final_size = offset + count;
+ ext4_io_end_t *io_end = NULL;
/* Use the old path for reads and writes beyond i_size. */
if (rw != WRITE || final_size > inode->i_size)
@@ -3136,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
BUG_ON(iocb->private == NULL);
+ /*
+ * Make all waiters for direct IO properly wait also for extent
+ * conversion. This also disallows race between truncate() and
+ * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+ */
+ if (rw == WRITE)
+ atomic_inc(&inode->i_dio_count);
+
/* If we do a overwrite dio, i_mutex locking can be released */
overwrite = *((int *)iocb->private);
if (overwrite) {
- atomic_inc(&inode->i_dio_count);
down_read(&EXT4_I(inode)->i_data_sem);
mutex_unlock(&inode->i_mutex);
}
@@ -3167,13 +3103,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
iocb->private = NULL;
ext4_inode_aio_set(inode, NULL);
if (!is_sync_kiocb(iocb)) {
- ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+ io_end = ext4_init_io_end(inode, GFP_NOFS);
if (!io_end) {
ret = -ENOMEM;
goto retake_lock;
}
io_end->flag |= EXT4_IO_END_DIRECT;
- iocb->private = io_end;
+ /*
+ * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+ */
+ iocb->private = ext4_get_io_end(io_end);
/*
* we save the io structure for current async direct
* IO, so that later ext4_map_blocks() could flag the
@@ -3197,33 +3136,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
NULL,
dio_flags);
- if (iocb->private)
- ext4_inode_aio_set(inode, NULL);
/*
- * The io_end structure takes a reference to the inode, that
- * structure needs to be destroyed and the reference to the
- * inode need to be dropped, when IO is complete, even with 0
- * byte write, or failed.
- *
- * In the successful AIO DIO case, the io_end structure will
- * be destroyed and the reference to the inode will be dropped
- * after the end_io call back function is called.
- *
- * In the case there is 0 byte write, or error case, since VFS
- * direct IO won't invoke the end_io call back function, we
- * need to free the end_io structure here.
+ * Put our reference to io_end. This can free the io_end structure e.g.
+ * in sync IO case or in case of error. It can even perform extent
+ * conversion if all bios we submitted finished before we got here.
+ * Note that in that case iocb->private can be already set to NULL
+ * here.
*/
- if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
- ext4_free_io_end(iocb->private);
- iocb->private = NULL;
- } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+ if (io_end) {
+ ext4_inode_aio_set(inode, NULL);
+ ext4_put_io_end(io_end);
+ /*
+ * When no IO was submitted ext4_end_io_dio() was not
+ * called so we have to put iocb's reference.
+ */
+ if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+ WARN_ON(iocb->private != io_end);
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ WARN_ON(io_end->iocb);
+ /*
+ * Generic code already did inode_dio_done() so we
+ * have to clear EXT4_IO_END_DIRECT to not do it for
+ * the second time.
+ */
+ io_end->flag = 0;
+ ext4_put_io_end(io_end);
+ iocb->private = NULL;
+ }
+ }
+ if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
int err;
/*
* for non AIO case, since the IO is already
* completed, we could do the conversion right here
*/
- err = ext4_convert_unwritten_extents(inode,
+ err = ext4_convert_unwritten_extents(NULL, inode,
offset, ret);
if (err < 0)
ret = err;
@@ -3231,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
}
retake_lock:
+ if (rw == WRITE)
+ inode_dio_done(inode);
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite) {
- inode_dio_done(inode);
up_read(&EXT4_I(inode)->i_data_sem);
mutex_lock(&inode->i_mutex);
}
@@ -3292,6 +3241,7 @@ static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
.bmap = ext4_bmap,
@@ -3307,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
+ .writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
.set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3322,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
- .writepages = ext4_da_writepages,
+ .writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
.bmap = ext4_bmap,
@@ -3355,89 +3306,56 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_aops;
}
-
/*
- * ext4_discard_partial_page_buffers()
- * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
- * This function finds and locks the page containing the offset
- * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
- * Calling functions that already have the page locked should call
- * ext4_discard_partial_page_buffers_no_lock directly.
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_discard_partial_page_buffers(handle_t *handle,
- struct address_space *mapping, loff_t from,
- loff_t length, int flags)
+int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from)
{
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
struct inode *inode = mapping->host;
- struct page *page;
- int err = 0;
- page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (!page)
- return -ENOMEM;
-
- err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
- from, length, flags);
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
- unlock_page(page);
- page_cache_release(page);
- return err;
+ return ext4_block_zero_page_range(handle, mapping, from, length);
}
/*
- * ext4_discard_partial_page_buffers_no_lock()
- * Zeros a page range of length 'length' starting from offset 'from'.
- * Buffer heads that correspond to the block aligned regions of the
- * zeroed range will be unmapped. Unblock aligned regions
- * will have the corresponding buffer head mapped if needed so that
- * that region of the page can be updated with the partial zero out.
- *
- * This function assumes that the page has already been locked. The
- * The range to be discarded must be contained with in the given page.
- * If the specified range exceeds the end of the page it will be shortened
- * to the end of the page that corresponds to 'from'. This function is
- * appropriate for updating a page and it buffer heads to be unmapped and
- * zeroed for blocks that have been either released, or are going to be
- * released.
- *
- * handle: The journal handle
- * inode: The files inode
- * page: A locked page that contains the offset "from"
- * from: The starting byte offset (from the beginning of the file)
- * to begin discarding
- * len: The length of bytes to discard
- * flags: Optional flags that may be used:
- *
- * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
- * Only zero the regions of the page whose buffer heads
- * have already been unmapped. This flag is appropriate
- * for updating the contents of a page whose blocks may
- * have already been released, and we only want to zero
- * out the regions that correspond to those released blocks.
- *
- * Returns zero on success or negative on failure.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained with in one block. If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
*/
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
- struct inode *inode, struct page *page, loff_t from,
- loff_t length, int flags)
+int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
- unsigned int offset = from & (PAGE_CACHE_SIZE-1);
- unsigned int blocksize, max, pos;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize, max, pos;
ext4_lblk_t iblock;
+ struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
- blocksize = inode->i_sb->s_blocksize;
- max = PAGE_CACHE_SIZE - offset;
+ page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+ mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (!page)
+ return -ENOMEM;
- if (index != page->index)
- return -EINVAL;
+ blocksize = inode->i_sb->s_blocksize;
+ max = blocksize - (offset & (blocksize - 1));
/*
* correct length if it does not fall between
- * 'from' and the end of the page
+ * 'from' and the end of the block
*/
if (length > max || length < 0)
length = max;
@@ -3455,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
iblock++;
pos += blocksize;
}
-
- pos = offset;
- while (pos < offset + length) {
- unsigned int end_of_block, range_to_discard;
-
- err = 0;
-
- /* The length of space left to zero and unmap */
- range_to_discard = offset + length - pos;
-
- /* The length of space until the end of the block */
- end_of_block = blocksize - (pos & (blocksize-1));
-
- /*
- * Do not unmap or zero past end of block
- * for this buffer head
- */
- if (range_to_discard > end_of_block)
- range_to_discard = end_of_block;
-
-
- /*
- * Skip this buffer head if we are only zeroing unampped
- * regions of the page
- */
- if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
- buffer_mapped(bh))
- goto next;
-
- /* If the range is block aligned, unmap */
- if (range_to_discard == blocksize) {
- clear_buffer_dirty(bh);
- bh->b_bdev = NULL;
- clear_buffer_mapped(bh);
- clear_buffer_req(bh);
- clear_buffer_new(bh);
- clear_buffer_delay(bh);
- clear_buffer_unwritten(bh);
- clear_buffer_uptodate(bh);
- zero_user(page, pos, range_to_discard);
- BUFFER_TRACE(bh, "Buffer discarded");
- goto next;
- }
-
- /*
- * If this block is not completely contained in the range
- * to be discarded, then it is not going to be released. Because
- * we need to keep this block, we need to make sure this part
- * of the page is uptodate before we modify it by writeing
- * partial zeros on it.
- */
+ if (buffer_freed(bh)) {
+ BUFFER_TRACE(bh, "freed: skip");
+ goto unlock;
+ }
+ if (!buffer_mapped(bh)) {
+ BUFFER_TRACE(bh, "unmapped");
+ ext4_get_block(inode, iblock, bh, 0);
+ /* unmapped? It's a hole - nothing to do */
if (!buffer_mapped(bh)) {
- /*
- * Buffer head must be mapped before we can read
- * from the block
- */
- BUFFER_TRACE(bh, "unmapped");
- ext4_get_block(inode, iblock, bh, 0);
- /* unmapped? It's a hole - nothing to do */
- if (!buffer_mapped(bh)) {
- BUFFER_TRACE(bh, "still unmapped");
- goto next;
- }
+ BUFFER_TRACE(bh, "still unmapped");
+ goto unlock;
}
+ }
- /* Ok, it's mapped. Make sure it's up-to-date */
- if (PageUptodate(page))
- set_buffer_uptodate(bh);
+ /* Ok, it's mapped. Make sure it's up-to-date */
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
- if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
- /* Uhhuh. Read error. Complain and punt.*/
- if (!buffer_uptodate(bh))
- goto next;
- }
+ if (!buffer_uptodate(bh)) {
+ err = -EIO;
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ /* Uhhuh. Read error. Complain and punt. */
+ if (!buffer_uptodate(bh))
+ goto unlock;
+ }
+ if (ext4_should_journal_data(inode)) {
+ BUFFER_TRACE(bh, "get write access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto unlock;
+ }
+ zero_user(page, offset, length);
+ BUFFER_TRACE(bh, "zeroed end of block");
- if (ext4_should_journal_data(inode)) {
- BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, bh);
- if (err)
- goto next;
- }
+ if (ext4_should_journal_data(inode)) {
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ } else {
+ err = 0;
+ mark_buffer_dirty(bh);
+ if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+ err = ext4_jbd2_file_inode(handle, inode);
+ }
+
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
- zero_user(page, pos, range_to_discard);
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t length)
+{
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned partial_start, partial_end;
+ ext4_fsblk_t start, end;
+ loff_t byte_end = (lstart + length - 1);
+ int err = 0;
- err = 0;
- if (ext4_should_journal_data(inode)) {
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- } else
- mark_buffer_dirty(bh);
+ partial_start = lstart & (sb->s_blocksize - 1);
+ partial_end = byte_end & (sb->s_blocksize - 1);
- BUFFER_TRACE(bh, "Partial buffer zeroed");
-next:
- bh = bh->b_this_page;
- iblock++;
- pos += range_to_discard;
- }
+ start = lstart >> sb->s_blocksize_bits;
+ end = byte_end >> sb->s_blocksize_bits;
+ /* Handle partial zero within the single block */
+ if (start == end &&
+ (partial_start || (partial_end != sb->s_blocksize - 1))) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, length);
+ return err;
+ }
+ /* Handle partial zero out on the start of the range */
+ if (partial_start) {
+ err = ext4_block_zero_page_range(handle, mapping,
+ lstart, sb->s_blocksize);
+ if (err)
+ return err;
+ }
+ /* Handle partial zero out on the end of the range */
+ if (partial_end != sb->s_blocksize - 1)
+ err = ext4_block_zero_page_range(handle, mapping,
+ byte_end - partial_end,
+ partial_end + 1);
return err;
}
@@ -3580,14 +3483,12 @@ int ext4_can_truncate(struct inode *inode)
* Returns: 0 on success or negative on failure
*/
-int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
{
- struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
- loff_t first_page, last_page, page_len;
- loff_t first_page_offset, last_page_offset;
+ loff_t first_block_offset, last_block_offset;
handle_t *handle;
unsigned int credits;
int ret = 0;
@@ -3638,23 +3539,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
offset;
}
- first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+ first_block_offset = round_up(offset, sb->s_blocksize);
+ last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
- first_page_offset = first_page << PAGE_CACHE_SHIFT;
- last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
- /* Now release the pages */
- if (last_page_offset > first_page_offset) {
- truncate_pagecache_range(inode, first_page_offset,
- last_page_offset - 1);
- }
+ /* Now release the pages and zero block aligned part of pages*/
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
- ret = ext4_flush_unwritten_io(inode);
- if (ret)
- goto out_dio;
inode_dio_wait(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3668,66 +3562,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
goto out_dio;
}
- /*
- * Now we need to zero out the non-page-aligned data in the
- * pages at the start and tail of the hole, and unmap the
- * buffer heads for the block aligned regions of the page that
- * were completely zeroed.
- */
- if (first_page > last_page) {
- /*
- * If the file space being truncated is contained
- * within a page just zero out and unmap the middle of
- * that page
- */
- ret = ext4_discard_partial_page_buffers(handle,
- mapping, offset, length, 0);
-
- if (ret)
- goto out_stop;
- } else {
- /*
- * zero out and unmap the partial page that contains
- * the start of the hole
- */
- page_len = first_page_offset - offset;
- if (page_len > 0) {
- ret = ext4_discard_partial_page_buffers(handle, mapping,
- offset, page_len, 0);
- if (ret)
- goto out_stop;
- }
-
- /*
- * zero out and unmap the partial page that contains
- * the end of the hole
- */
- page_len = offset + length - last_page_offset;
- if (page_len > 0) {
- ret = ext4_discard_partial_page_buffers(handle, mapping,
- last_page_offset, page_len, 0);
- if (ret)
- goto out_stop;
- }
- }
-
- /*
- * If i_size is contained in the last page, we need to
- * unmap and zero the partial page after i_size
- */
- if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
- inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- if (page_len > 0) {
- ret = ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0);
-
- if (ret)
- goto out_stop;
- }
- }
+ ret = ext4_zero_partial_blocks(handle, inode, offset,
+ length);
+ if (ret)
+ goto out_stop;
first_block = (offset + sb->s_blocksize - 1) >>
EXT4_BLOCK_SIZE_BITS(sb);
@@ -3803,7 +3641,6 @@ void ext4_truncate(struct inode *inode)
unsigned int credits;
handle_t *handle;
struct address_space *mapping = inode->i_mapping;
- loff_t page_len;
/*
* There is a possibility that we're either freeing the inode
@@ -3830,12 +3667,6 @@ void ext4_truncate(struct inode *inode)
return;
}
- /*
- * finish any pending end_io work so we won't run the risk of
- * converting any truncated blocks to initialized later
- */
- ext4_flush_unwritten_io(inode);
-
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
else
@@ -3847,14 +3678,8 @@ void ext4_truncate(struct inode *inode)
return;
}
- if (inode->i_size % PAGE_CACHE_SIZE != 0) {
- page_len = PAGE_CACHE_SIZE -
- (inode->i_size & (PAGE_CACHE_SIZE - 1));
-
- if (ext4_discard_partial_page_buffers(handle,
- mapping, inode->i_size, page_len, 0))
- goto out_stop;
- }
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
/*
* We add the inode to the orphan list, so that if this
@@ -4623,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
inode->i_size >> PAGE_CACHE_SHIFT);
if (!page)
return;
- ret = __ext4_journalled_invalidatepage(page, offset);
+ ret = __ext4_journalled_invalidatepage(page, offset,
+ PAGE_CACHE_SIZE - offset);
unlock_page(page);
page_cache_release(page);
if (ret != -EBUSY)
@@ -4805,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct inode *inode;
- unsigned long delalloc_blocks;
+ unsigned long long delalloc_blocks;
inode = dentry->d_inode;
generic_fillattr(inode, stat);
@@ -4823,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
EXT4_I(inode)->i_reserved_data_blocks);
- stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
return 0;
}
-static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return ext4_ind_trans_blocks(inode, nrblocks, chunk);
- return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+ return ext4_ind_trans_blocks(inode, lblocks);
+ return ext4_ext_index_trans_blocks(inode, pextents);
}
/*
@@ -4845,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
@@ -4853,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
int ret = 0;
/*
- * How many index blocks need to touch to modify nrblocks?
- * The "Chunk" flag indicating whether the nrblocks is
- * physically contiguous on disk
- *
- * For Direct IO and fallocate, they calls get_block to allocate
- * one single extent at a time, so they could set the "Chunk" flag
+ * How many index blocks need to touch to map @lblocks logical blocks
+ * to @pextents physical extents?
*/
- idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
ret = idxblocks;
@@ -4868,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* Now let's see how many group bitmaps and group descriptors need
* to account
*/
- groups = idxblocks;
- if (chunk)
- groups += 1;
- else
- groups += nrblocks;
-
+ groups = idxblocks + pextents;
gdpblocks = groups;
if (groups > ngroups)
groups = ngroups;
@@ -4904,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
int bpp = ext4_journal_blocks_per_page(inode);
int ret;
- ret = ext4_meta_trans_blocks(inode, bpp, 0);
+ ret = ext4_meta_trans_blocks(inode, bpp, bpp);
/* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index def84082a9a9..a9ff5e5137ca 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2105,6 +2105,7 @@ repeat:
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
+ cond_resched();
/*
* Artificially restricted ngroups for non-extent
* files makes group > ngroups possible on first loop.
@@ -4405,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
- if (*errp) {
- ext4_discard_allocated_blocks(ac);
- goto errout;
- }
+ if (*errp)
+ goto discard_and_exit;
/* as we've just preallocated more space than
- * user requested orinally, we store allocated
+ * user requested originally, we store allocated
* space in a special descriptor */
if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- ext4_mb_new_preallocation(ac);
+ ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ *errp = ext4_mb_new_preallocation(ac);
+ if (*errp) {
+ discard_and_exit:
+ ext4_discard_allocated_blocks(ac);
+ goto errout;
+ }
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4612,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
BUG_ON(bh && (count > 1));
for (i = 0; i < count; i++) {
+ cond_resched();
if (!bh)
tbh = sb_find_get_block(inode->i_sb,
block + i);
- if (unlikely(!tbh))
+ if (!tbh)
continue;
ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
inode, tbh, block + i);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3dcbf364022f..e86dddbd8296 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
struct page *pagep[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset;
- long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
unsigned int tmp_data_size, data_size, replaced_size;
@@ -940,8 +939,6 @@ again:
orig_blk_offset = orig_page_offset * blocks_per_page +
data_offset_in_page;
- offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
-
/* Calculate data_size */
if ((orig_blk_offset + block_len_in_page - 1) ==
((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f91002f8c017..234b834d5a97 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
bh->b_data, bh->b_size,
(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+ ((char *)de - bh->b_data))) {
- /* On error, skip the f_pos to the next block. */
- dir_file->f_pos = (dir_file->f_pos |
- (dir->i_sb->s_blocksize - 1)) + 1;
- brelse(bh);
- return count;
+ /* silently ignore the rest of the block */
+ break;
}
ext4fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4acf1f78881b..48786cdb5e6c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -46,46 +46,121 @@ void ext4_exit_pageio(void)
}
/*
- * This function is called by ext4_evict_inode() to make sure there is
- * no more pending I/O completion work left to do.
+ * Print an buffer I/O error compatible with the fs/buffer.c. This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message. We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
*/
-void ext4_ioend_shutdown(struct inode *inode)
+static void buffer_io_error(struct buffer_head *bh)
{
- wait_queue_head_t *wq = ext4_ioend_wq(inode);
+ char b[BDEVNAME_SIZE];
+ printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+ bdevname(bh->b_bdev, b),
+ (unsigned long long)bh->b_blocknr);
+}
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
- /*
- * We need to make sure the work structure is finished being
- * used before we let the inode get destroyed.
- */
- if (work_pending(&EXT4_I(inode)->i_unwritten_work))
- cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
+static void ext4_finish_bio(struct bio *bio)
+{
+ int i;
+ int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ struct bio_vec *bvec = &bio->bi_io_vec[i];
+ struct page *page = bvec->bv_page;
+ struct buffer_head *bh, *head;
+ unsigned bio_start = bvec->bv_offset;
+ unsigned bio_end = bio_start + bvec->bv_len;
+ unsigned under_io = 0;
+ unsigned long flags;
+
+ if (!page)
+ continue;
+
+ if (error) {
+ SetPageError(page);
+ set_bit(AS_EIO, &page->mapping->flags);
+ }
+ bh = head = page_buffers(page);
+ /*
+ * We check all buffers in the page under BH_Uptodate_Lock
+ * to avoid races with other end io clearing async_write flags
+ */
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ do {
+ if (bh_offset(bh) < bio_start ||
+ bh_offset(bh) + bh->b_size > bio_end) {
+ if (buffer_async_write(bh))
+ under_io++;
+ continue;
+ }
+ clear_buffer_async_write(bh);
+ if (error)
+ buffer_io_error(bh);
+ } while ((bh = bh->b_this_page) != head);
+ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ local_irq_restore(flags);
+ if (!under_io)
+ end_page_writeback(page);
+ }
}
-void ext4_free_io_end(ext4_io_end_t *io)
+static void ext4_release_io_end(ext4_io_end_t *io_end)
{
- BUG_ON(!io);
- BUG_ON(!list_empty(&io->list));
- BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
+ struct bio *bio, *next_bio;
+
+ BUG_ON(!list_empty(&io_end->list));
+ BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+ WARN_ON(io_end->handle);
- if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
- wake_up_all(ext4_ioend_wq(io->inode));
- kmem_cache_free(io_end_cachep, io);
+ if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
+ wake_up_all(ext4_ioend_wq(io_end->inode));
+
+ for (bio = io_end->bio; bio; bio = next_bio) {
+ next_bio = bio->bi_private;
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ }
+ if (io_end->flag & EXT4_IO_END_DIRECT)
+ inode_dio_done(io_end->inode);
+ if (io_end->iocb)
+ aio_complete(io_end->iocb, io_end->result, 0);
+ kmem_cache_free(io_end_cachep, io_end);
}
-/* check a range of space and convert unwritten extents to written. */
+static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
+{
+ struct inode *inode = io_end->inode;
+
+ io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* Wake up anyone waiting on unwritten extent conversion */
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+ wake_up_all(ext4_ioend_wq(inode));
+}
+
+/*
+ * Check a range of space and convert unwritten extents to written. Note that
+ * we are protected from truncate touching same part of extent tree by the
+ * fact that truncate code waits for all DIO to finish (thus exclusion from
+ * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
+ * completed (happens from ext4_free_ioend()).
+ */
static int ext4_end_io(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
ssize_t size = io->size;
+ handle_t *handle = io->handle;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io, inode->i_ino, io->list.next, io->list.prev);
- ret = ext4_convert_unwritten_extents(inode, offset, size);
+ io->handle = NULL; /* Following call will use up the handle */
+ ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
if (ret < 0) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
@@ -93,30 +168,22 @@ static int ext4_end_io(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
- /* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(inode));
- if (io->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(inode);
- if (io->iocb)
- aio_complete(io->iocb, io->result, 0);
+ ext4_clear_io_unwritten_flag(io);
+ ext4_release_io_end(io);
return ret;
}
-static void dump_completed_IO(struct inode *inode)
+static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
- if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
- ext4_debug("inode %lu completed_io list is empty\n",
- inode->i_ino);
+ if (list_empty(head))
return;
- }
- ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
- list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
+ ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
+ list_for_each_entry(io, head, list) {
cur = &io->list;
before = cur->prev;
io0 = container_of(before, ext4_io_end_t, list);
@@ -130,23 +197,30 @@ static void dump_completed_IO(struct inode *inode)
}
/* Add the io_end to per-inode completed end_io list. */
-void ext4_add_complete_io(ext4_io_end_t *io_end)
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
struct ext4_inode_info *ei = EXT4_I(io_end->inode);
struct workqueue_struct *wq;
unsigned long flags;
BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
- wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (list_empty(&ei->i_completed_io_list))
- queue_work(wq, &ei->i_unwritten_work);
- list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ if (io_end->handle) {
+ wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+ if (list_empty(&ei->i_rsv_conversion_list))
+ queue_work(wq, &ei->i_rsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
+ } else {
+ wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
+ if (list_empty(&ei->i_unrsv_conversion_list))
+ queue_work(wq, &ei->i_unrsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
+ }
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
-static int ext4_do_flush_completed_IO(struct inode *inode)
+static int ext4_do_flush_completed_IO(struct inode *inode,
+ struct list_head *head)
{
ext4_io_end_t *io;
struct list_head unwritten;
@@ -155,8 +229,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
int err, ret = 0;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- dump_completed_IO(inode);
- list_replace_init(&ei->i_completed_io_list, &unwritten);
+ dump_completed_IO(inode, head);
+ list_replace_init(head, &unwritten);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
while (!list_empty(&unwritten)) {
@@ -167,30 +241,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
err = ext4_end_io(io);
if (unlikely(!ret && err))
ret = err;
- io->flag &= ~EXT4_IO_END_UNWRITTEN;
- ext4_free_io_end(io);
}
return ret;
}
/*
- * work on completed aio dio IO, to convert unwritten extents to extents
+ * work on completed IO, to convert unwritten extents to extents
*/
-void ext4_end_io_work(struct work_struct *work)
+void ext4_end_io_rsv_work(struct work_struct *work)
{
struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
- i_unwritten_work);
- ext4_do_flush_completed_IO(&ei->vfs_inode);
+ i_rsv_conversion_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}
-int ext4_flush_unwritten_io(struct inode *inode)
+void ext4_end_io_unrsv_work(struct work_struct *work)
{
- int ret;
- WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
- !(inode->i_state & I_FREEING));
- ret = ext4_do_flush_completed_IO(inode);
- ext4_unwritten_wait(inode);
- return ret;
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_unrsv_conversion_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -200,83 +269,70 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
INIT_LIST_HEAD(&io->list);
+ atomic_set(&io->count, 1);
}
return io;
}
-/*
- * Print an buffer I/O error compatible with the fs/buffer.c. This
- * provides compatibility with dmesg scrapers that look for a specific
- * buffer I/O error message. We really need a unified error reporting
- * structure to userspace ala Digital Unix's uerf system, but it's
- * probably not going to happen in my lifetime, due to LKML politics...
- */
-static void buffer_io_error(struct buffer_head *bh)
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
- char b[BDEVNAME_SIZE];
- printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
- bdevname(bh->b_bdev, b),
- (unsigned long long)bh->b_blocknr);
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ ext4_release_io_end(io_end);
+ return;
+ }
+ ext4_add_complete_io(io_end);
+ }
+}
+
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+ int err = 0;
+
+ if (atomic_dec_and_test(&io_end->count)) {
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ err = ext4_convert_unwritten_extents(io_end->handle,
+ io_end->inode, io_end->offset,
+ io_end->size);
+ io_end->handle = NULL;
+ ext4_clear_io_unwritten_flag(io_end);
+ }
+ ext4_release_io_end(io_end);
+ }
+ return err;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+ atomic_inc(&io_end->count);
+ return io_end;
}
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- struct inode *inode;
- int i;
- int blocksize;
sector_t bi_sector = bio->bi_sector;
BUG_ON(!io_end);
- inode = io_end->inode;
- blocksize = 1 << inode->i_blkbits;
- bio->bi_private = NULL;
bio->bi_end_io = NULL;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
- for (i = 0; i < bio->bi_vcnt; i++) {
- struct bio_vec *bvec = &bio->bi_io_vec[i];
- struct page *page = bvec->bv_page;
- struct buffer_head *bh, *head;
- unsigned bio_start = bvec->bv_offset;
- unsigned bio_end = bio_start + bvec->bv_len;
- unsigned under_io = 0;
- unsigned long flags;
- if (!page)
- continue;
-
- if (error) {
- SetPageError(page);
- set_bit(AS_EIO, &page->mapping->flags);
- }
- bh = head = page_buffers(page);
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
/*
- * We check all buffers in the page under BH_Uptodate_Lock
- * to avoid races with other end io clearing async_write flags
+ * Link bio into list hanging from io_end. We have to do it
+ * atomically as bio completions can be racing against each
+ * other.
*/
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
- do {
- if (bh_offset(bh) < bio_start ||
- bh_offset(bh) + blocksize > bio_end) {
- if (buffer_async_write(bh))
- under_io++;
- continue;
- }
- clear_buffer_async_write(bh);
- if (error)
- buffer_io_error(bh);
- } while ((bh = bh->b_this_page) != head);
- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
- local_irq_restore(flags);
- if (!under_io)
- end_page_writeback(page);
+ bio->bi_private = xchg(&io_end->bio, bio);
+ } else {
+ ext4_finish_bio(bio);
+ bio_put(bio);
}
- bio_put(bio);
if (error) {
- io_end->flag |= EXT4_IO_END_ERROR;
+ struct inode *inode = io_end->inode;
+
ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
"(offset %llu size %ld starting block %llu)",
inode->i_ino,
@@ -285,13 +341,7 @@ static void ext4_end_bio(struct bio *bio, int error)
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
}
-
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- ext4_free_io_end(io_end);
- return;
- }
-
- ext4_add_complete_io(io_end);
+ ext4_put_io_end_defer(io_end);
}
void ext4_io_submit(struct ext4_io_submit *io)
@@ -305,43 +355,38 @@ void ext4_io_submit(struct ext4_io_submit *io)
bio_put(io->io_bio);
}
io->io_bio = NULL;
- io->io_op = 0;
+}
+
+void ext4_io_submit_init(struct ext4_io_submit *io,
+ struct writeback_control *wbc)
+{
+ io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ io->io_bio = NULL;
io->io_end = NULL;
}
-static int io_submit_init(struct ext4_io_submit *io,
- struct inode *inode,
- struct writeback_control *wbc,
- struct buffer_head *bh)
+static int io_submit_init_bio(struct ext4_io_submit *io,
+ struct buffer_head *bh)
{
- ext4_io_end_t *io_end;
- struct page *page = bh->b_page;
int nvecs = bio_get_nr_vecs(bh->b_bdev);
struct bio *bio;
- io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end)
- return -ENOMEM;
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+ if (!bio)
+ return -ENOMEM;
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
- bio->bi_private = io->io_end = io_end;
bio->bi_end_io = ext4_end_bio;
-
- io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
-
+ bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
- io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
io->io_next_block = bh->b_blocknr;
return 0;
}
static int io_submit_add_bh(struct ext4_io_submit *io,
struct inode *inode,
- struct writeback_control *wbc,
struct buffer_head *bh)
{
- ext4_io_end_t *io_end;
int ret;
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
@@ -349,18 +394,14 @@ submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
- ret = io_submit_init(io, inode, wbc, bh);
+ ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
}
- io_end = io->io_end;
- if (test_clear_buffer_uninit(bh))
- ext4_set_io_unwritten_flag(inode, io_end);
- io->io_end->size += bh->b_size;
- io->io_next_block++;
ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
+ io->io_next_block++;
return 0;
}
@@ -432,7 +473,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
do {
if (!buffer_async_write(bh))
continue;
- ret = io_submit_add_bh(io, inode, wbc, bh);
+ ret = io_submit_add_bh(io, inode, bh);
if (ret) {
/*
* We only get here on ENOMEM. Not much else
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b27c96d01965..c5adbb318a90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,
ext4_fsblk_t end = start + input->blocks_count;
ext4_group_t group = input->group;
ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
- unsigned overhead = ext4_group_overhead_blocks(sb, group);
- ext4_fsblk_t metaend = start + overhead;
+ unsigned overhead;
+ ext4_fsblk_t metaend;
struct buffer_head *bh = NULL;
ext4_grpblk_t free_blocks_count, offset;
int err = -EINVAL;
+ if (group != sbi->s_groups_count) {
+ ext4_warning(sb, "Cannot add at group %u (only %u groups)",
+ input->group, sbi->s_groups_count);
+ return -EINVAL;
+ }
+
+ overhead = ext4_group_overhead_blocks(sb, group);
+ metaend = start + overhead;
input->free_blocks_count = free_blocks_count =
input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
free_blocks_count, input->reserved_blocks);
ext4_get_group_no_and_offset(sb, start, NULL, &offset);
- if (group != sbi->s_groups_count)
- ext4_warning(sb, "Cannot add at group %u (only %u groups)",
- input->group, sbi->s_groups_count);
- else if (offset != 0)
+ if (offset != 0)
ext4_warning(sb, "Last group not full");
else if (input->reserved_blocks > input->blocks_count / 5)
ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
struct inode *inode = NULL;
- int gdb_off, gdb_num;
+ int gdb_off;
int err;
__u16 bg_flags = 0;
- gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1656,12 +1660,10 @@ errout:
err = err2;
if (!err) {
- ext4_fsblk_t first_block;
- first_block = ext4_group_first_block_no(sb, 0);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
"blocks\n", ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
+ update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
(char *)es, sizeof(struct ext4_super_block), 0);
}
return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94cc84db7c9a..85b3dd60169b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
static void ext4_clear_journal_err(struct super_block *sb,
struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
@@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)
}
if (test_opt(sb, ERRORS_RO)) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
sb->s_flags |= MS_RDONLY;
}
if (test_opt(sb, ERRORS_PANIC))
@@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,
ext4_handle_error(sb);
}
-void ext4_error_inode(struct inode *inode, const char *function,
- unsigned int line, ext4_fsblk_t block,
- const char *fmt, ...)
+void __ext4_error_inode(struct inode *inode, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
{
va_list args;
struct va_format vaf;
@@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function,
ext4_handle_error(inode->i_sb);
}
-void ext4_error_file(struct file *file, const char *function,
- unsigned int line, ext4_fsblk_t block,
- const char *fmt, ...)
+void __ext4_error_file(struct file *file, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
{
va_list args;
struct va_format vaf;
@@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
if ((sb->s_flags & MS_RDONLY) == 0) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
- sb->s_flags |= MS_RDONLY;
EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ /*
+ * Make sure updated value of ->s_mount_flags will be visible
+ * before ->s_flags update
+ */
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
if (EXT4_SB(sb)->s_journal)
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
save_error_info(sb, function, line);
@@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
panic("EXT4-fs panic from previous error\n");
}
-void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+void __ext4_msg(struct super_block *sb,
+ const char *prefix, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- flush_workqueue(sbi->dio_unwritten_wq);
- destroy_workqueue(sbi->dio_unwritten_wq);
+ flush_workqueue(sbi->unrsv_conversion_wq);
+ flush_workqueue(sbi->rsv_conversion_wq);
+ destroy_workqueue(sbi->unrsv_conversion_wq);
+ destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
@@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, "Couldn't clean up the journal");
}
- ext4_es_unregister_shrinker(sb);
+ ext4_es_unregister_shrinker(sbi);
del_timer(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
@@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
rwlock_init(&ei->i_es_lock);
INIT_LIST_HEAD(&ei->i_es_lru);
ei->i_es_lru_nr = 0;
+ ei->i_touch_when = 0;
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
@@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_reserved_quota = 0;
#endif
ei->jinode = NULL;
- INIT_LIST_HEAD(&ei->i_completed_io_list);
+ INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+ INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
- INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work);
+ INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
return &ei->vfs_inode;
}
@@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {
.dirty_inode = ext4_dirty_inode,
.drop_inode = ext4_drop_inode,
.evict_inode = ext4_evict_inode,
+ .sync_fs = ext4_sync_fs_nojournal,
.put_super = ext4_put_super,
.statfs = ext4_statfs,
.remount_fs = ext4_remount,
@@ -1908,7 +1926,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
ext4_group_t flex_group;
- unsigned int groups_per_flex = 0;
int i, err;
sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1916,7 +1933,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
sbi->s_log_groups_per_flex = 0;
return 1;
}
- groups_per_flex = 1U << sbi->s_log_groups_per_flex;
err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
if (err)
@@ -2164,19 +2180,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
dquot_initialize(inode);
if (inode->i_nlink) {
- ext4_msg(sb, KERN_DEBUG,
- "%s: truncating inode %lu to %lld bytes",
- __func__, inode->i_ino, inode->i_size);
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
+ __func__, inode->i_ino, inode->i_size);
jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
mutex_lock(&inode->i_mutex);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
mutex_unlock(&inode->i_mutex);
nr_truncates++;
} else {
- ext4_msg(sb, KERN_DEBUG,
- "%s: deleting unreferenced inode %lu",
- __func__, inode->i_ino);
+ if (test_opt(sb, DEBUG))
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
+ __func__, inode->i_ino);
jbd_debug(2, "deleting unreferenced inode %lu\n",
inode->i_ino);
nr_orphans++;
@@ -2377,7 +2396,10 @@ struct ext4_attr {
ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
const char *, size_t);
- int offset;
+ union {
+ int offset;
+ int deprecated_val;
+ } u;
};
static int parse_strtoull(const char *buf,
@@ -2446,7 +2468,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
static ssize_t sbi_ui_show(struct ext4_attr *a,
struct ext4_sb_info *sbi, char *buf)
{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
}
@@ -2455,7 +2477,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
unsigned long t;
int ret;
@@ -2504,12 +2526,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
return count;
}
+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
.show = _show, \
.store = _store, \
- .offset = offsetof(struct ext4_sb_info, _elname), \
+ .u = { \
+ .offset = offsetof(struct ext4_sb_info, _elname),\
+ }, \
}
#define EXT4_ATTR(name, mode, show, store) \
static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2520,6 +2550,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
#define EXT4_RW_ATTR_SBI_UI(name, elname) \
EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
#define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0444 }, \
+ .show = sbi_deprecated_show, \
+ .u = { \
+ .deprecated_val = _val, \
+ }, \
+}
EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes);
@@ -2534,7 +2572,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
@@ -3763,7 +3801,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.data = (unsigned long) sb;
/* Register extent status tree shrinker */
- ext4_es_register_shrinker(sb);
+ ext4_es_register_shrinker(sbi);
err = percpu_counter_init(&sbi->s_freeclusters_counter,
ext4_count_free_clusters(sb));
@@ -3787,7 +3825,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
- sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32;
/*
@@ -3915,12 +3952,20 @@ no_journal:
* The maximum number of concurrent works can be high and
* concurrency isn't really necessary. Limit it to 1.
*/
- EXT4_SB(sb)->dio_unwritten_wq =
- alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
- if (!EXT4_SB(sb)->dio_unwritten_wq) {
- printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ EXT4_SB(sb)->rsv_conversion_wq =
+ alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->rsv_conversion_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
ret = -ENOMEM;
- goto failed_mount_wq;
+ goto failed_mount4;
+ }
+
+ EXT4_SB(sb)->unrsv_conversion_wq =
+ alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->unrsv_conversion_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+ ret = -ENOMEM;
+ goto failed_mount4;
}
/*
@@ -4074,14 +4119,17 @@ failed_mount4a:
sb->s_root = NULL;
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
- destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+ if (EXT4_SB(sb)->rsv_conversion_wq)
+ destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+ if (EXT4_SB(sb)->unrsv_conversion_wq)
+ destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
failed_mount_wq:
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
failed_mount3:
- ext4_es_unregister_shrinker(sb);
+ ext4_es_unregister_shrinker(sbi);
del_timer(&sbi->s_err_report);
if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups);
@@ -4517,19 +4565,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
tid_t target;
+ bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
trace_ext4_sync_fs(sb, wait);
- flush_workqueue(sbi->dio_unwritten_wq);
+ flush_workqueue(sbi->rsv_conversion_wq);
+ flush_workqueue(sbi->unrsv_conversion_wq);
/*
* Writeback quota in non-journalled quota case - journalled quota has
* no dirty dquots
*/
dquot_writeback_dquots(sb, -1);
+ /*
+ * Data writeback is possible w/o journal transaction, so barrier must
+ * being sent at the end of the function. But we can skip it if
+ * transaction_commit will do it for us.
+ */
+ target = jbd2_get_latest_transaction(sbi->s_journal);
+ if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+ needs_barrier = true;
+
if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
- jbd2_log_wait_commit(sbi->s_journal, target);
+ ret = jbd2_log_wait_commit(sbi->s_journal, target);
+ }
+ if (needs_barrier) {
+ int err;
+ err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ if (!ret)
+ ret = err;
}
+
+ return ret;
+}
+
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
+{
+ int ret = 0;
+
+ trace_ext4_sync_fs(sb, wait);
+ flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+ flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
+ dquot_writeback_dquots(sb, -1);
+ if (wait && test_opt(sb, BARRIER))
+ ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
return ret;
}
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index fd27e7e6326e..e06e0995e00f 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -51,3 +51,15 @@ config F2FS_FS_POSIX_ACL
Linux website <http://acl.bestbits.at/>.
If you don't know what Access Control Lists are, say N
+
+config F2FS_FS_SECURITY
+ bool "F2FS Security Labels"
+ depends on F2FS_FS_XATTR
+ help
+ Security labels provide an access control facility to support Linux
+ Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
+ Linux. This option enables an extended attribute handler for file
+ security labels in the f2fs filesystem, so that it requires enabling
+ the extended attribute support in advance.
+
+ If you are not using a security module, say N.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 44abc2f286e0..b7826ec1b470 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -250,7 +250,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
}
}
- error = f2fs_setxattr(inode, name_index, "", value, size);
+ error = f2fs_setxattr(inode, name_index, "", value, size, NULL);
kfree(value);
if (!error)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b1de01da1a40..66a6b85a51d8 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -357,8 +357,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
unsigned long blk_size = sbi->blocksize;
struct f2fs_checkpoint *cp_block;
unsigned long long cur_version = 0, pre_version = 0;
- unsigned int crc = 0;
size_t crc_offset;
+ __u32 crc = 0;
/* Read the 1st cp block in this CP pack */
cp_page_1 = get_meta_page(sbi, cp_addr);
@@ -369,7 +369,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (crc_offset >= blk_size)
goto invalid_cp1;
- crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+ crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp1;
@@ -384,7 +384,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (crc_offset >= blk_size)
goto invalid_cp2;
- crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+ crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp2;
@@ -450,13 +450,30 @@ fail_no_cp:
return -EINVAL;
}
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct list_head *head = &sbi->dir_inode_list;
- struct dir_inode_entry *new;
struct list_head *this;
+ list_for_each(this, head) {
+ struct dir_inode_entry *entry;
+ entry = list_entry(this, struct dir_inode_entry, list);
+ if (entry->inode == inode)
+ return -EEXIST;
+ }
+ list_add_tail(&new->list, head);
+#ifdef CONFIG_F2FS_STAT_FS
+ sbi->n_dirty_dirs++;
+#endif
+ return 0;
+}
+
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct dir_inode_entry *new;
+
if (!S_ISDIR(inode->i_mode))
return;
retry:
@@ -469,23 +486,31 @@ retry:
INIT_LIST_HEAD(&new->list);
spin_lock(&sbi->dir_inode_lock);
- list_for_each(this, head) {
- struct dir_inode_entry *entry;
- entry = list_entry(this, struct dir_inode_entry, list);
- if (entry->inode == inode) {
- kmem_cache_free(inode_entry_slab, new);
- goto out;
- }
- }
- list_add_tail(&new->list, head);
- sbi->n_dirty_dirs++;
+ if (__add_dirty_inode(inode, new))
+ kmem_cache_free(inode_entry_slab, new);
- BUG_ON(!S_ISDIR(inode->i_mode));
-out:
inc_page_count(sbi, F2FS_DIRTY_DENTS);
inode_inc_dirty_dents(inode);
SetPagePrivate(page);
+ spin_unlock(&sbi->dir_inode_lock);
+}
+void add_dirty_dir_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct dir_inode_entry *new;
+retry:
+ new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+ if (!new) {
+ cond_resched();
+ goto retry;
+ }
+ new->inode = inode;
+ INIT_LIST_HEAD(&new->list);
+
+ spin_lock(&sbi->dir_inode_lock);
+ if (__add_dirty_inode(inode, new))
+ kmem_cache_free(inode_entry_slab, new);
spin_unlock(&sbi->dir_inode_lock);
}
@@ -499,8 +524,10 @@ void remove_dirty_dir_inode(struct inode *inode)
return;
spin_lock(&sbi->dir_inode_lock);
- if (atomic_read(&F2FS_I(inode)->dirty_dents))
- goto out;
+ if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
+ spin_unlock(&sbi->dir_inode_lock);
+ return;
+ }
list_for_each(this, head) {
struct dir_inode_entry *entry;
@@ -508,12 +535,38 @@ void remove_dirty_dir_inode(struct inode *inode)
if (entry->inode == inode) {
list_del(&entry->list);
kmem_cache_free(inode_entry_slab, entry);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->n_dirty_dirs--;
+#endif
+ break;
+ }
+ }
+ spin_unlock(&sbi->dir_inode_lock);
+
+ /* Only from the recovery routine */
+ if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
+ clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+ iput(inode);
+ }
+}
+
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct list_head *head = &sbi->dir_inode_list;
+ struct list_head *this;
+ struct inode *inode = NULL;
+
+ spin_lock(&sbi->dir_inode_lock);
+ list_for_each(this, head) {
+ struct dir_inode_entry *entry;
+ entry = list_entry(this, struct dir_inode_entry, list);
+ if (entry->inode->i_ino == ino) {
+ inode = entry->inode;
break;
}
}
-out:
spin_unlock(&sbi->dir_inode_lock);
+ return inode;
}
void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
@@ -595,7 +648,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
block_t start_blk;
struct page *cp_page;
unsigned int data_sum_blocks, orphan_blocks;
- unsigned int crc32 = 0;
+ __u32 crc32 = 0;
void *kaddr;
int i;
@@ -664,8 +717,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
- *(__le32 *)((unsigned char *)ckpt +
- le32_to_cpu(ckpt->checksum_offset))
+ *((__le32 *)((unsigned char *)ckpt +
+ le32_to_cpu(ckpt->checksum_offset)))
= cpu_to_le32(crc32);
start_blk = __start_cp_addr(sbi);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 91ff93b0b0f4..035f9a345cdf 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -68,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
struct buffer_head *bh_result)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
+#ifdef CONFIG_F2FS_STAT_FS
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+#endif
pgoff_t start_fofs, end_fofs;
block_t start_blkaddr;
@@ -78,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
return 0;
}
+#ifdef CONFIG_F2FS_STAT_FS
sbi->total_hit_ext++;
+#endif
start_fofs = fi->ext.fofs;
end_fofs = fi->ext.fofs + fi->ext.len - 1;
start_blkaddr = fi->ext.blk_addr;
@@ -96,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
else
bh_result->b_size = UINT_MAX;
+#ifdef CONFIG_F2FS_STAT_FS
sbi->read_hit_ext++;
+#endif
read_unlock(&fi->ext.ext_lock);
return 1;
}
@@ -199,7 +205,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
if (dn.data_blkaddr == NEW_ADDR)
return ERR_PTR(-EINVAL);
- page = grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -233,18 +239,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
struct page *page;
int err;
+repeat:
+ page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err)
+ if (err) {
+ f2fs_put_page(page, 1);
return ERR_PTR(err);
+ }
f2fs_put_dnode(&dn);
- if (dn.data_blkaddr == NULL_ADDR)
+ if (dn.data_blkaddr == NULL_ADDR) {
+ f2fs_put_page(page, 1);
return ERR_PTR(-ENOENT);
-repeat:
- page = grab_cache_page(mapping, index);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ }
if (PageUptodate(page))
return page;
@@ -274,9 +285,10 @@ repeat:
*
* Also, caller should grab and release a mutex by calling mutex_lock_op() and
* mutex_unlock_op().
+ * Note that, npage is set only by make_empty_dir.
*/
-struct page *get_new_data_page(struct inode *inode, pgoff_t index,
- bool new_i_size)
+struct page *get_new_data_page(struct inode *inode,
+ struct page *npage, pgoff_t index, bool new_i_size)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
@@ -284,18 +296,20 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
struct dnode_of_data dn;
int err;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
+ set_new_dnode(&dn, inode, npage, npage, 0);
err = get_dnode_of_data(&dn, index, ALLOC_NODE);
if (err)
return ERR_PTR(err);
if (dn.data_blkaddr == NULL_ADDR) {
if (reserve_new_block(&dn)) {
- f2fs_put_dnode(&dn);
+ if (!npage)
+ f2fs_put_dnode(&dn);
return ERR_PTR(-ENOSPC);
}
}
- f2fs_put_dnode(&dn);
+ if (!npage)
+ f2fs_put_dnode(&dn);
repeat:
page = grab_cache_page(mapping, index);
if (!page)
@@ -325,6 +339,8 @@ repeat:
if (new_i_size &&
i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+ /* Only the directory inode sets new_i_size */
+ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
mark_inode_dirty_sync(inode);
}
return page;
@@ -481,8 +497,9 @@ int do_write_data_page(struct page *page)
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
- need_inplace_update(inode)) {
+ if (unlikely(old_blk_addr != NEW_ADDR &&
+ !is_cold_data(page) &&
+ need_inplace_update(inode))) {
rewrite_data_page(F2FS_SB(inode->i_sb), page,
old_blk_addr);
} else {
@@ -684,6 +701,27 @@ err:
return err;
}
+static int f2fs_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = page->mapping->host;
+
+ SetPageUptodate(page);
+ set_page_dirty(page);
+
+ if (pos + copied > i_size_read(inode)) {
+ i_size_write(inode, pos + copied);
+ mark_inode_dirty(inode);
+ update_inode_page(inode);
+ }
+
+ unlock_page(page);
+ page_cache_release(page);
+ return copied;
+}
+
static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
@@ -698,7 +736,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
get_data_block_ro);
}
-static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -740,7 +779,7 @@ const struct address_space_operations f2fs_dblock_aops = {
.writepage = f2fs_write_data_page,
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
- .write_end = nobh_write_end,
+ .write_end = f2fs_write_end,
.set_page_dirty = f2fs_set_data_page_dirty,
.invalidatepage = f2fs_invalidate_data_page,
.releasepage = f2fs_release_data_page,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8d9943786c31..0d6c6aafb235 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -175,12 +175,12 @@ get_cache:
static int stat_show(struct seq_file *s, void *v)
{
- struct f2fs_stat_info *si, *next;
+ struct f2fs_stat_info *si;
int i = 0;
int j;
mutex_lock(&f2fs_stat_mutex);
- list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+ list_for_each_entry(si, &f2fs_stat_list, stat_list) {
char devname[BDEVNAME_SIZE];
update_general_status(si->sbi);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 600bb5efe603..9d1cd423450d 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -13,6 +13,7 @@
#include "f2fs.h"
#include "node.h"
#include "acl.h"
+#include "xattr.h"
static unsigned long dir_blocks(struct inode *inode)
{
@@ -215,9 +216,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
{
- struct page *page = NULL;
- struct f2fs_dir_entry *de = NULL;
- struct f2fs_dentry_block *dentry_blk = NULL;
+ struct page *page;
+ struct f2fs_dir_entry *de;
+ struct f2fs_dentry_block *dentry_blk;
page = get_lock_data_page(dir, 0);
if (IS_ERR(page))
@@ -264,15 +265,10 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
f2fs_put_page(page, 1);
}
-void init_dent_inode(const struct qstr *name, struct page *ipage)
+static void init_dent_inode(const struct qstr *name, struct page *ipage)
{
struct f2fs_node *rn;
- if (IS_ERR(ipage))
- return;
-
- wait_on_page_writeback(ipage);
-
/* copy name info. to this inode page */
rn = (struct f2fs_node *)page_address(ipage);
rn->i.i_namelen = cpu_to_le32(name->len);
@@ -280,14 +276,15 @@ void init_dent_inode(const struct qstr *name, struct page *ipage)
set_page_dirty(ipage);
}
-static int make_empty_dir(struct inode *inode, struct inode *parent)
+static int make_empty_dir(struct inode *inode,
+ struct inode *parent, struct page *page)
{
struct page *dentry_page;
struct f2fs_dentry_block *dentry_blk;
struct f2fs_dir_entry *de;
void *kaddr;
- dentry_page = get_new_data_page(inode, 0, true);
+ dentry_page = get_new_data_page(inode, page, 0, true);
if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
@@ -317,63 +314,76 @@ static int make_empty_dir(struct inode *inode, struct inode *parent)
return 0;
}
-static int init_inode_metadata(struct inode *inode,
+static struct page *init_inode_metadata(struct inode *inode,
struct inode *dir, const struct qstr *name)
{
+ struct page *page;
+ int err;
+
if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
- int err;
- err = new_inode_page(inode, name);
- if (err)
- return err;
+ page = new_inode_page(inode, name);
+ if (IS_ERR(page))
+ return page;
if (S_ISDIR(inode->i_mode)) {
- err = make_empty_dir(inode, dir);
- if (err) {
- remove_inode_page(inode);
- return err;
- }
+ err = make_empty_dir(inode, dir, page);
+ if (err)
+ goto error;
}
err = f2fs_init_acl(inode, dir);
- if (err) {
- remove_inode_page(inode);
- return err;
- }
+ if (err)
+ goto error;
+
+ err = f2fs_init_security(inode, dir, name, page);
+ if (err)
+ goto error;
+
+ wait_on_page_writeback(page);
} else {
- struct page *ipage;
- ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
- set_cold_node(inode, ipage);
- init_dent_inode(name, ipage);
- f2fs_put_page(ipage, 1);
+ page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+ if (IS_ERR(page))
+ return page;
+
+ wait_on_page_writeback(page);
+ set_cold_node(inode, page);
}
+
+ init_dent_inode(name, page);
+
+ /*
+ * This file should be checkpointed during fsync.
+ * We lost i_pino from now on.
+ */
if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+ file_lost_pino(inode);
inc_nlink(inode);
- update_inode_page(inode);
}
- return 0;
+ return page;
+
+error:
+ f2fs_put_page(page, 1);
+ remove_inode_page(inode);
+ return ERR_PTR(err);
}
static void update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth)
{
- bool need_dir_update = false;
-
if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
if (S_ISDIR(inode->i_mode)) {
inc_nlink(dir);
- need_dir_update = true;
+ set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
}
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
if (F2FS_I(dir)->i_current_depth != current_depth) {
F2FS_I(dir)->i_current_depth = current_depth;
- need_dir_update = true;
+ set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
- if (need_dir_update)
+ if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
update_inode_page(dir);
else
mark_inode_dirty(dir);
@@ -423,6 +433,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
struct page *dentry_page = NULL;
struct f2fs_dentry_block *dentry_blk = NULL;
int slots = GET_DENTRY_SLOTS(namelen);
+ struct page *page;
int err = 0;
int i;
@@ -448,7 +459,7 @@ start:
bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
for (block = bidx; block <= (bidx + nblock - 1); block++) {
- dentry_page = get_new_data_page(dir, block, true);
+ dentry_page = get_new_data_page(dir, NULL, block, true);
if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
@@ -465,12 +476,13 @@ start:
++level;
goto start;
add_dentry:
- err = init_inode_metadata(inode, dir, name);
- if (err)
- goto fail;
-
wait_on_page_writeback(dentry_page);
+ page = init_inode_metadata(inode, dir, name);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto fail;
+ }
de = &dentry_blk->dentry[bit_pos];
de->hash_code = dentry_hash;
de->name_len = cpu_to_le16(namelen);
@@ -481,11 +493,14 @@ add_dentry:
test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
set_page_dirty(dentry_page);
- update_parent_metadata(dir, inode, current_depth);
-
- /* update parent inode number before releasing dentry page */
+ /* we don't need to mark_inode_dirty now */
F2FS_I(inode)->i_pino = dir->i_ino;
+ update_inode(inode, page);
+ f2fs_put_page(page, 1);
+
+ update_parent_metadata(dir, inode, current_depth);
fail:
+ clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
return err;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 20aab02f2a42..467d42d65c48 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -37,21 +37,35 @@
typecheck(unsigned long long, b) && \
((long long)((a) - (b)) > 0))
-typedef u64 block_t;
+typedef u32 block_t; /*
+ * should not change u32, since it is the on-disk block
+ * address format, __le32.
+ */
typedef u32 nid_t;
struct f2fs_mount_info {
unsigned int opt;
};
-static inline __u32 f2fs_crc32(void *buff, size_t len)
+#define CRCPOLY_LE 0xedb88320
+
+static inline __u32 f2fs_crc32(void *buf, size_t len)
{
- return crc32_le(F2FS_SUPER_MAGIC, buff, len);
+ unsigned char *p = (unsigned char *)buf;
+ __u32 crc = F2FS_SUPER_MAGIC;
+ int i;
+
+ while (len--) {
+ crc ^= *p++;
+ for (i = 0; i < 8; i++)
+ crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
+ }
+ return crc;
}
-static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
+static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
{
- return f2fs_crc32(buff, buff_size) == blk_crc;
+ return f2fs_crc32(buf, buf_size) == blk_crc;
}
/*
@@ -148,7 +162,7 @@ struct extent_info {
* i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
*/
#define FADVISE_COLD_BIT 0x01
-#define FADVISE_CP_BIT 0x02
+#define FADVISE_LOST_PINO_BIT 0x02
struct f2fs_inode_info {
struct inode vfs_inode; /* serve a vfs inode */
@@ -369,7 +383,6 @@ struct f2fs_sb_info {
/* for directory inode management */
struct list_head dir_inode_list; /* dir inode list */
spinlock_t dir_inode_lock; /* for dir inode list lock */
- unsigned int n_dirty_dirs; /* # of dir inodes */
/* basic file system units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
@@ -406,12 +419,15 @@ struct f2fs_sb_info {
* for stat information.
* one is for the LFS mode, and the other is for the SSR mode.
*/
+#ifdef CONFIG_F2FS_STAT_FS
struct f2fs_stat_info *stat_info; /* FS status information */
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
- unsigned int last_victim[2]; /* last victim segment # */
int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
int bg_gc; /* background gc calls */
+ unsigned int n_dirty_dirs; /* # of dir inodes */
+#endif
+ unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
};
@@ -495,9 +511,17 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
{
- int i = 0;
- for (; i < NR_GLOBAL_LOCKS; i++)
- mutex_lock(&sbi->fs_lock[i]);
+ int i;
+
+ for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
+ /*
+ * This is the only time we take multiple fs_lock[]
+ * instances; the order is immaterial since we
+ * always hold cp_mutex, which serializes multiple
+ * such operations.
+ */
+ mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
+ }
}
static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
@@ -843,9 +867,12 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
/* used for f2fs_inode_info->flags */
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
+ FI_DIRTY_INODE, /* indicate inode is dirty or not */
FI_INC_LINK, /* need to increment i_nlink */
FI_ACL_MODE, /* indicate acl mode */
FI_NO_ALLOC, /* should not allocate any blocks */
+ FI_UPDATE_DIR, /* should update inode block for consistency */
+ FI_DELAY_IPUT, /* used for the recovery */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -878,14 +905,21 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
return 0;
}
+static inline int f2fs_readonly(struct super_block *sb)
+{
+ return sb->s_flags & MS_RDONLY;
+}
+
/*
* file.c
*/
int f2fs_sync_file(struct file *, loff_t, loff_t, int);
void truncate_data_blocks(struct dnode_of_data *);
void f2fs_truncate(struct inode *);
+int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int f2fs_setattr(struct dentry *, struct iattr *);
int truncate_hole(struct inode *, pgoff_t, pgoff_t);
+int truncate_data_blocks_range(struct dnode_of_data *, int);
long f2fs_ioctl(struct file *, unsigned int, unsigned long);
long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -913,7 +947,6 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
struct page *, struct inode *);
-void init_dent_inode(const struct qstr *, struct page *);
int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
int f2fs_make_empty(struct inode *, struct inode *);
@@ -948,8 +981,8 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
int remove_inode_page(struct inode *);
-int new_inode_page(struct inode *, const struct qstr *);
-struct page *new_node_page(struct dnode_of_data *, unsigned int);
+struct page *new_inode_page(struct inode *, const struct qstr *);
+struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
void ra_node_page(struct f2fs_sb_info *, nid_t);
struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_node_page_ra(struct page *, int);
@@ -974,7 +1007,6 @@ void destroy_node_manager_caches(void);
*/
void f2fs_balance_fs(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
-void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
void clear_prefree_segments(struct f2fs_sb_info *);
int npages_for_summary_flush(struct f2fs_sb_info *);
void allocate_new_segments(struct f2fs_sb_info *);
@@ -1011,7 +1043,9 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
int recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
void set_dirty_dir_page(struct inode *, struct page *);
+void add_dirty_dir_inode(struct inode *);
void remove_dirty_dir_inode(struct inode *);
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t);
void sync_dirty_dir_inodes(struct f2fs_sb_info *);
void write_checkpoint(struct f2fs_sb_info *, bool);
void init_orphan_info(struct f2fs_sb_info *);
@@ -1025,7 +1059,7 @@ int reserve_new_block(struct dnode_of_data *);
void update_extent_cache(block_t, struct dnode_of_data *);
struct page *find_data_page(struct inode *, pgoff_t, bool);
struct page *get_lock_data_page(struct inode *, pgoff_t);
-struct page *get_new_data_page(struct inode *, pgoff_t, bool);
+struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
int do_write_data_page(struct page *);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1cae864f8dfc..d2d2b7dbdcc1 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -63,9 +63,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
f2fs_put_dnode(&dn);
mutex_unlock_op(sbi, ilock);
+ file_update_time(vma->vm_file);
lock_page(page);
if (page->mapping != inode->i_mapping ||
- page_offset(page) >= i_size_read(inode) ||
+ page_offset(page) > i_size_read(inode) ||
!PageUptodate(page)) {
unlock_page(page);
err = -EFAULT;
@@ -76,10 +77,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
* check to see if the page is mapped already (no holes)
*/
if (PageMappedToDisk(page))
- goto out;
-
- /* fill the page */
- wait_on_page_writeback(page);
+ goto mapped;
/* page is wholly or partially inside EOF */
if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
@@ -90,7 +88,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
set_page_dirty(page);
SetPageUptodate(page);
- file_update_time(vma->vm_file);
+mapped:
+ /* fill the page */
+ wait_on_page_writeback(page);
out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(err);
@@ -102,6 +102,24 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
.remap_pages = generic_file_remap_pages,
};
+static int get_parent_ino(struct inode *inode, nid_t *pino)
+{
+ struct dentry *dentry;
+
+ inode = igrab(inode);
+ dentry = d_find_any_alias(inode);
+ iput(inode);
+ if (!dentry)
+ return 0;
+
+ inode = igrab(dentry->d_parent->d_inode);
+ dput(dentry);
+
+ *pino = inode->i_ino;
+ iput(inode);
+ return 1;
+}
+
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
@@ -114,7 +132,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
.for_reclaim = 0,
};
- if (inode->i_sb->s_flags & MS_RDONLY)
+ if (f2fs_readonly(inode->i_sb))
return 0;
trace_f2fs_sync_file_enter(inode);
@@ -134,7 +152,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
need_cp = true;
- else if (is_cp_file(inode))
+ else if (file_wrong_pino(inode))
need_cp = true;
else if (!space_for_roll_forward(sbi))
need_cp = true;
@@ -142,11 +160,23 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
need_cp = true;
if (need_cp) {
+ nid_t pino;
+
/* all the dirty node pages should be flushed for POR */
ret = f2fs_sync_fs(inode->i_sb, 1);
+ if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
+ get_parent_ino(inode, &pino)) {
+ F2FS_I(inode)->i_pino = pino;
+ file_got_pino(inode);
+ mark_inode_dirty_sync(inode);
+ ret = f2fs_write_inode(inode, NULL);
+ if (ret)
+ goto out;
+ }
} else {
/* if there is no written node page, write its inode page */
while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+ mark_inode_dirty_sync(inode);
ret = f2fs_write_inode(inode, NULL);
if (ret)
goto out;
@@ -168,7 +198,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
int nr_free = 0, ofs = dn->ofs_in_node;
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
@@ -185,10 +215,10 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
update_extent_cache(NULL_ADDR, dn);
invalidate_blocks(sbi, blkaddr);
- dec_valid_block_count(sbi, dn->inode, 1);
nr_free++;
}
if (nr_free) {
+ dec_valid_block_count(sbi, dn->inode, nr_free);
set_page_dirty(dn->node_page);
sync_inode_page(dn);
}
@@ -291,7 +321,7 @@ void f2fs_truncate(struct inode *inode)
}
}
-static int f2fs_getattr(struct vfsmount *mnt,
+int f2fs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
@@ -387,7 +417,7 @@ static void fill_zero(struct inode *inode, pgoff_t index,
f2fs_balance_fs(sbi);
ilock = mutex_lock_op(sbi);
- page = get_new_data_page(inode, index, false);
+ page = get_new_data_page(inode, NULL, index, false);
mutex_unlock_op(sbi, ilock);
if (!IS_ERR(page)) {
@@ -575,10 +605,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
int ret;
switch (cmd) {
- case FS_IOC_GETFLAGS:
+ case F2FS_IOC_GETFLAGS:
flags = fi->i_flags & FS_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
- case FS_IOC_SETFLAGS:
+ case F2FS_IOC_SETFLAGS:
{
unsigned int oldflags;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 14961593e93c..35f9b1a196aa 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -76,7 +76,9 @@ static int gc_thread_func(void *data)
else
wait_ms = increase_sleep_time(wait_ms);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->bg_gc++;
+#endif
/* if return value is not zero, no victim was selected */
if (f2fs_gc(sbi))
@@ -89,23 +91,28 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th;
dev_t dev = sbi->sb->s_bdev->bd_dev;
+ int err = 0;
if (!test_opt(sbi, BG_GC))
- return 0;
+ goto out;
gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
- if (!gc_th)
- return -ENOMEM;
+ if (!gc_th) {
+ err = -ENOMEM;
+ goto out;
+ }
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
+ err = PTR_ERR(gc_th->f2fs_gc_task);
kfree(gc_th);
sbi->gc_thread = NULL;
- return -ENOMEM;
}
- return 0;
+
+out:
+ return err;
}
void stop_gc_thread(struct f2fs_sb_info *sbi)
@@ -234,14 +241,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
- unsigned int secno;
+ unsigned int secno, max_cost;
int nsearched = 0;
p.alloc_mode = alloc_mode;
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
- p.min_cost = get_max_cost(sbi, &p);
+ p.min_cost = max_cost = get_max_cost(sbi, &p);
mutex_lock(&dirty_i->seglist_lock);
@@ -280,7 +287,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_cost = cost;
}
- if (cost == get_max_cost(sbi, &p))
+ if (cost == max_cost)
continue;
if (nsearched++ >= MAX_VICTIM_SEARCH) {
@@ -288,8 +295,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
break;
}
}
-got_it:
if (p.min_segno != NULL_SEGNO) {
+got_it:
if (p.alloc_mode == LFS) {
secno = GET_SECNO(sbi, p.min_segno);
if (gc_type == FG_GC)
@@ -314,28 +321,21 @@ static const struct victim_selection default_v_ops = {
static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
{
- struct list_head *this;
struct inode_entry *ie;
- list_for_each(this, ilist) {
- ie = list_entry(this, struct inode_entry, list);
+ list_for_each_entry(ie, ilist, list)
if (ie->inode->i_ino == ino)
return ie->inode;
- }
return NULL;
}
static void add_gc_inode(struct inode *inode, struct list_head *ilist)
{
- struct list_head *this;
- struct inode_entry *new_ie, *ie;
+ struct inode_entry *new_ie;
- list_for_each(this, ilist) {
- ie = list_entry(this, struct inode_entry, list);
- if (ie->inode == inode) {
- iput(inode);
- return;
- }
+ if (inode == find_gc_inode(inode->i_ino, ilist)) {
+ iput(inode);
+ return;
}
repeat:
new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 91ac7f9d88ee..2b2d45d19e3e 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -109,12 +109,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
ret = do_read_inode(inode);
if (ret)
goto bad_inode;
-
- if (!sbi->por_doing && inode->i_nlink == 0) {
- ret = -ENOENT;
- goto bad_inode;
- }
-
make_now:
if (ino == F2FS_NODE_INO(sbi)) {
inode->i_mapping->a_ops = &f2fs_node_aops;
@@ -130,8 +124,7 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE |
- __GFP_ZERO);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &f2fs_symlink_inode_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -199,6 +192,7 @@ void update_inode(struct inode *inode, struct page *node_page)
set_cold_node(inode, node_page);
set_page_dirty(node_page);
+ clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
}
int update_inode_page(struct inode *inode)
@@ -224,6 +218,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_ino == F2FS_META_INO(sbi))
return 0;
+ if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
+ return 0;
+
if (wbc)
f2fs_balance_fs(sbi);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 47abc9722b17..64c07169df05 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -112,7 +112,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
int count = le32_to_cpu(sbi->raw_super->extension_count);
for (i = 0; i < count; i++) {
if (is_multimedia_file(name, extlist[i])) {
- set_cold_file(inode);
+ file_set_cold(inode);
break;
}
}
@@ -149,8 +149,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
alloc_nid_done(sbi, ino);
- if (!sbi->por_doing)
- d_instantiate(dentry, inode);
+ d_instantiate(dentry, inode);
unlock_new_inode(inode);
return 0;
out:
@@ -173,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
f2fs_balance_fs(sbi);
inode->i_ctime = CURRENT_TIME;
- atomic_inc(&inode->i_count);
+ ihold(inode);
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
ilock = mutex_lock_op(sbi);
@@ -182,17 +181,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
goto out;
- /*
- * This file should be checkpointed during fsync.
- * We lost i_pino from now on.
- */
- set_cp_file(inode);
-
d_instantiate(dentry, inode);
return 0;
out:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
- make_bad_inode(inode);
iput(inode);
return err;
}
@@ -498,6 +490,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
.rmdir = f2fs_rmdir,
.mknod = f2fs_mknod,
.rename = f2fs_rename,
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
#ifdef CONFIG_F2FS_FS_XATTR
@@ -512,6 +505,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
#ifdef CONFIG_F2FS_FS_XATTR
.setxattr = generic_setxattr,
@@ -522,6 +516,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
};
const struct inode_operations f2fs_special_inode_operations = {
+ .getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
#ifdef CONFIG_F2FS_FS_XATTR
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3df43b4efd89..b418aee09573 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -408,10 +408,13 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
level = get_node_path(index, offset, noffset);
nids[0] = dn->inode->i_ino;
- npage[0] = get_node_page(sbi, nids[0]);
- if (IS_ERR(npage[0]))
- return PTR_ERR(npage[0]);
+ npage[0] = dn->inode_page;
+ if (!npage[0]) {
+ npage[0] = get_node_page(sbi, nids[0]);
+ if (IS_ERR(npage[0]))
+ return PTR_ERR(npage[0]);
+ }
parent = npage[0];
if (level != 0)
nids[1] = get_nid(parent, offset[0], true);
@@ -430,7 +433,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
}
dn->nid = nids[i];
- npage[i] = new_node_page(dn, noffset[i]);
+ npage[i] = new_node_page(dn, noffset[i], NULL);
if (IS_ERR(npage[i])) {
alloc_nid_failed(sbi, nids[i]);
err = PTR_ERR(npage[i]);
@@ -803,22 +806,19 @@ int remove_inode_page(struct inode *inode)
return 0;
}
-int new_inode_page(struct inode *inode, const struct qstr *name)
+struct page *new_inode_page(struct inode *inode, const struct qstr *name)
{
- struct page *page;
struct dnode_of_data dn;
/* allocate inode page for new inode */
set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
- page = new_node_page(&dn, 0);
- init_dent_inode(name, page);
- if (IS_ERR(page))
- return PTR_ERR(page);
- f2fs_put_page(page, 1);
- return 0;
+
+ /* caller should f2fs_put_page(page, 1); */
+ return new_node_page(&dn, 0, NULL);
}
-struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
+struct page *new_node_page(struct dnode_of_data *dn,
+ unsigned int ofs, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct address_space *mapping = sbi->node_inode->i_mapping;
@@ -851,7 +851,10 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
set_cold_node(dn->inode, page);
dn->node_page = page;
- sync_inode_page(dn);
+ if (ipage)
+ update_inode(dn->inode, ipage);
+ else
+ sync_inode_page(dn);
set_page_dirty(page);
if (ofs == 0)
inc_valid_inode_count(sbi);
@@ -1205,7 +1208,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
return 0;
}
-static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -1492,9 +1496,10 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
new_ni = old_ni;
new_ni.ino = ino;
+ if (!inc_valid_node_count(sbi, NULL, 1))
+ WARN_ON(1);
set_node_addr(sbi, &new_ni, NEW_ADDR);
inc_valid_inode_count(sbi);
-
f2fs_put_page(ipage, 1);
return 0;
}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 0a2d72f0024d..c65fb4f4230f 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -275,25 +275,27 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
* - Mark cold node blocks in their node footer
* - Mark cold data pages in page cache
*/
-static inline int is_cold_file(struct inode *inode)
+static inline int is_file(struct inode *inode, int type)
{
- return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
+ return F2FS_I(inode)->i_advise & type;
}
-static inline void set_cold_file(struct inode *inode)
+static inline void set_file(struct inode *inode, int type)
{
- F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+ F2FS_I(inode)->i_advise |= type;
}
-static inline int is_cp_file(struct inode *inode)
+static inline void clear_file(struct inode *inode, int type)
{
- return F2FS_I(inode)->i_advise & FADVISE_CP_BIT;
+ F2FS_I(inode)->i_advise &= ~type;
}
-static inline void set_cp_file(struct inode *inode)
-{
- F2FS_I(inode)->i_advise |= FADVISE_CP_BIT;
-}
+#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
+#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
+#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
+#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
static inline int is_cold_data(struct page *page)
{
@@ -310,29 +312,16 @@ static inline void clear_cold_data(struct page *page)
ClearPageChecked(page);
}
-static inline int is_cold_node(struct page *page)
+static inline int is_node(struct page *page, int type)
{
void *kaddr = page_address(page);
struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << COLD_BIT_SHIFT);
+ return le32_to_cpu(rn->footer.flag) & (1 << type);
}
-static inline unsigned char is_fsync_dnode(struct page *page)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << FSYNC_BIT_SHIFT);
-}
-
-static inline unsigned char is_dent_dnode(struct page *page)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- return flag & (0x1 << DENT_BIT_SHIFT);
-}
+#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT)
+#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
+#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
static inline void set_cold_node(struct inode *inode, struct page *page)
{
@@ -346,26 +335,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page)
rn->footer.flag = cpu_to_le32(flag);
}
-static inline void set_fsync_mark(struct page *page, int mark)
+static inline void set_mark(struct page *page, int mark, int type)
{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
- unsigned int flag = le32_to_cpu(rn->footer.flag);
- if (mark)
- flag |= (0x1 << FSYNC_BIT_SHIFT);
- else
- flag &= ~(0x1 << FSYNC_BIT_SHIFT);
- rn->footer.flag = cpu_to_le32(flag);
-}
-
-static inline void set_dentry_mark(struct page *page, int mark)
-{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
unsigned int flag = le32_to_cpu(rn->footer.flag);
if (mark)
- flag |= (0x1 << DENT_BIT_SHIFT);
+ flag |= (0x1 << type);
else
- flag &= ~(0x1 << DENT_BIT_SHIFT);
+ flag &= ~(0x1 << type);
rn->footer.flag = cpu_to_le32(flag);
}
+#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT)
+#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 60c8a5097058..d56d951c2253 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,36 +40,54 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
static int recover_dentry(struct page *ipage, struct inode *inode)
{
- struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
+ void *kaddr = page_address(ipage);
+ struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
struct f2fs_inode *raw_inode = &(raw_node->i);
- struct qstr name;
+ nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
+ struct qstr name;
struct page *page;
- struct inode *dir;
+ struct inode *dir, *einode;
int err = 0;
- if (!is_dent_dnode(ipage))
- goto out;
-
- dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
- if (IS_ERR(dir)) {
- err = PTR_ERR(dir);
- goto out;
+ dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino);
+ if (!dir) {
+ dir = f2fs_iget(inode->i_sb, pino);
+ if (IS_ERR(dir)) {
+ err = PTR_ERR(dir);
+ goto out;
+ }
+ set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
+ add_dirty_dir_inode(dir);
}
name.len = le32_to_cpu(raw_inode->i_namelen);
name.name = raw_inode->i_name;
-
+retry:
de = f2fs_find_entry(dir, &name, &page);
- if (de) {
+ if (de && inode->i_ino == le32_to_cpu(de->ino)) {
kunmap(page);
f2fs_put_page(page, 0);
- } else {
- err = __f2fs_add_link(dir, &name, inode);
+ goto out;
+ }
+ if (de) {
+ einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
+ if (IS_ERR(einode)) {
+ WARN_ON(1);
+ if (PTR_ERR(einode) == -ENOENT)
+ err = -EEXIST;
+ goto out;
+ }
+ f2fs_delete_entry(de, page, einode);
+ iput(einode);
+ goto retry;
}
- iput(dir);
+ err = __f2fs_add_link(dir, &name, inode);
out:
- kunmap(ipage);
+ f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: "
+ "ino = %x, name = %s, dir = %lx, err = %d",
+ ino_of_node(ipage), raw_inode->i_name,
+ IS_ERR(dir) ? 0 : dir->i_ino, err);
return err;
}
@@ -79,6 +97,9 @@ static int recover_inode(struct inode *inode, struct page *node_page)
struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
struct f2fs_inode *raw_inode = &(raw_node->i);
+ if (!IS_INODE(node_page))
+ return 0;
+
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
i_size_write(inode, le64_to_cpu(raw_inode->i_size));
inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
@@ -88,7 +109,12 @@ static int recover_inode(struct inode *inode, struct page *node_page)
inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- return recover_dentry(node_page, inode);
+ if (is_dent_dnode(node_page))
+ return recover_dentry(node_page, inode);
+
+ f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
+ ino_of_node(node_page), raw_inode->i_name);
+ return 0;
}
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
@@ -119,14 +145,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
lock_page(page);
if (cp_ver != cpver_of_node(page))
- goto unlock_out;
+ break;
if (!is_fsync_dnode(page))
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
if (entry) {
- entry->blkaddr = blkaddr;
if (IS_INODE(page) && is_dent_dnode(page))
set_inode_flag(F2FS_I(entry->inode),
FI_INC_LINK);
@@ -134,48 +159,40 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
if (IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
- goto unlock_out;
+ break;
}
/* add this fsync inode to the list */
entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
if (!entry) {
err = -ENOMEM;
- goto unlock_out;
+ break;
}
entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
if (IS_ERR(entry->inode)) {
err = PTR_ERR(entry->inode);
kmem_cache_free(fsync_entry_slab, entry);
- goto unlock_out;
+ break;
}
-
list_add_tail(&entry->list, head);
- entry->blkaddr = blkaddr;
- }
- if (IS_INODE(page)) {
- err = recover_inode(entry->inode, page);
- if (err == -ENOENT) {
- goto next;
- } else if (err) {
- err = -EINVAL;
- goto unlock_out;
- }
}
+ entry->blkaddr = blkaddr;
+
+ err = recover_inode(entry->inode, page);
+ if (err && err != -ENOENT)
+ break;
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
}
-unlock_out:
unlock_page(page);
out:
__free_pages(page, 0);
return err;
}
-static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
- struct list_head *head)
+static void destroy_fsync_dnodes(struct list_head *head)
{
struct fsync_inode_entry *entry, *tmp;
@@ -186,15 +203,15 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
}
}
-static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
- block_t blkaddr)
+static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
+ block_t blkaddr, struct dnode_of_data *dn)
{
struct seg_entry *sentry;
unsigned int segno = GET_SEGNO(sbi, blkaddr);
unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
(sbi->blocks_per_seg - 1);
struct f2fs_summary sum;
- nid_t ino;
+ nid_t ino, nid;
void *kaddr;
struct inode *inode;
struct page *node_page;
@@ -203,7 +220,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
sentry = get_seg_entry(sbi, segno);
if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
- return;
+ return 0;
/* Get the previous summary */
for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
@@ -222,20 +239,39 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
f2fs_put_page(sum_page, 1);
}
+ /* Use the locked dnode page and inode */
+ nid = le32_to_cpu(sum.nid);
+ if (dn->inode->i_ino == nid) {
+ struct dnode_of_data tdn = *dn;
+ tdn.nid = nid;
+ tdn.node_page = dn->inode_page;
+ tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ truncate_data_blocks_range(&tdn, 1);
+ return 0;
+ } else if (dn->nid == nid) {
+ struct dnode_of_data tdn = *dn;
+ tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ truncate_data_blocks_range(&tdn, 1);
+ return 0;
+ }
+
/* Get the node page */
- node_page = get_node_page(sbi, le32_to_cpu(sum.nid));
+ node_page = get_node_page(sbi, nid);
+ if (IS_ERR(node_page))
+ return PTR_ERR(node_page);
bidx = start_bidx_of_node(ofs_of_node(node_page)) +
- le16_to_cpu(sum.ofs_in_node);
+ le16_to_cpu(sum.ofs_in_node);
ino = ino_of_node(node_page);
f2fs_put_page(node_page, 1);
/* Deallocate previous index in the node page */
inode = f2fs_iget(sbi->sb, ino);
if (IS_ERR(inode))
- return;
+ return PTR_ERR(inode);
truncate_hole(inode, bidx, bidx + 1);
iput(inode);
+ return 0;
}
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
@@ -245,7 +281,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct dnode_of_data dn;
struct f2fs_summary sum;
struct node_info ni;
- int err = 0;
+ int err = 0, recovered = 0;
int ilock;
start = start_bidx_of_node(ofs_of_node(page));
@@ -283,13 +319,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
}
/* Check the previous node page having this index */
- check_index_in_prev_nodes(sbi, dest);
+ err = check_index_in_prev_nodes(sbi, dest, &dn);
+ if (err)
+ goto err;
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
/* write dummy data page */
recover_data_page(sbi, NULL, &sum, src, dest);
update_extent_cache(dest, &dn);
+ recovered++;
}
dn.ofs_in_node++;
}
@@ -305,9 +344,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
set_page_dirty(dn.node_page);
recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
+err:
f2fs_put_dnode(&dn);
mutex_unlock_op(sbi, ilock);
- return 0;
+
+ f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
+ "recovered_data = %d blocks, err = %d",
+ inode->i_ino, recovered, err);
+ return err;
}
static int recover_data(struct f2fs_sb_info *sbi,
@@ -340,7 +384,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
lock_page(page);
if (cp_ver != cpver_of_node(page))
- goto unlock_out;
+ break;
entry = get_fsync_inode(head, ino_of_node(page));
if (!entry)
@@ -348,7 +392,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
err = do_recover_data(sbi, entry->inode, page, blkaddr);
if (err)
- goto out;
+ break;
if (entry->blkaddr == blkaddr) {
iput(entry->inode);
@@ -359,7 +403,6 @@ next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
}
-unlock_out:
unlock_page(page);
out:
__free_pages(page, 0);
@@ -382,6 +425,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&inode_list);
/* step #1: find fsynced inode numbers */
+ sbi->por_doing = 1;
err = find_fsync_dnodes(sbi, &inode_list);
if (err)
goto out;
@@ -390,13 +434,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
goto out;
/* step #2: recover data */
- sbi->por_doing = 1;
err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
- sbi->por_doing = 0;
BUG_ON(!list_empty(&inode_list));
out:
- destroy_fsync_dnodes(sbi, &inode_list);
+ destroy_fsync_dnodes(&inode_list);
kmem_cache_destroy(fsync_entry_slab);
- write_checkpoint(sbi, false);
+ sbi->por_doing = 0;
+ if (!err)
+ write_checkpoint(sbi, false);
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d8e84e49a5c3..a86d125a9885 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -94,7 +94,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
* Adding dirty entry into seglist is not critical operation.
* If a given segment is one of current working segments, it won't be added.
*/
-void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
+static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned short valid_blocks;
@@ -126,17 +126,16 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int segno, offset = 0;
+ unsigned int segno = -1;
unsigned int total_segs = TOTAL_SEGS(sbi);
mutex_lock(&dirty_i->seglist_lock);
while (1) {
segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
- offset);
+ segno + 1);
if (segno >= total_segs)
break;
__set_test_and_free(sbi, segno);
- offset = segno + 1;
}
mutex_unlock(&dirty_i->seglist_lock);
}
@@ -144,17 +143,16 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
void clear_prefree_segments(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int segno, offset = 0;
+ unsigned int segno = -1;
unsigned int total_segs = TOTAL_SEGS(sbi);
mutex_lock(&dirty_i->seglist_lock);
while (1) {
segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
- offset);
+ segno + 1);
if (segno >= total_segs)
break;
- offset = segno + 1;
if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
dirty_i->nr_dirty[PRE]--;
@@ -257,11 +255,11 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
* This function should be resided under the curseg_mutex lock
*/
static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
- struct f2fs_summary *sum, unsigned short offset)
+ struct f2fs_summary *sum)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
void *addr = curseg->sum_blk;
- addr += offset * sizeof(struct f2fs_summary);
+ addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
memcpy(addr, sum, sizeof(struct f2fs_summary));
return;
}
@@ -311,64 +309,14 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
f2fs_put_page(page, 1);
}
-static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type)
-{
- struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
- unsigned int segno;
- unsigned int ofs = 0;
-
- /*
- * If there is not enough reserved sections,
- * we should not reuse prefree segments.
- */
- if (has_not_enough_free_secs(sbi, 0))
- return NULL_SEGNO;
-
- /*
- * NODE page should not reuse prefree segment,
- * since those information is used for SPOR.
- */
- if (IS_NODESEG(type))
- return NULL_SEGNO;
-next:
- segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs);
- ofs += sbi->segs_per_sec;
-
- if (segno < TOTAL_SEGS(sbi)) {
- int i;
-
- /* skip intermediate segments in a section */
- if (segno % sbi->segs_per_sec)
- goto next;
-
- /* skip if the section is currently used */
- if (sec_usage_check(sbi, GET_SECNO(sbi, segno)))
- goto next;
-
- /* skip if whole section is not prefree */
- for (i = 1; i < sbi->segs_per_sec; i++)
- if (!test_bit(segno + i, prefree_segmap))
- goto next;
-
- /* skip if whole section was not free at the last checkpoint */
- for (i = 0; i < sbi->segs_per_sec; i++)
- if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks)
- goto next;
-
- return segno;
- }
- return NULL_SEGNO;
-}
-
static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- unsigned int segno = curseg->segno;
+ unsigned int segno = curseg->segno + 1;
struct free_segmap_info *free_i = FREE_I(sbi);
- if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec)
- return !test_bit(segno + 1, free_i->free_segmap);
+ if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
+ return !test_bit(segno, free_i->free_segmap);
return 0;
}
@@ -495,7 +443,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
int dir = ALLOC_LEFT;
write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, curseg->segno));
+ GET_SUM_BLOCK(sbi, segno));
if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
dir = ALLOC_RIGHT;
@@ -599,11 +547,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
goto out;
}
- curseg->next_segno = check_prefree_segments(sbi, type);
-
- if (curseg->next_segno != NULL_SEGNO)
- change_curseg(sbi, type, false);
- else if (type == CURSEG_WARM_NODE)
+ if (type == CURSEG_WARM_NODE)
new_curseg(sbi, type, false);
else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
new_curseg(sbi, type, false);
@@ -612,7 +556,10 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
else
new_curseg(sbi, type, false);
out:
+#ifdef CONFIG_F2FS_STAT_FS
sbi->segment_count[curseg->alloc_type]++;
+#endif
+ return;
}
void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -795,7 +742,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
if (S_ISDIR(inode->i_mode))
return CURSEG_HOT_DATA;
- else if (is_cold_data(page) || is_cold_file(inode))
+ else if (is_cold_data(page) || file_is_cold(inode))
return CURSEG_COLD_DATA;
else
return CURSEG_WARM_DATA;
@@ -844,11 +791,13 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
* because, this function updates a summary entry in the
* current summary block.
*/
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
mutex_lock(&sit_i->sentry_lock);
__refresh_next_blkoff(sbi, curseg);
+#ifdef CONFIG_F2FS_STAT_FS
sbi->block_count[curseg->alloc_type]++;
+#endif
/*
* SIT information should be updated before segment allocation,
@@ -943,7 +892,7 @@ void recover_data_page(struct f2fs_sb_info *sbi,
curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
(sbi->blocks_per_seg - 1);
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
@@ -980,7 +929,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
}
curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
(sbi->blocks_per_seg - 1);
- __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+ __add_sum_entry(sbi, type, sum);
/* change the current log to the next block addr in advance */
if (next_segno != segno) {
@@ -1579,13 +1528,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int segno = 0, offset = 0;
+ unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
unsigned short valid_blocks;
- while (segno < TOTAL_SEGS(sbi)) {
+ while (1) {
/* find dirty segment based on free segmap */
- segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset);
- if (segno >= TOTAL_SEGS(sbi))
+ segno = find_next_inuse(free_i, total_segs, offset);
+ if (segno >= total_segs)
break;
offset = segno + 1;
valid_blocks = get_valid_blocks(sbi, segno, 0);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8555f7df82c7..75c7dc363e92 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -34,7 +34,7 @@
static struct kmem_cache *f2fs_inode_cachep;
enum {
- Opt_gc_background_off,
+ Opt_gc_background,
Opt_disable_roll_forward,
Opt_discard,
Opt_noheap,
@@ -46,7 +46,7 @@ enum {
};
static match_table_t f2fs_tokens = {
- {Opt_gc_background_off, "background_gc_off"},
+ {Opt_gc_background, "background_gc=%s"},
{Opt_disable_roll_forward, "disable_roll_forward"},
{Opt_discard, "discard"},
{Opt_noheap, "no_heap"},
@@ -76,6 +76,91 @@ static void init_once(void *foo)
inode_init_once(&fi->vfs_inode);
}
+static int parse_options(struct super_block *sb, char *options)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ substring_t args[MAX_OPT_ARGS];
+ char *p, *name;
+ int arg = 0;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, f2fs_tokens, args);
+
+ switch (token) {
+ case Opt_gc_background:
+ name = match_strdup(&args[0]);
+
+ if (!name)
+ return -ENOMEM;
+ if (!strncmp(name, "on", 2))
+ set_opt(sbi, BG_GC);
+ else if (!strncmp(name, "off", 3))
+ clear_opt(sbi, BG_GC);
+ else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
+ case Opt_disable_roll_forward:
+ set_opt(sbi, DISABLE_ROLL_FORWARD);
+ break;
+ case Opt_discard:
+ set_opt(sbi, DISCARD);
+ break;
+ case Opt_noheap:
+ set_opt(sbi, NOHEAP);
+ break;
+#ifdef CONFIG_F2FS_FS_XATTR
+ case Opt_nouser_xattr:
+ clear_opt(sbi, XATTR_USER);
+ break;
+#else
+ case Opt_nouser_xattr:
+ f2fs_msg(sb, KERN_INFO,
+ "nouser_xattr options not supported");
+ break;
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ case Opt_noacl:
+ clear_opt(sbi, POSIX_ACL);
+ break;
+#else
+ case Opt_noacl:
+ f2fs_msg(sb, KERN_INFO, "noacl options not supported");
+ break;
+#endif
+ case Opt_active_logs:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
+ return -EINVAL;
+ sbi->active_logs = arg;
+ break;
+ case Opt_disable_ext_identify:
+ set_opt(sbi, DISABLE_EXT_IDENTIFY);
+ break;
+ default:
+ f2fs_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" or missing value",
+ p);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
@@ -112,6 +197,17 @@ static int f2fs_drop_inode(struct inode *inode)
return generic_drop_inode(inode);
}
+/*
+ * f2fs_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * We should call set_dirty_inode to write the dirty inode through write_inode.
+ */
+static void f2fs_dirty_inode(struct inode *inode, int flags)
+{
+ set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+ return;
+}
+
static void f2fs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -170,7 +266,7 @@ static int f2fs_freeze(struct super_block *sb)
{
int err;
- if (sb->s_flags & MS_RDONLY)
+ if (f2fs_readonly(sb))
return 0;
err = f2fs_sync_fs(sb, 1);
@@ -214,10 +310,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (test_opt(sbi, BG_GC))
- seq_puts(seq, ",background_gc_on");
+ if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC))
+ seq_printf(seq, ",background_gc=%s", "on");
else
- seq_puts(seq, ",background_gc_off");
+ seq_printf(seq, ",background_gc=%s", "off");
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
if (test_opt(sbi, DISCARD))
@@ -244,11 +340,64 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+static int f2fs_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct f2fs_mount_info org_mount_opt;
+ int err, active_logs;
+
+ /*
+ * Save the old mount options in case we
+ * need to restore them.
+ */
+ org_mount_opt = sbi->mount_opt;
+ active_logs = sbi->active_logs;
+
+ /* parse mount options */
+ err = parse_options(sb, data);
+ if (err)
+ goto restore_opts;
+
+ /*
+ * Previous and new state of filesystem is RO,
+ * so no point in checking GC conditions.
+ */
+ if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
+ goto skip;
+
+ /*
+ * We stop the GC thread if FS is mounted as RO
+ * or if background_gc = off is passed in mount
+ * option. Also sync the filesystem.
+ */
+ if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
+ if (sbi->gc_thread) {
+ stop_gc_thread(sbi);
+ f2fs_sync_fs(sb, 1);
+ }
+ } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
+ err = start_gc_thread(sbi);
+ if (err)
+ goto restore_opts;
+ }
+skip:
+ /* Update the POSIXACL Flag */
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+ return 0;
+
+restore_opts:
+ sbi->mount_opt = org_mount_opt;
+ sbi->active_logs = active_logs;
+ return err;
+}
+
static struct super_operations f2fs_sops = {
.alloc_inode = f2fs_alloc_inode,
.drop_inode = f2fs_drop_inode,
.destroy_inode = f2fs_destroy_inode,
.write_inode = f2fs_write_inode,
+ .dirty_inode = f2fs_dirty_inode,
.show_options = f2fs_show_options,
.evict_inode = f2fs_evict_inode,
.put_super = f2fs_put_super,
@@ -256,6 +405,7 @@ static struct super_operations f2fs_sops = {
.freeze_fs = f2fs_freeze,
.unfreeze_fs = f2fs_unfreeze,
.statfs = f2fs_statfs,
+ .remount_fs = f2fs_remount,
};
static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
@@ -303,79 +453,6 @@ static const struct export_operations f2fs_export_ops = {
.get_parent = f2fs_get_parent,
};
-static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
- char *options)
-{
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int arg = 0;
-
- if (!options)
- return 0;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
- /*
- * Initialize args struct so we know whether arg was
- * found; some options take optional arguments.
- */
- args[0].to = args[0].from = NULL;
- token = match_token(p, f2fs_tokens, args);
-
- switch (token) {
- case Opt_gc_background_off:
- clear_opt(sbi, BG_GC);
- break;
- case Opt_disable_roll_forward:
- set_opt(sbi, DISABLE_ROLL_FORWARD);
- break;
- case Opt_discard:
- set_opt(sbi, DISCARD);
- break;
- case Opt_noheap:
- set_opt(sbi, NOHEAP);
- break;
-#ifdef CONFIG_F2FS_FS_XATTR
- case Opt_nouser_xattr:
- clear_opt(sbi, XATTR_USER);
- break;
-#else
- case Opt_nouser_xattr:
- f2fs_msg(sb, KERN_INFO,
- "nouser_xattr options not supported");
- break;
-#endif
-#ifdef CONFIG_F2FS_FS_POSIX_ACL
- case Opt_noacl:
- clear_opt(sbi, POSIX_ACL);
- break;
-#else
- case Opt_noacl:
- f2fs_msg(sb, KERN_INFO, "noacl options not supported");
- break;
-#endif
- case Opt_active_logs:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
- return -EINVAL;
- sbi->active_logs = arg;
- break;
- case Opt_disable_ext_identify:
- set_opt(sbi, DISABLE_EXT_IDENTIFY);
- break;
- default:
- f2fs_msg(sb, KERN_ERR,
- "Unrecognized mount option \"%s\" or missing value",
- p);
- return -EINVAL;
- }
- }
- return 0;
-}
-
static loff_t max_file_size(unsigned bits)
{
loff_t result = ADDRS_PER_INODE;
@@ -541,6 +618,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto free_sb_buf;
}
+ sb->s_fs_info = sbi;
/* init some FS parameters */
sbi->active_logs = NR_CURSEG_TYPE;
@@ -553,7 +631,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sbi, POSIX_ACL);
#endif
/* parse mount options */
- err = parse_options(sb, sbi, (char *)data);
+ err = parse_options(sb, (char *)data);
if (err)
goto free_sb_buf;
@@ -565,7 +643,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = f2fs_xattr_handlers;
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
- sb->s_fs_info = sbi;
sb->s_time_gran = 1;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -674,10 +751,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
"Cannot recover all fsync data errno=%ld", err);
}
- /* After POR, we can run background GC thread */
- err = start_gc_thread(sbi);
- if (err)
- goto fail;
+ /*
+ * If filesystem is not mounted as read-only then
+ * do start the gc_thread.
+ */
+ if (!(sb->s_flags & MS_RDONLY)) {
+ /* After POR, we can run background GC thread.*/
+ err = start_gc_thread(sbi);
+ if (err)
+ goto fail;
+ }
err = f2fs_build_stats(sbi);
if (err)
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 0b02dce31356..3ab07ecd86ca 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -20,6 +20,7 @@
*/
#include <linux/rwsem.h>
#include <linux/f2fs_fs.h>
+#include <linux/security.h>
#include "f2fs.h"
#include "xattr.h"
@@ -43,6 +44,10 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
prefix = XATTR_TRUSTED_PREFIX;
prefix_len = XATTR_TRUSTED_PREFIX_LEN;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ prefix = XATTR_SECURITY_PREFIX;
+ prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ break;
default:
return -EINVAL;
}
@@ -50,7 +55,7 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
total_len = prefix_len + name_len + 1;
if (list && total_len <= list_size) {
memcpy(list, prefix, prefix_len);
- memcpy(list+prefix_len, name, name_len);
+ memcpy(list + prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
return total_len;
@@ -70,13 +75,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ break;
default:
return -EINVAL;
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_getxattr(dentry->d_inode, type, name,
- buffer, size);
+ return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
}
static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -93,13 +99,15 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
break;
+ case F2FS_XATTR_INDEX_SECURITY:
+ break;
default:
return -EINVAL;
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_setxattr(dentry->d_inode, type, name, value, size);
+ return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL);
}
static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
@@ -145,6 +153,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
return 0;
}
+#ifdef CONFIG_F2FS_FS_SECURITY
+static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *page)
+{
+ const struct xattr *xattr;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, (struct page *)page);
+ if (err < 0)
+ break;
+ }
+ return err;
+}
+
+int f2fs_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr, struct page *ipage)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &f2fs_initxattrs, ipage);
+}
+#endif
+
const struct xattr_handler f2fs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = F2FS_XATTR_INDEX_USER,
@@ -169,6 +202,14 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
.set = f2fs_xattr_advise_set,
};
+const struct xattr_handler f2fs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .flags = F2FS_XATTR_INDEX_SECURITY,
+ .list = f2fs_xattr_generic_list,
+ .get = f2fs_xattr_generic_get,
+ .set = f2fs_xattr_generic_set,
+};
+
static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -176,6 +217,9 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
#endif
[F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+ [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler,
+#endif
[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
};
@@ -186,6 +230,9 @@ const struct xattr_handler *f2fs_xattr_handlers[] = {
&f2fs_xattr_acl_default_handler,
#endif
&f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+ &f2fs_xattr_security_handler,
+#endif
&f2fs_xattr_advise_handler,
NULL,
};
@@ -218,6 +265,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
return -ENODATA;
page = get_node_page(sbi, fi->i_xattr_nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
base_addr = page_address(page);
list_for_each_xattr(entry, base_addr) {
@@ -268,6 +317,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
return 0;
page = get_node_page(sbi, fi->i_xattr_nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
base_addr = page_address(page);
list_for_each_xattr(entry, base_addr) {
@@ -296,7 +347,7 @@ cleanup:
}
int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
- const void *value, size_t value_len)
+ const void *value, size_t value_len, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -335,7 +386,7 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
mark_inode_dirty(inode);
- page = new_node_page(&dn, XATTR_NODE_OFFSET);
+ page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
if (IS_ERR(page)) {
alloc_nid_failed(sbi, fi->i_xattr_nid);
fi->i_xattr_nid = 0;
@@ -435,7 +486,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
inode->i_ctime = CURRENT_TIME;
clear_inode_flag(fi, FI_ACL_MODE);
}
- update_inode_page(inode);
+ if (ipage)
+ update_inode(inode, ipage);
+ else
+ update_inode_page(inode);
mutex_unlock_op(sbi, ilock);
return 0;
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 49c9558305e3..3c0817bef25d 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -112,21 +112,19 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
extern const struct xattr_handler f2fs_xattr_acl_access_handler;
extern const struct xattr_handler f2fs_xattr_acl_default_handler;
extern const struct xattr_handler f2fs_xattr_advise_handler;
+extern const struct xattr_handler f2fs_xattr_security_handler;
extern const struct xattr_handler *f2fs_xattr_handlers[];
-extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
- const void *value, size_t value_len);
-extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t buffer_size);
-extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
- size_t buffer_size);
-
+extern int f2fs_setxattr(struct inode *, int, const char *,
+ const void *, size_t, struct page *);
+extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
+extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
#else
#define f2fs_xattr_handlers NULL
static inline int f2fs_setxattr(struct inode *inode, int name_index,
- const char *name, const void *value, size_t value_len)
+ const char *name, const void *value, size_t value_len)
{
return -EOPNOTSUPP;
}
@@ -142,4 +140,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
}
#endif
+#ifdef CONFIG_F2FS_FS_SECURITY
+extern int f2fs_init_security(struct inode *, struct inode *,
+ const struct qstr *, struct page *);
+#else
+static inline int f2fs_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr, struct page *ipage)
+{
+ return 0;
+}
+#endif
#endif /* __F2FS_XATTR_H__ */
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3be57189efd5..a85ac4e33436 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,6 +45,7 @@ struct wb_writeback_work {
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
+ unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
@@ -443,9 +444,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/*
* Make sure to wait on the data before writing out the metadata.
* This is important for filesystems that modify metadata on data
- * I/O completion.
+ * I/O completion. We don't do it for sync(2) writeback because it has a
+ * separate, external IO completion path and ->sync_fs for guaranteeing
+ * inode metadata is written back correctly.
*/
- if (wbc->sync_mode == WB_SYNC_ALL) {
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
@@ -578,6 +581,7 @@ static long writeback_sb_inodes(struct super_block *sb,
.tagged_writepages = work->tagged_writepages,
.for_kupdate = work->for_kupdate,
.for_background = work->for_background,
+ .for_sync = work->for_sync,
.range_cyclic = work->range_cyclic,
.range_start = 0,
.range_end = LLONG_MAX,
@@ -1362,6 +1366,7 @@ void sync_inodes_sb(struct super_block *sb)
.range_cyclic = 0,
.done = &done,
.reason = WB_REASON_SYNC,
+ .for_sync = 1,
};
/* Nothing to do? */
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index b52aed1dca97..f7cff367db7f 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -115,7 +115,7 @@ struct fscache_cache *fscache_select_cache_for_object(
struct fscache_object, cookie_link);
cache = object->cache;
- if (object->state >= FSCACHE_OBJECT_DYING ||
+ if (fscache_object_is_dying(object) ||
test_bit(FSCACHE_IOERROR, &cache->flags))
cache = NULL;
@@ -224,8 +224,10 @@ int fscache_add_cache(struct fscache_cache *cache,
BUG_ON(!ifsdef);
cache->flags = 0;
- ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- ifsdef->state = FSCACHE_OBJECT_ACTIVE;
+ ifsdef->event_mask =
+ ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) &
+ ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+ __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags);
if (!tagname)
tagname = cache->identifier;
@@ -330,25 +332,25 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache,
{
struct fscache_object *object;
- spin_lock(&cache->object_list_lock);
-
while (!list_empty(&cache->object_list)) {
- object = list_entry(cache->object_list.next,
- struct fscache_object, cache_link);
- list_move_tail(&object->cache_link, dying_objects);
+ spin_lock(&cache->object_list_lock);
- _debug("withdraw %p", object->cookie);
+ if (!list_empty(&cache->object_list)) {
+ object = list_entry(cache->object_list.next,
+ struct fscache_object, cache_link);
+ list_move_tail(&object->cache_link, dying_objects);
- spin_lock(&object->lock);
- spin_unlock(&cache->object_list_lock);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
- spin_unlock(&object->lock);
+ _debug("withdraw %p", object->cookie);
+
+ /* This must be done under object_list_lock to prevent
+ * a race with fscache_drop_object().
+ */
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+ }
+ spin_unlock(&cache->object_list_lock);
cond_resched();
- spin_lock(&cache->object_list_lock);
}
-
- spin_unlock(&cache->object_list_lock);
}
/**
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index e2cba1f60c21..0e91a3c9fdb2 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -95,6 +95,11 @@ struct fscache_cookie *__fscache_acquire_cookie(
atomic_set(&cookie->usage, 1);
atomic_set(&cookie->n_children, 0);
+ /* We keep the active count elevated until relinquishment to prevent an
+ * attempt to wake up every time the object operations queue quiesces.
+ */
+ atomic_set(&cookie->n_active, 1);
+
atomic_inc(&parent->usage);
atomic_inc(&parent->n_children);
@@ -177,7 +182,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
cookie->flags =
(1 << FSCACHE_COOKIE_LOOKING_UP) |
- (1 << FSCACHE_COOKIE_CREATING) |
(1 << FSCACHE_COOKIE_NO_DATA_YET);
/* ask the cache to allocate objects for this cookie and its parent
@@ -205,7 +209,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
/* initiate the process of looking up all the objects in the chain
* (done by fscache_initialise_object()) */
- fscache_enqueue_object(object);
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD);
spin_unlock(&cookie->lock);
@@ -285,7 +289,7 @@ static int fscache_alloc_object(struct fscache_cache *cache,
object_already_extant:
ret = -ENOBUFS;
- if (object->state >= FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_dead(object)) {
spin_unlock(&cookie->lock);
goto error;
}
@@ -321,7 +325,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
ret = -EEXIST;
hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
if (p->cache == object->cache) {
- if (p->state >= FSCACHE_OBJECT_DYING)
+ if (fscache_object_is_dying(p))
ret = -ENOBUFS;
goto cant_attach_object;
}
@@ -332,7 +336,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
hlist_for_each_entry(p, &cookie->parent->backing_objects,
cookie_link) {
if (p->cache == object->cache) {
- if (p->state >= FSCACHE_OBJECT_DYING) {
+ if (fscache_object_is_dying(p)) {
ret = -ENOBUFS;
spin_unlock(&cookie->parent->lock);
goto cant_attach_object;
@@ -400,7 +404,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object,
cookie_link);
- if (object->state < FSCACHE_OBJECT_DYING)
+ if (fscache_object_is_live(object))
fscache_raise_event(
object, FSCACHE_OBJECT_EV_INVALIDATE);
}
@@ -467,9 +471,7 @@ EXPORT_SYMBOL(__fscache_update_cookie);
*/
void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
{
- struct fscache_cache *cache;
struct fscache_object *object;
- unsigned long event;
fscache_stat(&fscache_n_relinquishes);
if (retire)
@@ -481,8 +483,11 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
return;
}
- _enter("%p{%s,%p},%d",
- cookie, cookie->def->name, cookie->netfs_data, retire);
+ _enter("%p{%s,%p,%d},%d",
+ cookie, cookie->def->name, cookie->netfs_data,
+ atomic_read(&cookie->n_active), retire);
+
+ ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
if (atomic_read(&cookie->n_children) != 0) {
printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
@@ -490,62 +495,28 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
BUG();
}
- /* wait for the cookie to finish being instantiated (or to fail) */
- if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
- fscache_stat(&fscache_n_relinquishes_waitcrt);
- wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
- fscache_wait_bit, TASK_UNINTERRUPTIBLE);
- }
-
- event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
+ /* No further netfs-accessing operations on this cookie permitted */
+ set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+ if (retire)
+ set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
-try_again:
spin_lock(&cookie->lock);
-
- /* break links with all the active objects */
- while (!hlist_empty(&cookie->backing_objects)) {
- int n_reads;
- object = hlist_entry(cookie->backing_objects.first,
- struct fscache_object,
- cookie_link);
-
- _debug("RELEASE OBJ%x", object->debug_id);
-
- set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
- n_reads = atomic_read(&object->n_reads);
- if (n_reads) {
- int n_ops = object->n_ops;
- int n_in_progress = object->n_in_progress;
- spin_unlock(&cookie->lock);
- printk(KERN_ERR "FS-Cache:"
- " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
- cookie->def->name,
- n_reads, n_ops, n_in_progress);
- wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
- fscache_wait_bit, TASK_UNINTERRUPTIBLE);
- printk("Wait finished\n");
- goto try_again;
- }
-
- /* detach each cache object from the object cookie */
- spin_lock(&object->lock);
- hlist_del_init(&object->cookie_link);
-
- cache = object->cache;
- object->cookie = NULL;
- fscache_raise_event(object, event);
- spin_unlock(&object->lock);
-
- if (atomic_dec_and_test(&cookie->usage))
- /* the cookie refcount shouldn't be reduced to 0 yet */
- BUG();
+ hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
}
+ spin_unlock(&cookie->lock);
- /* detach pointers back to the netfs */
+ /* Wait for cessation of activity requiring access to the netfs (when
+ * n_active reaches 0).
+ */
+ if (!atomic_dec_and_test(&cookie->n_active))
+ wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+
+ /* Clear pointers back to the netfs */
cookie->netfs_data = NULL;
cookie->def = NULL;
-
- spin_unlock(&cookie->lock);
+ BUG_ON(cookie->stores.rnode);
if (cookie->parent) {
ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
@@ -553,7 +524,7 @@ try_again:
atomic_dec(&cookie->parent->n_children);
}
- /* finally dispose of the cookie */
+ /* Dispose of the netfs's link to the cookie */
ASSERTCMP(atomic_read(&cookie->usage), >, 0);
fscache_cookie_put(cookie);
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index f5b4baee7352..10a2ade0bdf8 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -55,6 +55,7 @@ static struct fscache_cookie_def fscache_fsdef_index_def = {
struct fscache_cookie fscache_fsdef_index = {
.usage = ATOMIC_INIT(1),
+ .n_active = ATOMIC_INIT(1),
.lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
.backing_objects = HLIST_HEAD_INIT,
.def = &fscache_fsdef_index_def,
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index ee38fef4be51..12d505bedb5c 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -93,14 +93,11 @@ static inline bool fscache_object_congested(void)
extern int fscache_wait_bit(void *);
extern int fscache_wait_bit_interruptible(void *);
+extern int fscache_wait_atomic_t(atomic_t *);
/*
* object.c
*/
-extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
-
-extern void fscache_withdrawing_object(struct fscache_cache *,
- struct fscache_object *);
extern void fscache_enqueue_object(struct fscache_object *);
/*
@@ -110,8 +107,10 @@ extern void fscache_enqueue_object(struct fscache_object *);
extern const struct file_operations fscache_objlist_fops;
extern void fscache_objlist_add(struct fscache_object *);
+extern void fscache_objlist_remove(struct fscache_object *);
#else
#define fscache_objlist_add(object) do {} while(0)
+#define fscache_objlist_remove(object) do {} while(0)
#endif
/*
@@ -291,6 +290,10 @@ static inline void fscache_raise_event(struct fscache_object *object,
unsigned event)
{
BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
+#if 0
+ printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n",
+ object->debug_id, object->event_mask, (1 << event));
+#endif
if (!test_and_set_bit(event, &object->events) &&
test_bit(event, &object->event_mask))
fscache_enqueue_object(object);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index f9d856773f79..7c27907e650c 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -205,7 +205,6 @@ int fscache_wait_bit(void *flags)
schedule();
return 0;
}
-EXPORT_SYMBOL(fscache_wait_bit);
/*
* wait_on_bit() sleep function for interruptible waiting
@@ -215,4 +214,12 @@ int fscache_wait_bit_interruptible(void *flags)
schedule();
return signal_pending(current);
}
-EXPORT_SYMBOL(fscache_wait_bit_interruptible);
+
+/*
+ * wait_on_atomic_t() sleep function for uninterruptible waiting
+ */
+int fscache_wait_atomic_t(atomic_t *p)
+{
+ schedule();
+ return 0;
+}
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index e028b8eb1c40..b1bb6117473a 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -40,6 +40,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
/* initialise the primary index cookie */
atomic_set(&netfs->primary_index->usage, 1);
atomic_set(&netfs->primary_index->n_children, 0);
+ atomic_set(&netfs->primary_index->n_active, 1);
netfs->primary_index->def = &fscache_fsdef_netfs_def;
netfs->primary_index->parent = &fscache_fsdef_index;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index f27c89d17885..e1959efad64f 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -70,13 +70,10 @@ void fscache_objlist_add(struct fscache_object *obj)
write_unlock(&fscache_object_list_lock);
}
-/**
- * fscache_object_destroy - Note that a cache object is about to be destroyed
- * @object: The object to be destroyed
- *
- * Note the imminent destruction and deallocation of a cache object record.
+/*
+ * Remove an object from the object list.
*/
-void fscache_object_destroy(struct fscache_object *obj)
+void fscache_objlist_remove(struct fscache_object *obj)
{
write_lock(&fscache_object_list_lock);
@@ -85,7 +82,6 @@ void fscache_object_destroy(struct fscache_object *obj)
write_unlock(&fscache_object_list_lock);
}
-EXPORT_SYMBOL(fscache_object_destroy);
/*
* find the object in the tree on or after the specified index
@@ -166,15 +162,14 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
{
struct fscache_objlist_data *data = m->private;
struct fscache_object *obj = v;
+ struct fscache_cookie *cookie;
unsigned long config = data->config;
- uint16_t keylen, auxlen;
char _type[3], *type;
- bool no_cookie;
u8 *buf = data->buf, *p;
if ((unsigned long) v == 1) {
seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS"
- " EM EV F S"
+ " EM EV FL S"
" | NETFS_COOKIE_DEF TY FL NETFS_DATA");
if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
FSCACHE_OBJLIST_CONFIG_AUX))
@@ -193,7 +188,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
if ((unsigned long) v == 2) {
seq_puts(m, "======== ======== ==== ===== === === === == ====="
- " == == = ="
+ " == == == ="
" | ================ == == ================");
if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
FSCACHE_OBJLIST_CONFIG_AUX))
@@ -216,10 +211,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
} \
} while(0)
+ cookie = obj->cookie;
if (~config) {
- FILTER(obj->cookie,
+ FILTER(cookie->def,
COOKIE, NOCOOKIE);
- FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+ FILTER(fscache_object_is_active(obj) ||
obj->n_ops != 0 ||
obj->n_obj_ops != 0 ||
obj->flags ||
@@ -235,10 +231,10 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
}
seq_printf(m,
- "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
+ "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ",
obj->debug_id,
obj->parent ? obj->parent->debug_id : -1,
- fscache_object_states_short[obj->state],
+ obj->state->short_name,
obj->n_children,
obj->n_ops,
obj->n_obj_ops,
@@ -250,48 +246,40 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
obj->flags,
work_busy(&obj->work));
- no_cookie = true;
- keylen = auxlen = 0;
- if (obj->cookie) {
- spin_lock(&obj->lock);
- if (obj->cookie) {
- switch (obj->cookie->def->type) {
- case 0:
- type = "IX";
- break;
- case 1:
- type = "DT";
- break;
- default:
- sprintf(_type, "%02u",
- obj->cookie->def->type);
- type = _type;
- break;
- }
+ if (fscache_use_cookie(obj)) {
+ uint16_t keylen = 0, auxlen = 0;
- seq_printf(m, "%-16s %s %2lx %16p",
- obj->cookie->def->name,
- type,
- obj->cookie->flags,
- obj->cookie->netfs_data);
-
- if (obj->cookie->def->get_key &&
- config & FSCACHE_OBJLIST_CONFIG_KEY)
- keylen = obj->cookie->def->get_key(
- obj->cookie->netfs_data,
- buf, 400);
-
- if (obj->cookie->def->get_aux &&
- config & FSCACHE_OBJLIST_CONFIG_AUX)
- auxlen = obj->cookie->def->get_aux(
- obj->cookie->netfs_data,
- buf + keylen, 512 - keylen);
-
- no_cookie = false;
+ switch (cookie->def->type) {
+ case 0:
+ type = "IX";
+ break;
+ case 1:
+ type = "DT";
+ break;
+ default:
+ sprintf(_type, "%02u", cookie->def->type);
+ type = _type;
+ break;
}
- spin_unlock(&obj->lock);
- if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+ seq_printf(m, "%-16s %s %2lx %16p",
+ cookie->def->name,
+ type,
+ cookie->flags,
+ cookie->netfs_data);
+
+ if (cookie->def->get_key &&
+ config & FSCACHE_OBJLIST_CONFIG_KEY)
+ keylen = cookie->def->get_key(cookie->netfs_data,
+ buf, 400);
+
+ if (cookie->def->get_aux &&
+ config & FSCACHE_OBJLIST_CONFIG_AUX)
+ auxlen = cookie->def->get_aux(cookie->netfs_data,
+ buf + keylen, 512 - keylen);
+ fscache_unuse_cookie(obj);
+
+ if (keylen > 0 || auxlen > 0) {
seq_printf(m, " ");
for (p = buf; keylen > 0; keylen--)
seq_printf(m, "%02x", *p++);
@@ -302,12 +290,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
seq_printf(m, "%02x", *p++);
}
}
- }
- if (no_cookie)
- seq_printf(m, "<no_cookie>\n");
- else
seq_printf(m, "\n");
+ } else {
+ seq_printf(m, "<no_netfs>\n");
+ }
return 0;
}
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 50d41c180211..86d75a60b20c 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -15,52 +15,131 @@
#define FSCACHE_DEBUG_LEVEL COOKIE
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/prefetch.h>
#include "internal.h"
-const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
- [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
- [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
- [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
- [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
- [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
- [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING",
- [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
- [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
- [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
- [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
- [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
- [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
- [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
- [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_drop_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int);
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
+static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
+
+#define __STATE_NAME(n) fscache_osm_##n
+#define STATE(n) (&__STATE_NAME(n))
+
+/*
+ * Define a work state. Work states are execution states. No event processing
+ * is performed by them. The function attached to a work state returns a
+ * pointer indicating the next state to which the state machine should
+ * transition. Returning NO_TRANSIT repeats the current state, but goes back
+ * to the scheduler first.
+ */
+#define WORK_STATE(n, sn, f) \
+ const struct fscache_state __STATE_NAME(n) = { \
+ .name = #n, \
+ .short_name = sn, \
+ .work = f \
+ }
+
+/*
+ * Returns from work states.
+ */
+#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); })
+
+#define NO_TRANSIT ((struct fscache_state *)NULL)
+
+/*
+ * Define a wait state. Wait states are event processing states. No execution
+ * is performed by them. Wait states are just tables of "if event X occurs,
+ * clear it and transition to state Y". The dispatcher returns to the
+ * scheduler if none of the events in which the wait state has an interest are
+ * currently pending.
+ */
+#define WAIT_STATE(n, sn, ...) \
+ const struct fscache_state __STATE_NAME(n) = { \
+ .name = #n, \
+ .short_name = sn, \
+ .work = NULL, \
+ .transitions = { __VA_ARGS__, { 0, NULL } } \
+ }
+
+#define TRANSIT_TO(state, emask) \
+ { .events = (emask), .transit_to = STATE(state) }
+
+/*
+ * The object state machine.
+ */
+static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object);
+static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready);
+static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation);
+static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object);
+static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object);
+static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available);
+static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents);
+
+static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object);
+static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object);
+
+static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure);
+static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object);
+static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents);
+static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object);
+static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL);
+
+static WAIT_STATE(WAIT_FOR_INIT, "?INI",
+ TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_PARENT, "?PRN",
+ TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY));
+
+static WAIT_STATE(WAIT_FOR_CMD, "?CMD",
+ TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE),
+ TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE),
+ TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR",
+ TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED));
+
+/*
+ * Out-of-band event transition tables. These are for handling unexpected
+ * events, such as an I/O error. If an OOB event occurs, the state machine
+ * clears and disables the event and forces a transition to the nominated work
+ * state (acurrently executing work states will complete first).
+ *
+ * In such a situation, object->state remembers the state the machine should
+ * have been in/gone to and returning NO_TRANSIT returns to that.
+ */
+static const struct fscache_transition fscache_osm_init_oob[] = {
+ TRANSIT_TO(ABORT_INIT,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_lookup_oob[] = {
+ TRANSIT_TO(LOOKUP_FAILURE,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
};
-EXPORT_SYMBOL(fscache_object_states);
-
-const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
- [FSCACHE_OBJECT_INIT] = "INIT",
- [FSCACHE_OBJECT_LOOKING_UP] = "LOOK",
- [FSCACHE_OBJECT_CREATING] = "CRTN",
- [FSCACHE_OBJECT_AVAILABLE] = "AVBL",
- [FSCACHE_OBJECT_ACTIVE] = "ACTV",
- [FSCACHE_OBJECT_INVALIDATING] = "INVL",
- [FSCACHE_OBJECT_UPDATING] = "UPDT",
- [FSCACHE_OBJECT_DYING] = "DYNG",
- [FSCACHE_OBJECT_LC_DYING] = "LCDY",
- [FSCACHE_OBJECT_ABORT_INIT] = "ABTI",
- [FSCACHE_OBJECT_RELEASING] = "RELS",
- [FSCACHE_OBJECT_RECYCLING] = "RCYC",
- [FSCACHE_OBJECT_WITHDRAWING] = "WTHD",
- [FSCACHE_OBJECT_DEAD] = "DEAD",
+
+static const struct fscache_transition fscache_osm_run_oob[] = {
+ TRANSIT_TO(KILL_OBJECT,
+ (1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_KILL)),
+ { 0, NULL }
};
static int fscache_get_object(struct fscache_object *);
static void fscache_put_object(struct fscache_object *);
-static void fscache_initialise_object(struct fscache_object *);
-static void fscache_lookup_object(struct fscache_object *);
-static void fscache_object_available(struct fscache_object *);
-static void fscache_invalidate_object(struct fscache_object *);
-static void fscache_release_object(struct fscache_object *);
-static void fscache_withdraw_object(struct fscache_object *);
-static void fscache_enqueue_dependents(struct fscache_object *);
+static bool fscache_enqueue_dependents(struct fscache_object *, int);
static void fscache_dequeue_object(struct fscache_object *);
/*
@@ -75,295 +154,116 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
object->debug_id, parent->debug_id, parent->n_ops);
spin_lock_nested(&parent->lock, 1);
- parent->n_ops--;
parent->n_obj_ops--;
+ parent->n_ops--;
if (parent->n_ops == 0)
fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
spin_unlock(&parent->lock);
}
/*
- * Notify netfs of invalidation completion.
+ * Object state machine dispatcher.
*/
-static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
+static void fscache_object_sm_dispatcher(struct fscache_object *object)
{
- if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
-}
-
-/*
- * process events that have been sent to an object's state machine
- * - initiates parent lookup
- * - does object lookup
- * - does object creation
- * - does object recycling and retirement
- * - does object withdrawal
- */
-static void fscache_object_state_machine(struct fscache_object *object)
-{
- enum fscache_object_state new_state;
- struct fscache_cookie *cookie;
- int event;
+ const struct fscache_transition *t;
+ const struct fscache_state *state, *new_state;
+ unsigned long events, event_mask;
+ int event = -1;
ASSERT(object != NULL);
_enter("{OBJ%x,%s,%lx}",
- object->debug_id, fscache_object_states[object->state],
- object->events);
-
- switch (object->state) {
- /* wait for the parent object to become ready */
- case FSCACHE_OBJECT_INIT:
- object->event_mask =
- FSCACHE_OBJECT_EVENTS_MASK &
- ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- fscache_initialise_object(object);
- goto done;
-
- /* look up the object metadata on disk */
- case FSCACHE_OBJECT_LOOKING_UP:
- fscache_lookup_object(object);
- goto lookup_transit;
-
- /* create the object metadata on disk */
- case FSCACHE_OBJECT_CREATING:
- fscache_lookup_object(object);
- goto lookup_transit;
-
- /* handle an object becoming available; start pending
- * operations and queue dependent operations for processing */
- case FSCACHE_OBJECT_AVAILABLE:
- fscache_object_available(object);
- goto active_transit;
-
- /* normal running state */
- case FSCACHE_OBJECT_ACTIVE:
- goto active_transit;
-
- /* Invalidate an object on disk */
- case FSCACHE_OBJECT_INVALIDATING:
- clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
- fscache_stat(&fscache_n_invalidates_run);
- fscache_stat(&fscache_n_cop_invalidate_object);
- fscache_invalidate_object(object);
- fscache_stat_d(&fscache_n_cop_invalidate_object);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
- goto active_transit;
-
- /* update the object metadata on disk */
- case FSCACHE_OBJECT_UPDATING:
- clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
- fscache_stat(&fscache_n_updates_run);
- fscache_stat(&fscache_n_cop_update_object);
- object->cache->ops->update_object(object);
- fscache_stat_d(&fscache_n_cop_update_object);
- goto active_transit;
-
- /* handle an object dying during lookup or creation */
- case FSCACHE_OBJECT_LC_DYING:
- object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
- fscache_stat(&fscache_n_cop_lookup_complete);
- object->cache->ops->lookup_complete(object);
- fscache_stat_d(&fscache_n_cop_lookup_complete);
-
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DYING;
- cookie = object->cookie;
- if (cookie) {
- if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
- &cookie->flags))
- wake_up_bit(&cookie->flags,
- FSCACHE_COOKIE_LOOKING_UP);
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
- &cookie->flags))
- wake_up_bit(&cookie->flags,
- FSCACHE_COOKIE_CREATING);
+ object->debug_id, object->state->name, object->events);
+
+ event_mask = object->event_mask;
+restart:
+ object->event_mask = 0; /* Mask normal event handling */
+ state = object->state;
+restart_masked:
+ events = object->events;
+
+ /* Handle any out-of-band events (typically an error) */
+ if (events & object->oob_event_mask) {
+ _debug("{OBJ%x} oob %lx",
+ object->debug_id, events & object->oob_event_mask);
+ for (t = object->oob_table; t->events; t++) {
+ if (events & t->events) {
+ state = t->transit_to;
+ ASSERT(state->work != NULL);
+ event = fls(events & t->events) - 1;
+ __clear_bit(event, &object->oob_event_mask);
+ clear_bit(event, &object->events);
+ goto execute_work_state;
+ }
}
- spin_unlock(&object->lock);
+ }
- fscache_done_parent_op(object);
+ /* Wait states are just transition tables */
+ if (!state->work) {
+ if (events & event_mask) {
+ for (t = state->transitions; t->events; t++) {
+ if (events & t->events) {
+ new_state = t->transit_to;
+ event = fls(events & t->events) - 1;
+ clear_bit(event, &object->events);
+ _debug("{OBJ%x} ev %d: %s -> %s",
+ object->debug_id, event,
+ state->name, new_state->name);
+ object->state = state = new_state;
+ goto execute_work_state;
+ }
+ }
- /* wait for completion of all active operations on this object
- * and the death of all child objects of this object */
- case FSCACHE_OBJECT_DYING:
- dying:
- clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
- spin_lock(&object->lock);
- _debug("dying OBJ%x {%d,%d}",
- object->debug_id, object->n_ops, object->n_children);
- if (object->n_ops == 0 && object->n_children == 0) {
- object->event_mask &=
- ~(1 << FSCACHE_OBJECT_EV_CLEARED);
- object->event_mask |=
- (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR);
- } else {
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- object->event_mask |=
- 1 << FSCACHE_OBJECT_EV_CLEARED;
+ /* The event mask didn't include all the tabled bits */
+ BUG();
}
- spin_unlock(&object->lock);
- fscache_enqueue_dependents(object);
- fscache_start_operations(object);
- goto terminal_transit;
-
- /* handle an abort during initialisation */
- case FSCACHE_OBJECT_ABORT_INIT:
- _debug("handle abort init %lx", object->events);
- object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-
- spin_lock(&object->lock);
- fscache_dequeue_object(object);
-
- object->state = FSCACHE_OBJECT_DYING;
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
- &object->cookie->flags))
- wake_up_bit(&object->cookie->flags,
- FSCACHE_COOKIE_CREATING);
- spin_unlock(&object->lock);
- goto dying;
-
- /* handle the netfs releasing an object and possibly marking it
- * obsolete too */
- case FSCACHE_OBJECT_RELEASING:
- case FSCACHE_OBJECT_RECYCLING:
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- fscache_release_object(object);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DEAD;
- spin_unlock(&object->lock);
- fscache_stat(&fscache_n_object_dead);
- goto terminal_transit;
-
- /* handle the parent cache of this object being withdrawn from
- * active service */
- case FSCACHE_OBJECT_WITHDRAWING:
- object->event_mask &=
- ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_ERROR));
- fscache_withdraw_object(object);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_DEAD;
- spin_unlock(&object->lock);
- fscache_stat(&fscache_n_object_dead);
- goto terminal_transit;
-
- /* complain about the object being woken up once it is
- * deceased */
- case FSCACHE_OBJECT_DEAD:
- printk(KERN_ERR "FS-Cache:"
- " Unexpected event in dead state %lx\n",
- object->events & object->event_mask);
- BUG();
-
- default:
- printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
- object->state);
- BUG();
- }
-
- /* determine the transition from a lookup state */
-lookup_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- case FSCACHE_OBJECT_EV_RETIRE:
- case FSCACHE_OBJECT_EV_RELEASE:
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_LC_DYING;
- goto change_state;
- case FSCACHE_OBJECT_EV_INVALIDATE:
- new_state = FSCACHE_OBJECT_INVALIDATING;
- goto change_state;
- case FSCACHE_OBJECT_EV_REQUEUE:
- goto done;
- case -1:
- goto done; /* sleep until event */
- default:
- goto unsupported_event;
+ /* Randomly woke up */
+ goto unmask_events;
}
- /* determine the transition from an active state */
-active_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- case FSCACHE_OBJECT_EV_RETIRE:
- case FSCACHE_OBJECT_EV_RELEASE:
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_DYING;
- goto change_state;
- case FSCACHE_OBJECT_EV_INVALIDATE:
- new_state = FSCACHE_OBJECT_INVALIDATING;
- goto change_state;
- case FSCACHE_OBJECT_EV_UPDATE:
- new_state = FSCACHE_OBJECT_UPDATING;
- goto change_state;
- case -1:
- new_state = FSCACHE_OBJECT_ACTIVE;
- goto change_state; /* sleep until event */
- default:
- goto unsupported_event;
- }
+execute_work_state:
+ _debug("{OBJ%x} exec %s", object->debug_id, state->name);
- /* determine the transition from a terminal state */
-terminal_transit:
- event = fls(object->events & object->event_mask) - 1;
- switch (event) {
- case FSCACHE_OBJECT_EV_WITHDRAW:
- new_state = FSCACHE_OBJECT_WITHDRAWING;
- goto change_state;
- case FSCACHE_OBJECT_EV_RETIRE:
- new_state = FSCACHE_OBJECT_RECYCLING;
- goto change_state;
- case FSCACHE_OBJECT_EV_RELEASE:
- new_state = FSCACHE_OBJECT_RELEASING;
- goto change_state;
- case FSCACHE_OBJECT_EV_ERROR:
- new_state = FSCACHE_OBJECT_WITHDRAWING;
- goto change_state;
- case FSCACHE_OBJECT_EV_CLEARED:
- new_state = FSCACHE_OBJECT_DYING;
- goto change_state;
- case -1:
- goto done; /* sleep until event */
- default:
- goto unsupported_event;
+ new_state = state->work(object, event);
+ event = -1;
+ if (new_state == NO_TRANSIT) {
+ _debug("{OBJ%x} %s notrans", object->debug_id, state->name);
+ fscache_enqueue_object(object);
+ event_mask = object->oob_event_mask;
+ goto unmask_events;
}
-change_state:
- spin_lock(&object->lock);
- object->state = new_state;
- spin_unlock(&object->lock);
+ _debug("{OBJ%x} %s -> %s",
+ object->debug_id, state->name, new_state->name);
+ object->state = state = new_state;
-done:
- _leave(" [->%s]", fscache_object_states[object->state]);
- return;
+ if (state->work) {
+ if (unlikely(state->work == ((void *)2UL))) {
+ _leave(" [dead]");
+ return;
+ }
+ goto restart_masked;
+ }
-unsupported_event:
- printk(KERN_ERR "FS-Cache:"
- " Unsupported event %d [%lx/%lx] in state %s\n",
- event, object->events, object->event_mask,
- fscache_object_states[object->state]);
- BUG();
+ /* Transited to wait state */
+ event_mask = object->oob_event_mask;
+ for (t = state->transitions; t->events; t++)
+ event_mask |= t->events;
+
+unmask_events:
+ object->event_mask = event_mask;
+ smp_mb();
+ events = object->events;
+ if (events & event_mask)
+ goto restart;
+ _leave(" [msk %lx]", event_mask);
}
/*
* execute an object
*/
-void fscache_object_work_func(struct work_struct *work)
+static void fscache_object_work_func(struct work_struct *work)
{
struct fscache_object *object =
container_of(work, struct fscache_object, work);
@@ -372,14 +272,70 @@ void fscache_object_work_func(struct work_struct *work)
_enter("{OBJ%x}", object->debug_id);
start = jiffies;
- fscache_object_state_machine(object);
+ fscache_object_sm_dispatcher(object);
fscache_hist(fscache_objs_histogram, start);
- if (object->events & object->event_mask)
- fscache_enqueue_object(object);
- clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
fscache_put_object(object);
}
-EXPORT_SYMBOL(fscache_object_work_func);
+
+/**
+ * fscache_object_init - Initialise a cache object description
+ * @object: Object description
+ * @cookie: Cookie object will be attached to
+ * @cache: Cache in which backing object will be found
+ *
+ * Initialise a cache object description to its basic values.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_object_init(struct fscache_object *object,
+ struct fscache_cookie *cookie,
+ struct fscache_cache *cache)
+{
+ const struct fscache_transition *t;
+
+ atomic_inc(&cache->object_count);
+
+ object->state = STATE(WAIT_FOR_INIT);
+ object->oob_table = fscache_osm_init_oob;
+ object->flags = 1 << FSCACHE_OBJECT_IS_LIVE;
+ spin_lock_init(&object->lock);
+ INIT_LIST_HEAD(&object->cache_link);
+ INIT_HLIST_NODE(&object->cookie_link);
+ INIT_WORK(&object->work, fscache_object_work_func);
+ INIT_LIST_HEAD(&object->dependents);
+ INIT_LIST_HEAD(&object->dep_link);
+ INIT_LIST_HEAD(&object->pending_ops);
+ object->n_children = 0;
+ object->n_ops = object->n_in_progress = object->n_exclusive = 0;
+ object->events = 0;
+ object->store_limit = 0;
+ object->store_limit_l = 0;
+ object->cache = cache;
+ object->cookie = cookie;
+ object->parent = NULL;
+
+ object->oob_event_mask = 0;
+ for (t = object->oob_table; t->events; t++)
+ object->oob_event_mask |= t->events;
+ object->event_mask = object->oob_event_mask;
+ for (t = object->state->transitions; t->events; t++)
+ object->event_mask |= t->events;
+}
+EXPORT_SYMBOL(fscache_object_init);
+
+/*
+ * Abort object initialisation before we start it.
+ */
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ object->oob_event_mask = 0;
+ fscache_dequeue_object(object);
+ return transit_to(KILL_OBJECT);
+}
/*
* initialise an object
@@ -387,130 +343,136 @@ EXPORT_SYMBOL(fscache_object_work_func);
* immediately to do a creation
* - we may need to start the process of creating a parent and we need to wait
* for the parent's lookup and creation to complete if it's not there yet
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- * leaf-most cookies of the object and all its children
*/
-static void fscache_initialise_object(struct fscache_object *object)
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *object,
+ int event)
{
struct fscache_object *parent;
+ bool success;
- _enter("");
- ASSERT(object->cookie != NULL);
- ASSERT(object->cookie->parent != NULL);
-
- if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
- (1 << FSCACHE_OBJECT_EV_RELEASE) |
- (1 << FSCACHE_OBJECT_EV_RETIRE) |
- (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
- _debug("abort init %lx", object->events);
- spin_lock(&object->lock);
- object->state = FSCACHE_OBJECT_ABORT_INIT;
- spin_unlock(&object->lock);
- return;
- }
+ _enter("{OBJ%x},%d", object->debug_id, event);
- spin_lock(&object->cookie->lock);
- spin_lock_nested(&object->cookie->parent->lock, 1);
+ ASSERT(list_empty(&object->dep_link));
parent = object->parent;
if (!parent) {
- _debug("no parent");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- } else {
- spin_lock(&object->lock);
- spin_lock_nested(&parent->lock, 1);
- _debug("parent %s", fscache_object_states[parent->state]);
-
- if (parent->state >= FSCACHE_OBJECT_DYING) {
- _debug("bad parent");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
- _debug("wait");
-
- /* we may get woken up in this state by child objects
- * binding on to us, so we need to make sure we don't
- * add ourself to the list multiple times */
- if (list_empty(&object->dep_link)) {
- fscache_stat(&fscache_n_cop_grab_object);
- object->cache->ops->grab_object(object);
- fscache_stat_d(&fscache_n_cop_grab_object);
- list_add(&object->dep_link,
- &parent->dependents);
-
- /* fscache_acquire_non_index_cookie() uses this
- * to wake the chain up */
- if (parent->state == FSCACHE_OBJECT_INIT)
- fscache_enqueue_object(parent);
- }
- } else {
- _debug("go");
- parent->n_ops++;
- parent->n_obj_ops++;
- object->lookup_jif = jiffies;
- object->state = FSCACHE_OBJECT_LOOKING_UP;
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- }
+ _leave(" [no parent]");
+ return transit_to(DROP_OBJECT);
+ }
- spin_unlock(&parent->lock);
- spin_unlock(&object->lock);
+ _debug("parent: %s of:%lx", parent->state->name, parent->flags);
+
+ if (fscache_object_is_dying(parent)) {
+ _leave(" [bad parent]");
+ return transit_to(DROP_OBJECT);
}
- spin_unlock(&object->cookie->parent->lock);
- spin_unlock(&object->cookie->lock);
+ if (fscache_object_is_available(parent)) {
+ _leave(" [ready]");
+ return transit_to(PARENT_READY);
+ }
+
+ _debug("wait");
+
+ spin_lock(&parent->lock);
+ fscache_stat(&fscache_n_cop_grab_object);
+ success = false;
+ if (fscache_object_is_live(parent) &&
+ object->cache->ops->grab_object(object)) {
+ list_add(&object->dep_link, &parent->dependents);
+ success = true;
+ }
+ fscache_stat_d(&fscache_n_cop_grab_object);
+ spin_unlock(&parent->lock);
+ if (!success) {
+ _leave(" [grab failed]");
+ return transit_to(DROP_OBJECT);
+ }
+
+ /* fscache_acquire_non_index_cookie() uses this
+ * to wake the chain up */
+ fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD);
+ _leave(" [wait]");
+ return transit_to(WAIT_FOR_PARENT);
+}
+
+/*
+ * Once the parent object is ready, we should kick off our lookup op.
+ */
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *object,
+ int event)
+{
+ struct fscache_object *parent = object->parent;
+
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ ASSERT(parent != NULL);
+
+ spin_lock(&parent->lock);
+ parent->n_ops++;
+ parent->n_obj_ops++;
+ object->lookup_jif = jiffies;
+ spin_unlock(&parent->lock);
+
_leave("");
+ return transit_to(LOOK_UP_OBJECT);
}
/*
* look an object up in the cache from which it was allocated
* - we hold an "access lock" on the parent object, so the parent object cannot
* be withdrawn by either party till we've finished
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- * leaf-most cookies of the object and all its children
*/
-static void fscache_lookup_object(struct fscache_object *object)
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *object,
+ int event)
{
struct fscache_cookie *cookie = object->cookie;
- struct fscache_object *parent;
+ struct fscache_object *parent = object->parent;
int ret;
- _enter("");
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ object->oob_table = fscache_osm_lookup_oob;
- parent = object->parent;
ASSERT(parent != NULL);
ASSERTCMP(parent->n_ops, >, 0);
ASSERTCMP(parent->n_obj_ops, >, 0);
/* make sure the parent is still available */
- ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
-
- if (parent->state >= FSCACHE_OBJECT_DYING ||
- test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
- _debug("unavailable");
- set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
- _leave("");
- return;
+ ASSERT(fscache_object_is_available(parent));
+
+ if (fscache_object_is_dying(parent) ||
+ test_bit(FSCACHE_IOERROR, &object->cache->flags) ||
+ !fscache_use_cookie(object)) {
+ _leave(" [unavailable]");
+ return transit_to(LOOKUP_FAILURE);
}
- _debug("LOOKUP \"%s/%s\" in \"%s\"",
- parent->cookie->def->name, cookie->def->name,
- object->cache->tag->name);
+ _debug("LOOKUP \"%s\" in \"%s\"",
+ cookie->def->name, object->cache->tag->name);
fscache_stat(&fscache_n_object_lookups);
fscache_stat(&fscache_n_cop_lookup_object);
ret = object->cache->ops->lookup_object(object);
fscache_stat_d(&fscache_n_cop_lookup_object);
- if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
- set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+ fscache_unuse_cookie(object);
if (ret == -ETIMEDOUT) {
/* probably stuck behind another object, so move this one to
* the back of the queue */
fscache_stat(&fscache_n_object_lookups_timed_out);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+ _leave(" [timeout]");
+ return NO_TRANSIT;
}
- _leave("");
+ if (ret < 0) {
+ _leave(" [error]");
+ return transit_to(LOOKUP_FAILURE);
+ }
+
+ _leave(" [ok]");
+ return transit_to(OBJECT_AVAILABLE);
}
/**
@@ -524,32 +486,20 @@ void fscache_object_lookup_negative(struct fscache_object *object)
{
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x,%s}",
- object->debug_id, fscache_object_states[object->state]);
+ _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
- spin_lock(&object->lock);
- if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+ if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
fscache_stat(&fscache_n_object_lookups_negative);
- /* transit here to allow write requests to begin stacking up
- * and read requests to begin returning ENODATA */
- object->state = FSCACHE_OBJECT_CREATING;
- spin_unlock(&object->lock);
-
- set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
+ /* Allow write requests to begin stacking up and read requests to begin
+ * returning ENODATA.
+ */
set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
_debug("wake up lookup %p", &cookie->flags);
- smp_mb__before_clear_bit();
- clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
- smp_mb__after_clear_bit();
+ clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- } else {
- ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
- spin_unlock(&object->lock);
}
-
_leave("");
}
EXPORT_SYMBOL(fscache_object_lookup_negative);
@@ -568,38 +518,26 @@ void fscache_obtained_object(struct fscache_object *object)
{
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x,%s}",
- object->debug_id, fscache_object_states[object->state]);
+ _enter("{OBJ%x,%s}", object->debug_id, object->state->name);
/* if we were still looking up, then we must have a positive lookup
* result, in which case there may be data available */
- spin_lock(&object->lock);
- if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+ if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
fscache_stat(&fscache_n_object_lookups_positive);
- clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+ /* We do (presumably) have data */
+ clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
- object->state = FSCACHE_OBJECT_AVAILABLE;
- spin_unlock(&object->lock);
-
- smp_mb__before_clear_bit();
- clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
- smp_mb__after_clear_bit();
+ /* Allow write requests to begin stacking up and read requests
+ * to begin shovelling data.
+ */
+ clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
} else {
- ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
fscache_stat(&fscache_n_object_created);
-
- object->state = FSCACHE_OBJECT_AVAILABLE;
- spin_unlock(&object->lock);
- set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
- smp_wmb();
}
- if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
- wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
-
+ set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
_leave("");
}
EXPORT_SYMBOL(fscache_obtained_object);
@@ -607,15 +545,14 @@ EXPORT_SYMBOL(fscache_obtained_object);
/*
* handle an object that has just become available
*/
-static void fscache_object_available(struct fscache_object *object)
+static const struct fscache_state *fscache_object_available(struct fscache_object *object,
+ int event)
{
- _enter("{OBJ%x}", object->debug_id);
+ _enter("{OBJ%x},%d", object->debug_id, event);
- spin_lock(&object->lock);
+ object->oob_table = fscache_osm_run_oob;
- if (object->cookie &&
- test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
- wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
+ spin_lock(&object->lock);
fscache_done_parent_op(object);
if (object->n_in_progress == 0) {
@@ -631,130 +568,158 @@ static void fscache_object_available(struct fscache_object *object)
fscache_stat(&fscache_n_cop_lookup_complete);
object->cache->ops->lookup_complete(object);
fscache_stat_d(&fscache_n_cop_lookup_complete);
- fscache_enqueue_dependents(object);
fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
fscache_stat(&fscache_n_object_avail);
_leave("");
+ return transit_to(JUMPSTART_DEPS);
}
/*
- * drop an object's attachments
+ * Wake up this object's dependent objects now that we've become available.
*/
-static void fscache_drop_object(struct fscache_object *object)
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object,
+ int event)
{
- struct fscache_object *parent = object->parent;
- struct fscache_cache *cache = object->cache;
+ _enter("{OBJ%x},%d", object->debug_id, event);
- _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+ if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY))
+ return NO_TRANSIT; /* Not finished; requeue */
+ return transit_to(WAIT_FOR_CMD);
+}
- ASSERTCMP(object->cookie, ==, NULL);
- ASSERT(hlist_unhashed(&object->cookie_link));
+/*
+ * Handle lookup or creation failute.
+ */
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object,
+ int event)
+{
+ struct fscache_cookie *cookie;
- spin_lock(&cache->object_list_lock);
- list_del_init(&object->cache_link);
- spin_unlock(&cache->object_list_lock);
+ _enter("{OBJ%x},%d", object->debug_id, event);
- fscache_stat(&fscache_n_cop_drop_object);
- cache->ops->drop_object(object);
- fscache_stat_d(&fscache_n_cop_drop_object);
+ object->oob_event_mask = 0;
- if (parent) {
- _debug("release parent OBJ%x {%d}",
- parent->debug_id, parent->n_children);
+ fscache_stat(&fscache_n_cop_lookup_complete);
+ object->cache->ops->lookup_complete(object);
+ fscache_stat_d(&fscache_n_cop_lookup_complete);
- spin_lock(&parent->lock);
- parent->n_children--;
- if (parent->n_children == 0)
- fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
- spin_unlock(&parent->lock);
- object->parent = NULL;
+ cookie = object->cookie;
+ set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+ if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+
+ fscache_done_parent_op(object);
+ return transit_to(KILL_OBJECT);
+}
+
+/*
+ * Wait for completion of all active operations on this object and the death of
+ * all child objects of this object.
+ */
+static const struct fscache_state *fscache_kill_object(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x,%d,%d},%d",
+ object->debug_id, object->n_ops, object->n_children, event);
+
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ object->oob_event_mask = 0;
+
+ if (list_empty(&object->dependents) &&
+ object->n_ops == 0 &&
+ object->n_children == 0)
+ return transit_to(DROP_OBJECT);
+
+ if (object->n_in_progress == 0) {
+ spin_lock(&object->lock);
+ if (object->n_ops > 0 && object->n_in_progress == 0)
+ fscache_start_operations(object);
+ spin_unlock(&object->lock);
}
- /* this just shifts the object release to the work processor */
- fscache_put_object(object);
+ if (!list_empty(&object->dependents))
+ return transit_to(KILL_DEPENDENTS);
- _leave("");
+ return transit_to(WAIT_FOR_CLEARANCE);
}
/*
- * release or recycle an object that the netfs has discarded
+ * Kill dependent objects.
*/
-static void fscache_release_object(struct fscache_object *object)
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object,
+ int event)
{
- _enter("");
+ _enter("{OBJ%x},%d", object->debug_id, event);
- fscache_drop_object(object);
+ if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL))
+ return NO_TRANSIT; /* Not finished */
+ return transit_to(WAIT_FOR_CLEARANCE);
}
/*
- * withdraw an object from active service
+ * Drop an object's attachments
*/
-static void fscache_withdraw_object(struct fscache_object *object)
+static const struct fscache_state *fscache_drop_object(struct fscache_object *object,
+ int event)
{
- struct fscache_cookie *cookie;
- bool detached;
+ struct fscache_object *parent = object->parent;
+ struct fscache_cookie *cookie = object->cookie;
+ struct fscache_cache *cache = object->cache;
+ bool awaken = false;
- _enter("");
+ _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event);
- spin_lock(&object->lock);
- cookie = object->cookie;
- if (cookie) {
- /* need to get the cookie lock before the object lock, starting
- * from the object pointer */
- atomic_inc(&cookie->usage);
- spin_unlock(&object->lock);
+ ASSERT(cookie != NULL);
+ ASSERT(!hlist_unhashed(&object->cookie_link));
- detached = false;
- spin_lock(&cookie->lock);
- spin_lock(&object->lock);
+ /* Make sure the cookie no longer points here and that the netfs isn't
+ * waiting for us.
+ */
+ spin_lock(&cookie->lock);
+ hlist_del_init(&object->cookie_link);
+ if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+ awaken = true;
+ spin_unlock(&cookie->lock);
- if (object->cookie == cookie) {
- hlist_del_init(&object->cookie_link);
- object->cookie = NULL;
- fscache_invalidation_complete(cookie);
- detached = true;
- }
- spin_unlock(&cookie->lock);
- fscache_cookie_put(cookie);
- if (detached)
- fscache_cookie_put(cookie);
- }
+ if (awaken)
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+ /* Prevent a race with our last child, which has to signal EV_CLEARED
+ * before dropping our spinlock.
+ */
+ spin_lock(&object->lock);
spin_unlock(&object->lock);
- fscache_drop_object(object);
-}
+ /* Discard from the cache's collection of objects */
+ spin_lock(&cache->object_list_lock);
+ list_del_init(&object->cache_link);
+ spin_unlock(&cache->object_list_lock);
-/*
- * withdraw an object from active service at the behest of the cache
- * - need break the links to a cached object cookie
- * - called under two situations:
- * (1) recycler decides to reclaim an in-use object
- * (2) a cache is unmounted
- * - have to take care as the cookie can be being relinquished by the netfs
- * simultaneously
- * - the object is pinned by the caller holding a refcount on it
- */
-void fscache_withdrawing_object(struct fscache_cache *cache,
- struct fscache_object *object)
-{
- bool enqueue = false;
+ fscache_stat(&fscache_n_cop_drop_object);
+ cache->ops->drop_object(object);
+ fscache_stat_d(&fscache_n_cop_drop_object);
- _enter(",OBJ%x", object->debug_id);
+ /* The parent object wants to know when all it dependents have gone */
+ if (parent) {
+ _debug("release parent OBJ%x {%d}",
+ parent->debug_id, parent->n_children);
- spin_lock(&object->lock);
- if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
- object->state = FSCACHE_OBJECT_WITHDRAWING;
- enqueue = true;
+ spin_lock(&parent->lock);
+ parent->n_children--;
+ if (parent->n_children == 0)
+ fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+ spin_unlock(&parent->lock);
+ object->parent = NULL;
}
- spin_unlock(&object->lock);
- if (enqueue)
- fscache_enqueue_object(object);
+ /* this just shifts the object release to the work processor */
+ fscache_put_object(object);
+ fscache_stat(&fscache_n_object_dead);
_leave("");
+ return transit_to(OBJECT_DEAD);
}
/*
@@ -771,7 +736,7 @@ static int fscache_get_object(struct fscache_object *object)
}
/*
- * discard a ref on a work item
+ * Discard a ref on an object
*/
static void fscache_put_object(struct fscache_object *object)
{
@@ -780,6 +745,22 @@ static void fscache_put_object(struct fscache_object *object)
fscache_stat_d(&fscache_n_cop_put_object);
}
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *object)
+{
+ fscache_objlist_remove(object);
+
+ /* We can get rid of the cookie now */
+ fscache_cookie_put(object->cookie);
+ object->cookie = NULL;
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
/*
* enqueue an object for metadata-type processing
*/
@@ -803,7 +784,7 @@ void fscache_enqueue_object(struct fscache_object *object)
/**
* fscache_object_sleep_till_congested - Sleep until object wq is congested
- * @timoutp: Scheduler sleep timeout
+ * @timeoutp: Scheduler sleep timeout
*
* Allow an object handler to sleep until the object workqueue is congested.
*
@@ -831,18 +812,21 @@ bool fscache_object_sleep_till_congested(signed long *timeoutp)
EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
/*
- * enqueue the dependents of an object for metadata-type processing
- * - the caller must hold the object's lock
- * - this may cause an already locked object to wind up being processed again
+ * Enqueue the dependents of an object for metadata-type processing.
+ *
+ * If we don't manage to finish the list before the scheduler wants to run
+ * again then return false immediately. We return true if the list was
+ * cleared.
*/
-static void fscache_enqueue_dependents(struct fscache_object *object)
+static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
{
struct fscache_object *dep;
+ bool ret = true;
_enter("{OBJ%x}", object->debug_id);
if (list_empty(&object->dependents))
- return;
+ return true;
spin_lock(&object->lock);
@@ -851,23 +835,23 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
struct fscache_object, dep_link);
list_del_init(&dep->dep_link);
-
- /* sort onto appropriate lists */
- fscache_enqueue_object(dep);
+ fscache_raise_event(dep, event);
fscache_put_object(dep);
- if (!list_empty(&object->dependents))
- cond_resched_lock(&object->lock);
+ if (!list_empty(&object->dependents) && need_resched()) {
+ ret = false;
+ break;
+ }
}
spin_unlock(&object->lock);
+ return ret;
}
/*
* remove an object from whatever queue it's waiting on
- * - the caller must hold object->lock
*/
-void fscache_dequeue_object(struct fscache_object *object)
+static void fscache_dequeue_object(struct fscache_object *object)
{
_enter("{OBJ%x}", object->debug_id);
@@ -886,7 +870,10 @@ void fscache_dequeue_object(struct fscache_object *object)
* @data: The auxiliary data for the object
* @datalen: The size of the auxiliary data
*
- * This function consults the netfs about the coherency state of an object
+ * This function consults the netfs about the coherency state of an object.
+ * The caller must be holding a ref on cookie->n_active (held by
+ * fscache_look_up_object() on behalf of the cache backend during object lookup
+ * and creation).
*/
enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
const void *data, uint16_t datalen)
@@ -927,12 +914,23 @@ EXPORT_SYMBOL(fscache_check_aux);
/*
* Asynchronously invalidate an object.
*/
-static void fscache_invalidate_object(struct fscache_object *object)
+static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object,
+ int event)
{
struct fscache_operation *op;
struct fscache_cookie *cookie = object->cookie;
- _enter("{OBJ%x}", object->debug_id);
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ /* We're going to need the cookie. If the cookie is not available then
+ * retire the object instead.
+ */
+ if (!fscache_use_cookie(object)) {
+ ASSERT(object->cookie->stores.rnode == NULL);
+ set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+ _leave(" [no cookie]");
+ return transit_to(KILL_OBJECT);
+ }
/* Reject any new read/write ops and abort any that are pending. */
fscache_invalidate_writes(cookie);
@@ -941,14 +939,13 @@ static void fscache_invalidate_object(struct fscache_object *object)
/* Now we have to wait for in-progress reads and writes */
op = kzalloc(sizeof(*op), GFP_KERNEL);
- if (!op) {
- fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
- _leave(" [ENOMEM]");
- return;
- }
+ if (!op)
+ goto nomem;
fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
- op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+ op->flags = FSCACHE_OP_ASYNC |
+ (1 << FSCACHE_OP_EXCLUSIVE) |
+ (1 << FSCACHE_OP_UNUSE_COOKIE);
spin_lock(&cookie->lock);
if (fscache_submit_exclusive_op(object, op) < 0)
@@ -965,13 +962,50 @@ static void fscache_invalidate_object(struct fscache_object *object)
/* We can allow read and write requests to come in once again. They'll
* queue up behind our exclusive invalidation operation.
*/
- fscache_invalidation_complete(cookie);
- _leave("");
- return;
+ if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+ _leave(" [ok]");
+ return transit_to(UPDATE_OBJECT);
+
+nomem:
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ fscache_unuse_cookie(object);
+ _leave(" [ENOMEM]");
+ return transit_to(KILL_OBJECT);
submit_op_failed:
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
spin_unlock(&cookie->lock);
kfree(op);
- fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
_leave(" [EIO]");
+ return transit_to(KILL_OBJECT);
+}
+
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object,
+ int event)
+{
+ const struct fscache_state *s;
+
+ fscache_stat(&fscache_n_invalidates_run);
+ fscache_stat(&fscache_n_cop_invalidate_object);
+ s = _fscache_invalidate_object(object, event);
+ fscache_stat_d(&fscache_n_cop_invalidate_object);
+ return s;
+}
+
+/*
+ * Asynchronously update an object.
+ */
+static const struct fscache_state *fscache_update_object(struct fscache_object *object,
+ int event)
+{
+ _enter("{OBJ%x},%d", object->debug_id, event);
+
+ fscache_stat(&fscache_n_updates_run);
+ fscache_stat(&fscache_n_cop_update_object);
+ object->cache->ops->update_object(object);
+ fscache_stat_d(&fscache_n_cop_update_object);
+
+ _leave("");
+ return transit_to(WAIT_FOR_CMD);
}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 762a9ec4ffa4..318071aca217 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -35,7 +35,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
ASSERT(list_empty(&op->pend_link));
ASSERT(op->processor != NULL);
- ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
+ ASSERT(fscache_object_is_available(op->object));
ASSERTCMP(atomic_read(&op->usage), >, 0);
ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
@@ -119,7 +119,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
/* need to issue a new write op after this */
clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_CREATING) {
+ } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
op->object = object;
object->n_ops++;
object->n_exclusive++; /* reads and writes must wait */
@@ -144,7 +144,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
*/
static void fscache_report_unexpected_submission(struct fscache_object *object,
struct fscache_operation *op,
- unsigned long ostate)
+ const struct fscache_state *ostate)
{
static bool once_only;
struct fscache_operation *p;
@@ -155,11 +155,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
once_only = true;
kdebug("unexpected submission OP%x [OBJ%x %s]",
- op->debug_id, object->debug_id,
- fscache_object_states[object->state]);
- kdebug("objstate=%s [%s]",
- fscache_object_states[object->state],
- fscache_object_states[ostate]);
+ op->debug_id, object->debug_id, object->state->name);
+ kdebug("objstate=%s [%s]", object->state->name, ostate->name);
kdebug("objflags=%lx", object->flags);
kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
kdebug("ops=%u inp=%u exc=%u",
@@ -190,7 +187,7 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
int fscache_submit_op(struct fscache_object *object,
struct fscache_operation *op)
{
- unsigned long ostate;
+ const struct fscache_state *ostate;
int ret;
_enter("{OBJ%x OP%x},{%u}",
@@ -226,16 +223,14 @@ int fscache_submit_op(struct fscache_object *object,
fscache_run_op(object, op);
}
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_CREATING) {
+ } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
op->object = object;
object->n_ops++;
atomic_inc(&op->usage);
list_add_tail(&op->pend_link, &object->pending_ops);
fscache_stat(&fscache_n_op_pend);
ret = 0;
- } else if (object->state == FSCACHE_OBJECT_DYING ||
- object->state == FSCACHE_OBJECT_LC_DYING ||
- object->state == FSCACHE_OBJECT_WITHDRAWING) {
+ } else if (fscache_object_is_dying(object)) {
fscache_stat(&fscache_n_op_rejected);
op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
@@ -265,8 +260,8 @@ void fscache_abort_object(struct fscache_object *object)
}
/*
- * jump start the operation processing on an object
- * - caller must hold object->lock
+ * Jump start the operation processing on an object. The caller must hold
+ * object->lock.
*/
void fscache_start_operations(struct fscache_object *object)
{
@@ -428,14 +423,10 @@ void fscache_put_operation(struct fscache_operation *op)
object = op->object;
- if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) {
- if (atomic_dec_and_test(&object->n_reads)) {
- clear_bit(FSCACHE_COOKIE_WAITING_ON_READS,
- &object->cookie->flags);
- wake_up_bit(&object->cookie->flags,
- FSCACHE_COOKIE_WAITING_ON_READS);
- }
- }
+ if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+ atomic_dec(&object->n_reads);
+ if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
+ fscache_unuse_cookie(object);
/* now... we may get called with the object spinlock held, so we
* complete the cleanup here only if we can immediately acquire the
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index ff000e52072d..d479ab3c63e4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -109,7 +109,7 @@ page_busy:
* allocator as the work threads writing to the cache may all end up
* sleeping on memory allocation, so we may need to impose a timeout
* too. */
- if (!(gfp & __GFP_WAIT)) {
+ if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
fscache_stat(&fscache_n_store_vmscan_busy);
return false;
}
@@ -163,10 +163,12 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
fscache_stat(&fscache_n_attr_changed_calls);
- if (fscache_object_is_active(object)) {
+ if (fscache_object_is_active(object) &&
+ fscache_use_cookie(object)) {
fscache_stat(&fscache_n_cop_attr_changed);
ret = object->cache->ops->attr_changed(object);
fscache_stat_d(&fscache_n_cop_attr_changed);
+ fscache_unuse_cookie(object);
if (ret < 0)
fscache_abort_object(object);
}
@@ -233,7 +235,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
_enter("{OP%x}", op->op.debug_id);
- ASSERTCMP(op->n_pages, ==, 0);
+ ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
fscache_hist(fscache_retrieval_histogram, op->start_time);
if (op->context)
@@ -246,6 +248,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
* allocate a retrieval op
*/
static struct fscache_retrieval *fscache_alloc_retrieval(
+ struct fscache_cookie *cookie,
struct address_space *mapping,
fscache_rw_complete_t end_io_func,
void *context)
@@ -260,7 +263,10 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
}
fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
- op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
+ atomic_inc(&cookie->n_active);
+ op->op.flags = FSCACHE_OP_MYTHREAD |
+ (1UL << FSCACHE_OP_WAITING) |
+ (1UL << FSCACHE_OP_UNUSE_COOKIE);
op->mapping = mapping;
op->end_io_func = end_io_func;
op->context = context;
@@ -310,7 +316,7 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
struct fscache_retrieval *op =
container_of(_op, struct fscache_retrieval, op);
- op->n_pages = 0;
+ atomic_set(&op->n_pages, 0);
}
/*
@@ -394,12 +400,13 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
+ op = fscache_alloc_retrieval(cookie, page->mapping,
+ end_io_func,context);
if (!op) {
_leave(" = -ENOMEM");
return -ENOMEM;
}
- op->n_pages = 1;
+ atomic_set(&op->n_pages, 1);
spin_lock(&cookie->lock);
@@ -408,7 +415,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
- ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
+ ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
atomic_inc(&object->n_reads);
__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
@@ -465,6 +472,7 @@ nobufs_unlock_dec:
atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_retrievals_nobufs);
@@ -522,10 +530,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(mapping, end_io_func, context);
+ op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context);
if (!op)
return -ENOMEM;
- op->n_pages = *nr_pages;
+ atomic_set(&op->n_pages, *nr_pages);
spin_lock(&cookie->lock);
@@ -589,6 +597,7 @@ nobufs_unlock_dec:
atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_retrievals_nobufs);
@@ -631,10 +640,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
- op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
+ op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL);
if (!op)
return -ENOMEM;
- op->n_pages = 1;
+ atomic_set(&op->n_pages, 1);
spin_lock(&cookie->lock);
@@ -675,6 +684,7 @@ error:
nobufs_unlock:
spin_unlock(&cookie->lock);
+ atomic_dec(&cookie->n_active);
kfree(op);
nobufs:
fscache_stat(&fscache_n_allocs_nobufs);
@@ -729,8 +739,9 @@ static void fscache_write_op(struct fscache_operation *_op)
*/
spin_unlock(&object->lock);
fscache_op_complete(&op->op, false);
- _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}",
- _op->flags, _op->state, object->state, object->flags);
+ _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
+ _op->flags, _op->state, object->state->short_name,
+ object->flags);
return;
}
@@ -796,11 +807,16 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
_enter("");
- while (spin_lock(&cookie->stores_lock),
- n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
- ARRAY_SIZE(results),
- FSCACHE_COOKIE_PENDING_TAG),
- n > 0) {
+ for (;;) {
+ spin_lock(&cookie->stores_lock);
+ n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
+ ARRAY_SIZE(results),
+ FSCACHE_COOKIE_PENDING_TAG);
+ if (n == 0) {
+ spin_unlock(&cookie->stores_lock);
+ break;
+ }
+
for (i = n - 1; i >= 0; i--) {
page = results[i];
radix_tree_delete(&cookie->stores, page->index);
@@ -812,7 +828,6 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
page_cache_release(results[i]);
}
- spin_unlock(&cookie->stores_lock);
_leave("");
}
@@ -829,14 +844,12 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
* (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
* set)
*
- * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
- * fill op)
+ * (a) no writes yet
*
* (b) writes deferred till post-creation (mark page for writing and
* return immediately)
*
* (2) negative lookup, object created, initial fill being made from netfs
- * (FSCACHE_COOKIE_INITIAL_FILL is set)
*
* (a) fill point not yet reached this page (mark page for writing and
* return)
@@ -873,7 +886,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
fscache_operation_init(&op->op, fscache_write_op,
fscache_release_write_op);
- op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
+ op->op.flags = FSCACHE_OP_ASYNC |
+ (1 << FSCACHE_OP_WAITING) |
+ (1 << FSCACHE_OP_UNUSE_COOKIE);
ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
if (ret < 0)
@@ -919,6 +934,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
op->store_limit = object->store_limit;
+ atomic_inc(&cookie->n_active);
if (fscache_submit_op(object, &op->op) < 0)
goto submit_failed;
@@ -945,6 +961,7 @@ already_pending:
return 0;
submit_failed:
+ atomic_dec(&cookie->n_active);
spin_lock(&cookie->stores_lock);
radix_tree_delete(&cookie->stores, page->index);
spin_unlock(&cookie->stores_lock);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index bfb20a8642c3..5c121fe19c5f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2469,13 +2469,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
.mode = mode
};
int err;
+ bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
+ (mode & FALLOC_FL_PUNCH_HOLE);
if (fc->no_fallocate)
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (lock_inode) {
mutex_lock(&inode->i_mutex);
- fuse_set_nowrite(inode);
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ fuse_set_nowrite(inode);
}
req = fuse_get_req_nopages(fc);
@@ -2510,8 +2513,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
fuse_invalidate_attr(inode);
out:
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- fuse_release_nowrite(inode);
+ if (lock_inode) {
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ fuse_release_nowrite(inode);
mutex_unlock(&inode->i_mutex);
}
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 5a376ab81feb..90c6a8faaecb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -20,13 +20,12 @@ config GFS2_FS
be found here: http://sources.redhat.com/cluster
The "nolock" lock module is now built in to GFS2 by default. If
- you want to use the DLM, be sure to enable HOTPLUG and IPv4/6
- networking.
+ you want to use the DLM, be sure to enable IPv4/6 networking.
config GFS2_FS_LOCKING_DLM
bool "GFS2 DLM locking"
depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \
- HOTPLUG && CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS)
+ CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS)
help
Multiple node locking module for GFS2
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0bad69ed6336..ee48ad37d9c0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -110,7 +110,7 @@ static int gfs2_writepage_common(struct page *page,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_CACHE_SIZE-1);
if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
goto out;
}
return 1;
@@ -299,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
/* Is the page fully outside i_size? (truncate in progress) */
if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0);
+ page->mapping->a_ops->invalidatepage(page, 0,
+ PAGE_CACHE_SIZE);
unlock_page(page);
continue;
}
@@ -943,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
unlock_buffer(bh);
}
-static void gfs2_invalidatepage(struct page *page, unsigned long offset)
+static void gfs2_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+ unsigned int stop = offset + length;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
struct buffer_head *bh, *head;
unsigned long pos = 0;
BUG_ON(!PageLocked(page));
- if (offset == 0)
+ if (!partial_page)
ClearPageChecked(page);
if (!page_has_buffers(page))
goto out;
bh = head = page_buffers(page);
do {
+ if (pos + bh->b_size > stop)
+ return;
+
if (offset <= pos)
gfs2_discard(sdp, bh);
pos += bh->b_size;
bh = bh->b_this_page;
} while (bh != head);
out:
- if (offset == 0)
+ if (!partial_page)
try_to_release_page(page, 0);
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 93b5809c20bb..5e2f56fccf6b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1232,7 +1232,9 @@ static int do_grow(struct inode *inode, u64 size)
unstuff = 1;
}
- error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
+ error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
+ (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
+ 0 : RES_QUOTA), 0);
if (error)
goto do_grow_release;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index e0449c10286a..0cb4c1557f20 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1125,13 +1125,14 @@ static int dir_double_exhash(struct gfs2_inode *dip)
if (IS_ERR(hc))
return PTR_ERR(hc);
- h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
+ hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
if (hc2 == NULL)
hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL);
if (!hc2)
return -ENOMEM;
+ h = hc2;
error = gfs2_meta_inode_buffer(dip, &dibh);
if (error)
goto out_kfree;
@@ -1547,9 +1548,9 @@ out:
/**
* gfs2_dir_search - Search a directory
- * @dip: The GFS2 inode
- * @filename:
- * @inode:
+ * @dip: The GFS2 dir inode
+ * @name: The name we are looking up
+ * @fail_on_exist: Fail if the name exists rather than looking it up
*
* This routine searches a directory for a file or another directory.
* Assumes a glock is held on dip.
@@ -1557,22 +1558,25 @@ out:
* Returns: errno
*/
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
+struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
+ bool fail_on_exist)
{
struct buffer_head *bh;
struct gfs2_dirent *dent;
- struct inode *inode;
+ u64 addr, formal_ino;
+ u16 dtype;
dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
if (dent) {
if (IS_ERR(dent))
return ERR_CAST(dent);
- inode = gfs2_inode_lookup(dir->i_sb,
- be16_to_cpu(dent->de_type),
- be64_to_cpu(dent->de_inum.no_addr),
- be64_to_cpu(dent->de_inum.no_formal_ino), 0);
+ dtype = be16_to_cpu(dent->de_type);
+ addr = be64_to_cpu(dent->de_inum.no_addr);
+ formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
brelse(bh);
- return inode;
+ if (fail_on_exist)
+ return ERR_PTR(-EEXIST);
+ return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
}
return ERR_PTR(-ENOENT);
}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index ba9000bc1397..4f03bbd1873f 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -18,7 +18,8 @@ struct gfs2_inode;
struct gfs2_inum;
extern struct inode *gfs2_dir_search(struct inode *dir,
- const struct qstr *filename);
+ const struct qstr *filename,
+ bool fail_on_exist);
extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index cebfd404c1d9..72c3866a7320 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -531,21 +531,30 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
}
/**
- * gfs2_open - open a file
- * @inode: the inode to open
- * @file: the struct file for this opening
+ * gfs2_open_common - This is common to open and atomic_open
+ * @inode: The inode being opened
+ * @file: The file being opened
*
- * Returns: errno
+ * This maybe called under a glock or not depending upon how it has
+ * been called. We must always be called under a glock for regular
+ * files, however. For other file types, it does not matter whether
+ * we hold the glock or not.
+ *
+ * Returns: Error code or 0 for success
*/
-static int gfs2_open(struct inode *inode, struct file *file)
+int gfs2_open_common(struct inode *inode, struct file *file)
{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder i_gh;
struct gfs2_file *fp;
- int error;
+ int ret;
- fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
+ if (S_ISREG(inode->i_mode)) {
+ ret = generic_file_open(inode, file);
+ if (ret)
+ return ret;
+ }
+
+ fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS);
if (!fp)
return -ENOMEM;
@@ -553,29 +562,43 @@ static int gfs2_open(struct inode *inode, struct file *file)
gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
file->private_data = fp;
+ return 0;
+}
+
+/**
+ * gfs2_open - open a file
+ * @inode: the inode to open
+ * @file: the struct file for this opening
+ *
+ * After atomic_open, this function is only used for opening files
+ * which are already cached. We must still get the glock for regular
+ * files to ensure that we have the file size uptodate for the large
+ * file check which is in the common code. That is only an issue for
+ * regular files though.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_open(struct inode *inode, struct file *file)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder i_gh;
+ int error;
+ bool need_unlock = false;
if (S_ISREG(ip->i_inode.i_mode)) {
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (error)
- goto fail;
+ return error;
+ need_unlock = true;
+ }
- if (!(file->f_flags & O_LARGEFILE) &&
- i_size_read(inode) > MAX_NON_LFS) {
- error = -EOVERFLOW;
- goto fail_gunlock;
- }
+ error = gfs2_open_common(inode, file);
+ if (need_unlock)
gfs2_glock_dq_uninit(&i_gh);
- }
-
- return 0;
-fail_gunlock:
- gfs2_glock_dq_uninit(&i_gh);
-fail:
- file->private_data = NULL;
- kfree(fp);
return error;
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c66e99c97571..5f2e5224c51c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -54,7 +54,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
struct gfs2_bufdata *bd, *tmp;
struct buffer_head *bh;
const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
- sector_t blocknr;
gfs2_log_lock(sdp);
spin_lock(&sdp->sd_ail_lock);
@@ -65,13 +64,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
continue;
gfs2_ail_error(gl, bh);
}
- blocknr = bh->b_blocknr;
- bh->b_private = NULL;
- gfs2_remove_from_ail(bd); /* drops ref on bh */
-
- bd->bd_bh = NULL;
- bd->bd_blkno = blocknr;
-
gfs2_trans_add_revoke(sdp, bd);
}
GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 62b484e4a9e4..bbb2715171cd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -313,7 +313,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
goto out;
}
- inode = gfs2_dir_search(dir, name);
+ inode = gfs2_dir_search(dir, name, false);
if (IS_ERR(inode))
error = PTR_ERR(inode);
out:
@@ -346,17 +346,6 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
if (!dip->i_inode.i_nlink)
return -ENOENT;
- error = gfs2_dir_check(&dip->i_inode, name, NULL);
- switch (error) {
- case -ENOENT:
- error = 0;
- break;
- case 0:
- return -EEXIST;
- default:
- return error;
- }
-
if (dip->i_entries == (u32)-1)
return -EFBIG;
if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
@@ -546,6 +535,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
* gfs2_create_inode - Create a new inode
* @dir: The parent directory
* @dentry: The new dentry
+ * @file: If non-NULL, the file which is being opened
* @mode: The permissions on the new inode
* @dev: For device nodes, this is the device number
* @symname: For symlinks, this is the link destination
@@ -555,8 +545,9 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
*/
static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
+ struct file *file,
umode_t mode, dev_t dev, const char *symname,
- unsigned int size, int excl)
+ unsigned int size, int excl, int *opened)
{
const struct qstr *name = &dentry->d_name;
struct gfs2_holder ghs[2];
@@ -564,6 +555,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
+ struct dentry *d;
int error;
u32 aflags = 0;
int arq;
@@ -584,15 +576,30 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail;
error = create_ok(dip, name, mode);
- if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
- inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- gfs2_glock_dq_uninit(ghs);
- d_instantiate(dentry, inode);
- return IS_ERR(inode) ? PTR_ERR(inode) : 0;
- }
if (error)
goto fail_gunlock;
+ inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
+ error = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ d = d_splice_alias(inode, dentry);
+ error = 0;
+ if (file && !IS_ERR(d)) {
+ if (d == NULL)
+ d = dentry;
+ if (S_ISREG(inode->i_mode))
+ error = finish_open(file, d, gfs2_open_common, opened);
+ else
+ error = finish_no_open(file, d);
+ }
+ gfs2_glock_dq_uninit(ghs);
+ if (IS_ERR(d))
+ return PTR_RET(d);
+ return error;
+ } else if (error != -ENOENT) {
+ goto fail_gunlock;
+ }
+
arq = error = gfs2_diradd_alloc_required(dir, name);
if (error < 0)
goto fail_gunlock;
@@ -686,10 +693,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail_gunlock3;
mark_inode_dirty(inode);
+ d_instantiate(dentry, inode);
+ if (file)
+ error = finish_open(file, dentry, gfs2_open_common, opened);
gfs2_glock_dq_uninit(ghs);
gfs2_glock_dq_uninit(ghs + 1);
- d_instantiate(dentry, inode);
- return 0;
+ return error;
fail_gunlock3:
gfs2_glock_dq_uninit(ghs + 1);
@@ -729,36 +738,56 @@ fail:
static int gfs2_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
- return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL);
}
/**
- * gfs2_lookup - Look up a filename in a directory and return its inode
+ * __gfs2_lookup - Look up a filename in a directory and return its inode
* @dir: The directory inode
* @dentry: The dentry of the new inode
- * @nd: passed from Linux VFS, ignored by us
+ * @file: File to be opened
+ * @opened: atomic_open flags
*
- * Called by the VFS layer. Lock dir and call gfs2_lookupi()
*
* Returns: errno
*/
-static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
+static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
+ struct file *file, int *opened)
{
- struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- if (inode && !IS_ERR(inode)) {
- struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
- struct gfs2_holder gh;
- int error;
- error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
- if (error) {
- iput(inode);
- return ERR_PTR(error);
- }
- gfs2_glock_dq_uninit(&gh);
+ struct inode *inode;
+ struct dentry *d;
+ struct gfs2_holder gh;
+ struct gfs2_glock *gl;
+ int error;
+
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+ if (!inode)
+ return NULL;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ gl = GFS2_I(inode)->i_gl;
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ if (error) {
+ iput(inode);
+ return ERR_PTR(error);
}
- return d_splice_alias(inode, dentry);
+
+ d = d_splice_alias(inode, dentry);
+ if (file && S_ISREG(inode->i_mode))
+ error = finish_open(file, dentry, gfs2_open_common, opened);
+
+ gfs2_glock_dq_uninit(&gh);
+ if (error)
+ return ERR_PTR(error);
+ return d;
+}
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned flags)
+{
+ return __gfs2_lookup(dir, dentry, NULL, NULL);
}
/**
@@ -1076,7 +1105,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
return -ENAMETOOLONG;
- return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL);
}
/**
@@ -1092,7 +1121,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct gfs2_sbd *sdp = GFS2_SB(dir);
unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
- return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, dsize, 0);
+ return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL);
}
/**
@@ -1107,7 +1136,43 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t dev)
{
- return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
+ return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL);
+}
+
+/**
+ * gfs2_atomic_open - Atomically open a file
+ * @dir: The directory
+ * @dentry: The proposed new entry
+ * @file: The proposed new struct file
+ * @flags: open flags
+ * @mode: File mode
+ * @opened: Flag to say whether the file has been opened or not
+ *
+ * Returns: error code or 0 for success
+ */
+
+static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned flags,
+ umode_t mode, int *opened)
+{
+ struct dentry *d;
+ bool excl = !!(flags & O_EXCL);
+
+ d = __gfs2_lookup(dir, dentry, file, opened);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ if (d == NULL)
+ d = dentry;
+ if (d->d_inode) {
+ if (!(*opened & FILE_OPENED))
+ return finish_no_open(file, d);
+ return 0;
+ }
+
+ if (!(flags & O_CREAT))
+ return -ENOENT;
+
+ return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened);
}
/*
@@ -1787,6 +1852,7 @@ const struct inode_operations gfs2_dir_iops = {
.removexattr = gfs2_removexattr,
.fiemap = gfs2_fiemap,
.get_acl = gfs2_get_acl,
+ .atomic_open = gfs2_atomic_open,
};
const struct inode_operations gfs2_symlink_iops = {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c53c7477f6da..ba4d9492d422 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,6 +109,7 @@ extern int gfs2_permission(struct inode *inode, int mask);
extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+extern int gfs2_open_common(struct inode *inode, struct file *file);
extern const struct inode_operations gfs2_file_iops;
extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b404f4853034..610613fb65b5 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -211,15 +211,16 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr, *s;
+ int oldest_tr = 1;
int ret;
spin_lock(&sdp->sd_ail_lock);
list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
gfs2_ail1_empty_one(sdp, tr);
- if (list_empty(&tr->tr_ail1_list))
+ if (list_empty(&tr->tr_ail1_list) && oldest_tr)
list_move(&tr->tr_list, &sdp->sd_ail2_list);
else
- break;
+ oldest_tr = 0;
}
ret = list_empty(&sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
@@ -317,7 +318,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
{
- unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
+ unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
unsigned wanted = blks + reserved_blks;
DEFINE_WAIT(wait);
int did_wait = 0;
@@ -545,6 +546,76 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip)
spin_unlock(&sdp->sd_ordered_lock);
}
+void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
+{
+ struct buffer_head *bh = bd->bd_bh;
+ struct gfs2_glock *gl = bd->bd_gl;
+
+ gfs2_remove_from_ail(bd);
+ bd->bd_bh = NULL;
+ bh->b_private = NULL;
+ bd->bd_blkno = bh->b_blocknr;
+ bd->bd_ops = &gfs2_revoke_lops;
+ sdp->sd_log_num_revoke++;
+ atomic_inc(&gl->gl_revokes);
+ set_bit(GLF_LFLUSH, &gl->gl_flags);
+ list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
+}
+
+void gfs2_write_revokes(struct gfs2_sbd *sdp)
+{
+ struct gfs2_trans *tr;
+ struct gfs2_bufdata *bd, *tmp;
+ int have_revokes = 0;
+ int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
+
+ gfs2_ail1_empty(sdp);
+ spin_lock(&sdp->sd_ail_lock);
+ list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
+ if (list_empty(&bd->bd_list)) {
+ have_revokes = 1;
+ goto done;
+ }
+ }
+ }
+done:
+ spin_unlock(&sdp->sd_ail_lock);
+ if (have_revokes == 0)
+ return;
+ while (sdp->sd_log_num_revoke > max_revokes)
+ max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
+ max_revokes -= sdp->sd_log_num_revoke;
+ if (!sdp->sd_log_num_revoke) {
+ atomic_dec(&sdp->sd_log_blks_free);
+ /* If no blocks have been reserved, we need to also
+ * reserve a block for the header */
+ if (!sdp->sd_log_blks_reserved)
+ atomic_dec(&sdp->sd_log_blks_free);
+ }
+ gfs2_log_lock(sdp);
+ spin_lock(&sdp->sd_ail_lock);
+ list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
+ if (max_revokes == 0)
+ goto out_of_blocks;
+ if (!list_empty(&bd->bd_list))
+ continue;
+ gfs2_add_revoke(sdp, bd);
+ max_revokes--;
+ }
+ }
+out_of_blocks:
+ spin_unlock(&sdp->sd_ail_lock);
+ gfs2_log_unlock(sdp);
+
+ if (!sdp->sd_log_num_revoke) {
+ atomic_inc(&sdp->sd_log_blks_free);
+ if (!sdp->sd_log_blks_reserved)
+ atomic_inc(&sdp->sd_log_blks_free);
+ }
+}
+
/**
* log_write_header - Get and initialize a journal header buffer
* @sdp: The GFS2 superblock
@@ -562,7 +633,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
lh = page_address(page);
clear_page(lh);
- gfs2_ail1_empty(sdp);
tail = current_tail(sdp);
lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 3566f35915e0..37216634f0aa 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -72,5 +72,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
extern int gfs2_logd(void *data);
+extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6c33d7b6e0c4..17c5b5d7dc88 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -16,6 +16,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
#include <linux/fs.h>
+#include <linux/list_sort.h>
#include "gfs2.h"
#include "incore.h"
@@ -401,6 +402,20 @@ static void gfs2_check_magic(struct buffer_head *bh)
kunmap_atomic(kaddr);
}
+static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct gfs2_bufdata *bda, *bdb;
+
+ bda = list_entry(a, struct gfs2_bufdata, bd_list);
+ bdb = list_entry(b, struct gfs2_bufdata, bd_list);
+
+ if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+ return -1;
+ if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+ return 1;
+ return 0;
+}
+
static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
unsigned int total, struct list_head *blist,
bool is_databuf)
@@ -413,6 +428,7 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
__be64 *ptr;
gfs2_log_lock(sdp);
+ list_sort(NULL, blist, blocknr_cmp);
bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
while(total) {
num = total;
@@ -590,6 +606,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
struct page *page;
unsigned int length;
+ gfs2_write_revokes(sdp);
if (!sdp->sd_log_num_revoke)
return;
@@ -836,10 +853,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
.lo_name = "revoke",
};
-const struct gfs2_log_operations gfs2_rg_lops = {
- .lo_name = "rg",
-};
-
const struct gfs2_log_operations gfs2_databuf_lops = {
.lo_before_commit = databuf_lo_before_commit,
.lo_after_commit = databuf_lo_after_commit,
@@ -851,7 +864,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
const struct gfs2_log_operations *gfs2_log_ops[] = {
&gfs2_databuf_lops,
&gfs2_buf_lops,
- &gfs2_rg_lops,
&gfs2_revoke_lops,
NULL,
};
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 87e062e05c92..9ca2e6438419 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -23,7 +23,6 @@
extern const struct gfs2_log_operations gfs2_glock_lops;
extern const struct gfs2_log_operations gfs2_buf_lops;
extern const struct gfs2_log_operations gfs2_revoke_lops;
-extern const struct gfs2_log_operations gfs2_rg_lops;
extern const struct gfs2_log_operations gfs2_databuf_lops;
extern const struct gfs2_log_operations *gfs2_log_ops[];
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 1a89afb68472..0da390686c08 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -296,10 +296,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
if (bd) {
spin_lock(&sdp->sd_ail_lock);
if (bd->bd_tr) {
- gfs2_remove_from_ail(bd);
- bh->b_private = NULL;
- bd->bd_bh = NULL;
- bd->bd_blkno = bh->b_blocknr;
gfs2_trans_add_revoke(sdp, bd);
}
spin_unlock(&sdp->sd_ail_lock);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 60ede2a0f43f..0262c190b6f9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -916,16 +916,16 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
goto fail_quotad;
p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
- error = IS_ERR(p);
- if (error) {
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
fs_err(sdp, "can't start logd thread: %d\n", error);
return error;
}
sdp->sd_logd_process = p;
p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
- error = IS_ERR(p);
- if (error) {
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
fs_err(sdp, "can't start quotad thread: %d\n", error);
goto fail;
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c253b13722e8..3768c2f40e43 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1154,11 +1154,6 @@ int gfs2_quota_sync(struct super_block *sb, int type)
return error;
}
-static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
-{
- return gfs2_quota_sync(sb, type);
-}
-
int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
{
struct gfs2_quota_data *qd;
@@ -1414,7 +1409,7 @@ int gfs2_quotad(void *data)
&tune->gt_statfs_quantum);
/* Update quota file */
- quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
+ quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
&quotad_timeo, &tune->gt_quota_quantum);
/* Check for & recover partially truncated inodes */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9809156e3d04..69317435faa7 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1288,13 +1288,15 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
minlen = max_t(u64, r.minlen,
q->limits.discard_granularity) >> bs_shift;
+ if (end <= start || minlen > sdp->sd_max_rg_data)
+ return -EINVAL;
+
rgd = gfs2_blk2rgrpd(sdp, start, 0);
- rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
+ rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
- if (end <= start ||
- minlen > sdp->sd_max_rg_data ||
- start > rgd_end->rd_data0 + rgd_end->rd_data)
- return -EINVAL;
+ if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
+ && (start > rgd_end->rd_data0 + rgd_end->rd_data))
+ return -EINVAL; /* start is beyond the end of the fs */
while (1) {
@@ -1336,7 +1338,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
}
out:
- r.len = trimmed << 9;
+ r.len = trimmed << bs_shift;
if (copy_to_user(argp, &r, sizeof(r)))
return -EFAULT;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 7374907742a8..2b20d7046bf3 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -270,19 +270,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
- struct gfs2_glock *gl = bd->bd_gl;
struct gfs2_trans *tr = current->journal_info;
BUG_ON(!list_empty(&bd->bd_list));
- BUG_ON(!list_empty(&bd->bd_ail_st_list));
- BUG_ON(!list_empty(&bd->bd_ail_gl_list));
- bd->bd_ops = &gfs2_revoke_lops;
+ gfs2_add_revoke(sdp, bd);
tr->tr_touched = 1;
tr->tr_num_revoke++;
- sdp->sd_log_num_revoke++;
- atomic_inc(&gl->gl_revokes);
- set_bit(GLF_LFLUSH, &gl->gl_flags);
- list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
}
void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e3e255c0a509..be0c39b66fe0 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -2019,16 +2019,20 @@ zap_buffer_unlocked:
* void journal_invalidatepage() - invalidate a journal page
* @journal: journal to use for flush
* @page: page to flush
- * @offset: length of page to invalidate.
+ * @offset: offset of the range to invalidate
+ * @length: length of the range to invalidate
*
- * Reap page buffers containing data after offset in page.
+ * Reap page buffers containing data in specified range in page.
*/
void journal_invalidatepage(journal_t *journal,
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
+ unsigned int stop = offset + length;
unsigned int curr_off = 0;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int may_free = 1;
if (!PageLocked(page))
@@ -2036,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal,
if (!page_has_buffers(page))
return;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
@@ -2045,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal,
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ return;
+
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
may_free &= journal_unmap_buffer(journal, bh,
- offset > 0);
+ partial_page);
unlock_buffer(bh);
}
curr_off = next_off;
@@ -2057,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal,
} while (bh != head);
- if (!offset) {
+ if (!partial_page) {
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 69a48c2944da..5a9f5534d57b 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -20,7 +20,7 @@ config JBD2
config JBD2_DEBUG
bool "JBD2 (ext4) debugging support"
- depends on JBD2 && DEBUG_FS
+ depends on JBD2
help
If you are using the ext4 journaled file system (or
potentially any other filesystem/device using JBD2), this option
@@ -29,7 +29,7 @@ config JBD2_DEBUG
By default, the debugging output will be turned off.
If you select Y here, then you will be able to turn on debugging
- with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+ with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
number between 1 and 5. The higher the number, the more debugging
output is generated. To turn debugging off again, do
- "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+ "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c78841ee81cf..7f34f4716165 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
- nblocks = jbd_space_needed(journal);
- while (__jbd2_log_space_left(journal) < nblocks) {
+ nblocks = jbd2_space_needed(journal);
+ while (jbd2_log_space_left(journal) < nblocks) {
if (journal->j_flags & JBD2_ABORT)
return;
write_unlock(&journal->j_state_lock);
@@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
*/
write_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
- nblocks = jbd_space_needed(journal);
- space_left = __jbd2_log_space_left(journal);
+ nblocks = jbd2_space_needed(journal);
+ space_left = jbd2_log_space_left(journal);
if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0;
@@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal)
/* We were able to recover space; yay! */
;
} else if (tid) {
+ /*
+ * jbd2_journal_commit_transaction() may want
+ * to take the checkpoint_mutex if JBD2_FLUSHED
+ * is set. So we need to temporarily drop it.
+ */
+ mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
+ write_lock(&journal->j_state_lock);
+ continue;
} else {
printk(KERN_ERR "%s: needed %d blocks and "
"only had %d space available\n",
@@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
__jbd2_journal_drop_transaction(journal, transaction);
jbd2_journal_free_transaction(transaction);
-
- /* Just in case anybody was waiting for more transactions to be
- checkpointed... */
- wake_up(&journal->j_wait_logspace);
ret = 1;
out:
return ret;
@@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
J_ASSERT(transaction->t_forget == NULL);
- J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
- J_ASSERT(transaction->t_log_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(atomic_read(&transaction->t_updates) == 0);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0f53946f13c1..559bec1a37b4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -30,15 +30,22 @@
#include <trace/events/jbd2.h>
/*
- * Default IO end handler for temporary BJ_IO buffer_heads.
+ * IO end handler for temporary buffer_heads handling writes to the journal.
*/
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
+ struct buffer_head *orig_bh = bh->b_private;
+
BUFFER_TRACE(bh, "");
if (uptodate)
set_buffer_uptodate(bh);
else
clear_buffer_uptodate(bh);
+ if (orig_bh) {
+ clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&orig_bh->b_state, BH_Shadow);
+ }
unlock_buffer(bh);
}
@@ -85,8 +92,7 @@ nope:
__brelse(bh);
}
-static void jbd2_commit_block_csum_set(journal_t *j,
- struct journal_head *descriptor)
+static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
struct commit_header *h;
__u32 csum;
@@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- h = (struct commit_header *)(jh2bh(descriptor)->b_data);
+ h = (struct commit_header *)(bh->b_data);
h->h_chksum_type = 0;
h->h_chksum_size = 0;
h->h_chksum[0] = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
h->h_chksum[0] = cpu_to_be32(csum);
}
@@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head **cbh,
__u32 crc32_sum)
{
- struct journal_head *descriptor;
struct commit_header *tmp;
struct buffer_head *bh;
int ret;
@@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal,
if (is_journal_aborted(journal))
return 0;
- descriptor = jbd2_journal_get_descriptor_buffer(journal);
- if (!descriptor)
+ bh = jbd2_journal_get_descriptor_buffer(journal);
+ if (!bh)
return 1;
- bh = jh2bh(descriptor);
-
tmp = (struct commit_header *)bh->b_data;
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
@@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
}
- jbd2_commit_block_csum_set(journal, descriptor);
+ jbd2_commit_block_csum_set(journal, bh);
- JBUFFER_TRACE(descriptor, "submit commit block");
+ BUFFER_TRACE(bh, "submit commit block");
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
@@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal,
if (unlikely(!buffer_uptodate(bh)))
ret = -EIO;
put_bh(bh); /* One for getblk() */
- jbd2_journal_put_journal_head(bh2jh(bh));
return ret;
}
@@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
}
static void jbd2_descr_block_csum_set(journal_t *j,
- struct journal_head *descriptor)
+ struct buffer_head *bh)
{
struct jbd2_journal_block_tail *tail;
__u32 csum;
@@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- tail = (struct jbd2_journal_block_tail *)
- (jh2bh(descriptor)->b_data + j->j_blocksize -
+ tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_block_tail));
tail->t_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->t_checksum = cpu_to_be32(csum);
}
@@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
{
struct page *page = bh->b_page;
__u8 *addr;
- __u32 csum;
+ __u32 csum32;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
sequence = cpu_to_be32(sequence);
addr = kmap_atomic(page);
- csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
- sizeof(sequence));
- csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
- bh->b_size);
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+ sizeof(sequence));
+ csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
+ bh->b_size);
kunmap_atomic(addr);
- tag->t_checksum = cpu_to_be32(csum);
+ /* We only have space to store the lower 16 bits of the crc32c. */
+ tag->t_checksum = cpu_to_be16(csum32);
}
/*
* jbd2_journal_commit_transaction
@@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
{
struct transaction_stats_s stats;
transaction_t *commit_transaction;
- struct journal_head *jh, *new_jh, *descriptor;
+ struct journal_head *jh;
+ struct buffer_head *descriptor;
struct buffer_head **wbuf = journal->j_wbuf;
int bufs;
int flags;
@@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tid_t first_tid;
int update_tail;
int csum_size = 0;
+ LIST_HEAD(io_bufs);
+ LIST_HEAD(log_bufs);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
csum_size = sizeof(struct jbd2_journal_block_tail);
@@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(journal->j_committing_transaction == NULL);
commit_transaction = journal->j_running_transaction;
- J_ASSERT(commit_transaction->t_state == T_RUNNING);
trace_jbd2_start_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: starting commit of transaction %d\n",
commit_transaction->t_tid);
write_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_RUNNING);
commit_transaction->t_state = T_LOCKED;
trace_jbd2_commit_locking(journal, commit_transaction);
@@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_switch_revoke_table(journal);
+ /*
+ * Reserved credits cannot be claimed anymore, free them
+ */
+ atomic_sub(atomic_read(&journal->j_reserved_credits),
+ &commit_transaction->t_outstanding_credits);
+
trace_jbd2_commit_flushing(journal, commit_transaction);
stats.run.rs_flushing = jiffies;
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
wake_up(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
- jbd_debug(3, "JBD2: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2a\n");
/*
* Now start flushing things to disk, in the order they appear
@@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
blk_start_plug(&plug);
jbd2_journal_write_revoke_records(journal, commit_transaction,
- WRITE_SYNC);
+ &log_bufs, WRITE_SYNC);
blk_finish_plug(&plug);
- jbd_debug(3, "JBD2: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2b\n");
/*
* Way to go: we have now written out all of the data for a
@@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
atomic_read(&commit_transaction->t_outstanding_credits));
err = 0;
- descriptor = NULL;
bufs = 0;
+ descriptor = NULL;
blk_start_plug(&plug);
while (commit_transaction->t_buffers) {
@@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
record the metadata buffer. */
if (!descriptor) {
- struct buffer_head *bh;
-
J_ASSERT (bufs == 0);
jbd_debug(4, "JBD2: get descriptor\n");
@@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal)
continue;
}
- bh = jh2bh(descriptor);
jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
- (unsigned long long)bh->b_blocknr, bh->b_data);
- header = (journal_header_t *)&bh->b_data[0];
+ (unsigned long long)descriptor->b_blocknr,
+ descriptor->b_data);
+ header = (journal_header_t *)descriptor->b_data;
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
- tagp = &bh->b_data[sizeof(journal_header_t)];
- space_left = bh->b_size - sizeof(journal_header_t);
+ tagp = &descriptor->b_data[sizeof(journal_header_t)];
+ space_left = descriptor->b_size -
+ sizeof(journal_header_t);
first_tag = 1;
- set_buffer_jwrite(bh);
- set_buffer_dirty(bh);
- wbuf[bufs++] = bh;
+ set_buffer_jwrite(descriptor);
+ set_buffer_dirty(descriptor);
+ wbuf[bufs++] = descriptor;
/* Record it so that we can wait for IO
completion later */
- BUFFER_TRACE(bh, "ph3: file as descriptor");
- jbd2_journal_file_buffer(descriptor, commit_transaction,
- BJ_LogCtl);
+ BUFFER_TRACE(descriptor, "ph3: file as descriptor");
+ jbd2_file_log_bh(&log_bufs, descriptor);
}
/* Where is the buffer to be written? */
@@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/* Bump b_count to prevent truncate from stumbling over
the shadowed buffer! @@@ This can go if we ever get
- rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+ rid of the shadow pairing of buffers. */
atomic_inc(&jh2bh(jh)->b_count);
- /* Make a temporary IO buffer with which to write it out
- (this will requeue both the metadata buffer and the
- temporary IO buffer). new_bh goes on BJ_IO*/
-
- set_bit(BH_JWrite, &jh2bh(jh)->b_state);
/*
- * akpm: jbd2_journal_write_metadata_buffer() sets
- * new_bh->b_transaction to commit_transaction.
- * We need to clean this up before we release new_bh
- * (which is of type BJ_IO)
+ * Make a temporary IO buffer with which to write it out
+ * (this will requeue the metadata buffer to BJ_Shadow).
*/
+ set_bit(BH_JWrite, &jh2bh(jh)->b_state);
JBUFFER_TRACE(jh, "ph3: write metadata");
flags = jbd2_journal_write_metadata_buffer(commit_transaction,
- jh, &new_jh, blocknr);
+ jh, &wbuf[bufs], blocknr);
if (flags < 0) {
jbd2_journal_abort(journal, flags);
continue;
}
- set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
- wbuf[bufs++] = jh2bh(new_jh);
+ jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
/* Record the new block's tag in the current descriptor
buffer */
@@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tag = (journal_block_tag_t *) tagp;
write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
tag->t_flags = cpu_to_be16(tag_flag);
- jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
+ jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
commit_transaction->t_tid);
tagp += tag_bytes;
space_left -= tag_bytes;
+ bufs++;
if (first_tag) {
memcpy (tagp, journal->j_uuid, 16);
@@ -809,7 +810,7 @@ start_journal_io:
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
transaction's t_log_list queue, and metadata buffers are on
- the t_iobuf_list queue.
+ the io_bufs list.
Wait for the buffers in reverse order. That way we are
less likely to be woken up until all IOs have completed, and
@@ -818,47 +819,33 @@ start_journal_io:
jbd_debug(3, "JBD2: commit phase 3\n");
- /*
- * akpm: these are BJ_IO, and j_list_lock is not needed.
- * See __journal_try_to_free_buffer.
- */
-wait_for_iobuf:
- while (commit_transaction->t_iobuf_list != NULL) {
- struct buffer_head *bh;
+ while (!list_empty(&io_bufs)) {
+ struct buffer_head *bh = list_entry(io_bufs.prev,
+ struct buffer_head,
+ b_assoc_buffers);
- jh = commit_transaction->t_iobuf_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_iobuf;
- }
- if (cond_resched())
- goto wait_for_iobuf;
+ wait_on_buffer(bh);
+ cond_resched();
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
-
- clear_buffer_jwrite(bh);
-
- JBUFFER_TRACE(jh, "ph4: unfile after journal write");
- jbd2_journal_unfile_buffer(journal, jh);
+ jbd2_unfile_log_bh(bh);
/*
- * ->t_iobuf_list should contain only dummy buffer_heads
- * which were created by jbd2_journal_write_metadata_buffer().
+ * The list contains temporary buffer heads created by
+ * jbd2_journal_write_metadata_buffer().
*/
BUFFER_TRACE(bh, "dumping temporary bh");
- jbd2_journal_put_journal_head(jh);
__brelse(bh);
J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
free_buffer_head(bh);
- /* We also have to unlock and free the corresponding
- shadowed buffer */
+ /* We also have to refile the corresponding shadowed buffer */
jh = commit_transaction->t_shadow_list->b_tprev;
bh = jh2bh(jh);
- clear_bit(BH_JWrite, &bh->b_state);
+ clear_buffer_jwrite(bh);
J_ASSERT_BH(bh, buffer_jbddirty(bh));
+ J_ASSERT_BH(bh, !buffer_shadow(bh));
/* The metadata is now released for reuse, but we need
to remember it against this transaction so that when
@@ -866,14 +853,6 @@ wait_for_iobuf:
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
- /*
- * Wake up any transactions which were waiting for this IO to
- * complete. The barrier must be here so that changes by
- * jbd2_journal_file_buffer() take effect before wake_up_bit()
- * does the waitqueue check.
- */
- smp_mb();
- wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
}
@@ -883,26 +862,19 @@ wait_for_iobuf:
jbd_debug(3, "JBD2: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
- while (commit_transaction->t_log_list != NULL) {
+ while (!list_empty(&log_bufs)) {
struct buffer_head *bh;
- jh = commit_transaction->t_log_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_ctlbuf;
- }
- if (cond_resched())
- goto wait_for_ctlbuf;
+ bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
+ wait_on_buffer(bh);
+ cond_resched();
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
clear_buffer_jwrite(bh);
- jbd2_journal_unfile_buffer(journal, jh);
- jbd2_journal_put_journal_head(jh);
+ jbd2_unfile_log_bh(bh);
__brelse(bh); /* One for getblk */
/* AKPM: bforget here */
}
@@ -952,9 +924,7 @@ wait_for_iobuf:
J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
- J_ASSERT(commit_transaction->t_iobuf_list == NULL);
J_ASSERT(commit_transaction->t_shadow_list == NULL);
- J_ASSERT(commit_transaction->t_log_list == NULL);
restart_loop:
/*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 95457576e434..02c7ad9d7a41 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache);
static void __journal_abort_soft (journal_t *journal, int errno);
static int jbd2_journal_create_slab(size_t slab_size);
+#ifdef CONFIG_JBD2_DEBUG
+void __jbd2_debug(int level, const char *file, const char *func,
+ unsigned int line, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (level > jbd2_journal_enable_debug)
+ return;
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+ va_end(args);
+}
+EXPORT_SYMBOL(__jbd2_debug);
+#endif
+
/* Checksumming functions */
int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
@@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal)
*
* If the source buffer has already been modified by a new transaction
* since we took the last commit snapshot, we use the frozen copy of
- * that data for IO. If we end up using the existing buffer_head's data
- * for the write, then we *have* to lock the buffer to prevent anyone
- * else from using and possibly modifying it while the IO is in
- * progress.
+ * that data for IO. If we end up using the existing buffer_head's data
+ * for the write, then we have to make sure nobody modifies it while the
+ * IO is in progress. do_get_write_access() handles this.
*
- * The function returns a pointer to the buffer_heads to be used for IO.
- *
- * We assume that the journal has already been locked in this function.
+ * The function returns a pointer to the buffer_head to be used for IO.
+ *
*
* Return value:
* <0: Error
@@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal)
int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
- struct journal_head **jh_out,
- unsigned long long blocknr)
+ struct buffer_head **bh_out,
+ sector_t blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh;
- struct journal_head *new_jh;
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
@@ -368,14 +383,13 @@ retry_alloc:
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
- new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
+ jbd_lock_bh_state(bh_in);
+repeat:
/*
* If a new transaction has already done a buffer copy-out, then
* we use that version of the data for the commit.
*/
- jbd_lock_bh_state(bh_in);
-repeat:
if (jh_in->b_frozen_data) {
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
@@ -415,7 +429,7 @@ repeat:
jbd_unlock_bh_state(bh_in);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
if (!tmp) {
- jbd2_journal_put_journal_head(new_jh);
+ brelse(new_bh);
return -ENOMEM;
}
jbd_lock_bh_state(bh_in);
@@ -426,7 +440,7 @@ repeat:
jh_in->b_frozen_data = tmp;
mapped_data = kmap_atomic(new_page);
- memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
+ memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
kunmap_atomic(mapped_data);
new_page = virt_to_page(tmp);
@@ -452,14 +466,14 @@ repeat:
}
set_bh_page(new_bh, new_page, new_offset);
- new_jh->b_transaction = NULL;
- new_bh->b_size = jh2bh(jh_in)->b_size;
- new_bh->b_bdev = transaction->t_journal->j_dev;
+ new_bh->b_size = bh_in->b_size;
+ new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
+ new_bh->b_private = bh_in;
set_buffer_mapped(new_bh);
set_buffer_dirty(new_bh);
- *jh_out = new_jh;
+ *bh_out = new_bh;
/*
* The to-be-written buffer needs to get moved to the io queue,
@@ -470,11 +484,9 @@ repeat:
spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
+ set_buffer_shadow(bh_in);
jbd_unlock_bh_state(bh_in);
- JBUFFER_TRACE(new_jh, "file as BJ_IO");
- jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
-
return do_escape | (done_copy_out << 1);
}
@@ -484,35 +496,6 @@ repeat:
*/
/*
- * __jbd2_log_space_left: Return the number of free blocks left in the journal.
- *
- * Called with the journal already locked.
- *
- * Called under j_state_lock
- */
-
-int __jbd2_log_space_left(journal_t *journal)
-{
- int left = journal->j_free;
-
- /* assert_spin_locked(&journal->j_state_lock); */
-
- /*
- * Be pessimistic here about the number of those free blocks which
- * might be required for log descriptor control blocks.
- */
-
-#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
-
- left -= MIN_LOG_RESERVED_BLOCKS;
-
- if (left <= 0)
- return 0;
- left -= (left >> 3);
- return left;
-}
-
-/*
* Called with j_state_lock locked for writing.
* Returns true if a transaction commit was started.
*/
@@ -564,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
}
/*
- * Force and wait upon a commit if the calling process is not within
- * transaction. This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
- *
- * We can only force the running transaction if we don't have an active handle;
- * otherwise, we will deadlock.
- *
- * Returns true if a transaction was started.
+ * Force and wait any uncommitted transactions. We can only force the running
+ * transaction if we don't have an active handle, otherwise, we will deadlock.
+ * Returns: <0 in case of error,
+ * 0 if nothing to commit,
+ * 1 if transaction was successfully committed.
*/
-int jbd2_journal_force_commit_nested(journal_t *journal)
+static int __jbd2_journal_force_commit(journal_t *journal)
{
transaction_t *transaction = NULL;
tid_t tid;
- int need_to_start = 0;
+ int need_to_start = 0, ret = 0;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction && !current->journal_info) {
@@ -588,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
transaction = journal->j_committing_transaction;
if (!transaction) {
+ /* Nothing to commit */
read_unlock(&journal->j_state_lock);
- return 0; /* Nothing to retry */
+ return 0;
}
-
tid = transaction->t_tid;
read_unlock(&journal->j_state_lock);
if (need_to_start)
jbd2_log_start_commit(journal, tid);
- jbd2_log_wait_commit(journal, tid);
- return 1;
+ ret = jbd2_log_wait_commit(journal, tid);
+ if (!ret)
+ ret = 1;
+
+ return ret;
+}
+
+/**
+ * Force and wait upon a commit if the calling process is not within
+ * transaction. This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
+ *
+ * @journal: journal to force
+ * Returns true if progress was made.
+ */
+int jbd2_journal_force_commit_nested(journal_t *journal)
+{
+ int ret;
+
+ ret = __jbd2_journal_force_commit(journal);
+ return ret > 0;
+}
+
+/**
+ * int journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * Caller want unconditional commit. We can only force the running transaction
+ * if we don't have an active handle, otherwise, we will deadlock.
+ */
+int jbd2_journal_force_commit(journal_t *journal)
+{
+ int ret;
+
+ J_ASSERT(!current->journal_info);
+ ret = __jbd2_journal_force_commit(journal);
+ if (ret > 0)
+ ret = 0;
+ return ret;
}
/*
@@ -798,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
-struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
{
struct buffer_head *bh;
unsigned long long blocknr;
@@ -817,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
set_buffer_uptodate(bh);
unlock_buffer(bh);
BUFFER_TRACE(bh, "return this buffer");
- return jbd2_journal_add_journal_head(bh);
+ return bh;
}
/*
@@ -1062,11 +1079,10 @@ static journal_t * journal_init_common (void)
return NULL;
init_waitqueue_head(&journal->j_wait_transaction_locked);
- init_waitqueue_head(&journal->j_wait_logspace);
init_waitqueue_head(&journal->j_wait_done_commit);
- init_waitqueue_head(&journal->j_wait_checkpoint);
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
+ init_waitqueue_head(&journal->j_wait_reserved);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
@@ -1076,6 +1092,7 @@ static journal_t * journal_init_common (void)
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
journal->j_min_batch_time = 0;
journal->j_max_batch_time = 15000; /* 15ms */
+ atomic_set(&journal->j_reserved_credits, 0);
/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT;
@@ -1318,6 +1335,7 @@ static int journal_reset(journal_t *journal)
static void jbd2_write_superblock(journal_t *journal, int write_op)
{
struct buffer_head *bh = journal->j_sb_buffer;
+ journal_superblock_t *sb = journal->j_superblock;
int ret;
trace_jbd2_write_superblock(journal, write_op);
@@ -1339,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
clear_buffer_write_io_error(bh);
set_buffer_uptodate(bh);
}
+ jbd2_superblock_csum_set(journal, sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
ret = submit_bh(write_op, bh);
@@ -1435,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
journal->j_errno);
sb->s_errno = cpu_to_be32(journal->j_errno);
- jbd2_superblock_csum_set(journal, sb);
read_unlock(&journal->j_state_lock);
jbd2_write_superblock(journal, WRITE_SYNC);
@@ -2325,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void)
#ifdef CONFIG_JBD2_DEBUG
atomic_inc(&nr_journal_heads);
#endif
- ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
if (!ret) {
jbd_debug(1, "out of memory for journal_head\n");
pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
while (!ret) {
yield();
- ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
}
}
return ret;
@@ -2393,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
struct journal_head *new_jh = NULL;
repeat:
- if (!buffer_jbd(bh)) {
+ if (!buffer_jbd(bh))
new_jh = journal_alloc_journal_head();
- memset(new_jh, 0, sizeof(*new_jh));
- }
jbd_lock_bh_journal_head(bh);
if (buffer_jbd(bh)) {
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 626846bac32f..d4851464b57e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
void *buf, __u32 sequence)
{
- __u32 provided, calculated;
+ __u32 csum32;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return 1;
sequence = cpu_to_be32(sequence);
- calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
- sizeof(sequence));
- calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize);
- provided = be32_to_cpu(tag->t_checksum);
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+ sizeof(sequence));
+ csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
- return provided == cpu_to_be32(calculated);
+ return tag->t_checksum == cpu_to_be16(csum32);
}
static int do_one_pass(journal_t *journal,
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f30b80b4ce8b..198c9c10276d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,9 +122,10 @@ struct jbd2_revoke_table_s
#ifdef __KERNEL__
static void write_one_revoke_record(journal_t *, transaction_t *,
- struct journal_head **, int *,
+ struct list_head *,
+ struct buffer_head **, int *,
struct jbd2_revoke_record_s *, int);
-static void flush_descriptor(journal_t *, struct journal_head *, int, int);
+static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
#endif
/* Utility functions to maintain the revoke table */
@@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
*/
void jbd2_journal_write_revoke_records(journal_t *journal,
transaction_t *transaction,
+ struct list_head *log_bufs,
int write_op)
{
- struct journal_head *descriptor;
+ struct buffer_head *descriptor;
struct jbd2_revoke_record_s *record;
struct jbd2_revoke_table_s *revoke;
struct list_head *hash_list;
@@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
while (!list_empty(hash_list)) {
record = (struct jbd2_revoke_record_s *)
hash_list->next;
- write_one_revoke_record(journal, transaction,
+ write_one_revoke_record(journal, transaction, log_bufs,
&descriptor, &offset,
record, write_op);
count++;
@@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
static void write_one_revoke_record(journal_t *journal,
transaction_t *transaction,
- struct journal_head **descriptorp,
+ struct list_head *log_bufs,
+ struct buffer_head **descriptorp,
int *offsetp,
struct jbd2_revoke_record_s *record,
int write_op)
{
int csum_size = 0;
- struct journal_head *descriptor;
+ struct buffer_head *descriptor;
int offset;
journal_header_t *header;
@@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal,
descriptor = jbd2_journal_get_descriptor_buffer(journal);
if (!descriptor)
return;
- header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+ header = (journal_header_t *)descriptor->b_data;
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
header->h_sequence = cpu_to_be32(transaction->t_tid);
/* Record it so that we can wait for IO completion later */
- JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
- jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+ BUFFER_TRACE(descriptor, "file in log_bufs");
+ jbd2_file_log_bh(log_bufs, descriptor);
offset = sizeof(jbd2_journal_revoke_header_t);
*descriptorp = descriptor;
}
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
- * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
+ * ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
offset += 8;
} else {
- * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
+ * ((__be32 *)(&descriptor->b_data[offset])) =
cpu_to_be32(record->blocknr);
offset += 4;
}
@@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal,
*offsetp = offset;
}
-static void jbd2_revoke_csum_set(journal_t *j,
- struct journal_head *descriptor)
+static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
{
struct jbd2_journal_revoke_tail *tail;
__u32 csum;
@@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j,
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- tail = (struct jbd2_journal_revoke_tail *)
- (jh2bh(descriptor)->b_data + j->j_blocksize -
+ tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_revoke_tail));
tail->r_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
- j->j_blocksize);
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->r_checksum = cpu_to_be32(csum);
}
@@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j,
*/
static void flush_descriptor(journal_t *journal,
- struct journal_head *descriptor,
+ struct buffer_head *descriptor,
int offset, int write_op)
{
jbd2_journal_revoke_header_t *header;
- struct buffer_head *bh = jh2bh(descriptor);
if (is_journal_aborted(journal)) {
- put_bh(bh);
+ put_bh(descriptor);
return;
}
- header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+ header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
header->r_count = cpu_to_be32(offset);
jbd2_revoke_csum_set(journal, descriptor);
- set_buffer_jwrite(bh);
- BUFFER_TRACE(bh, "write");
- set_buffer_dirty(bh);
- write_dirty_buffer(bh, write_op);
+ set_buffer_jwrite(descriptor);
+ BUFFER_TRACE(descriptor, "write");
+ set_buffer_dirty(descriptor);
+ write_dirty_buffer(descriptor, write_op);
}
#endif
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 10f524c59ea8..7aa9a32573bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
- atomic_set(&transaction->t_outstanding_credits, 0);
+ atomic_set(&transaction->t_outstanding_credits,
+ atomic_read(&journal->j_reserved_credits));
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
INIT_LIST_HEAD(&transaction->t_private_list);
@@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction,
}
/*
+ * Wait until running transaction passes T_LOCKED state. Also starts the commit
+ * if needed. The function expects running transaction to exist and releases
+ * j_state_lock.
+ */
+static void wait_transaction_locked(journal_t *journal)
+ __releases(journal->j_state_lock)
+{
+ DEFINE_WAIT(wait);
+ int need_to_start;
+ tid_t tid = journal->j_running_transaction->t_tid;
+
+ prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ TASK_UNINTERRUPTIBLE);
+ need_to_start = !tid_geq(journal->j_commit_request, tid);
+ read_unlock(&journal->j_state_lock);
+ if (need_to_start)
+ jbd2_log_start_commit(journal, tid);
+ schedule();
+ finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
+static void sub_reserved_credits(journal_t *journal, int blocks)
+{
+ atomic_sub(blocks, &journal->j_reserved_credits);
+ wake_up(&journal->j_wait_reserved);
+}
+
+/*
+ * Wait until we can add credits for handle to the running transaction. Called
+ * with j_state_lock held for reading. Returns 0 if handle joined the running
+ * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
+ * caller must retry.
+ */
+static int add_transaction_credits(journal_t *journal, int blocks,
+ int rsv_blocks)
+{
+ transaction_t *t = journal->j_running_transaction;
+ int needed;
+ int total = blocks + rsv_blocks;
+
+ /*
+ * If the current transaction is locked down for commit, wait
+ * for the lock to be released.
+ */
+ if (t->t_state == T_LOCKED) {
+ wait_transaction_locked(journal);
+ return 1;
+ }
+
+ /*
+ * If there is not enough space left in the log to write all
+ * potential buffers requested by this operation, we need to
+ * stall pending a log checkpoint to free some more log space.
+ */
+ needed = atomic_add_return(total, &t->t_outstanding_credits);
+ if (needed > journal->j_max_transaction_buffers) {
+ /*
+ * If the current transaction is already too large,
+ * then start to commit it: we can then go back and
+ * attach this handle to a new transaction.
+ */
+ atomic_sub(total, &t->t_outstanding_credits);
+ wait_transaction_locked(journal);
+ return 1;
+ }
+
+ /*
+ * The commit code assumes that it can get enough log space
+ * without forcing a checkpoint. This is *critical* for
+ * correctness: a checkpoint of a buffer which is also
+ * associated with a committing transaction creates a deadlock,
+ * so commit simply cannot force through checkpoints.
+ *
+ * We must therefore ensure the necessary space in the journal
+ * *before* starting to dirty potentially checkpointed buffers
+ * in the new transaction.
+ */
+ if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+ atomic_sub(total, &t->t_outstanding_credits);
+ read_unlock(&journal->j_state_lock);
+ write_lock(&journal->j_state_lock);
+ if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+ __jbd2_log_wait_for_space(journal);
+ write_unlock(&journal->j_state_lock);
+ return 1;
+ }
+
+ /* No reservation? We are done... */
+ if (!rsv_blocks)
+ return 0;
+
+ needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
+ /* We allow at most half of a transaction to be reserved */
+ if (needed > journal->j_max_transaction_buffers / 2) {
+ sub_reserved_credits(journal, rsv_blocks);
+ atomic_sub(total, &t->t_outstanding_credits);
+ read_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) + rsv_blocks
+ <= journal->j_max_transaction_buffers / 2);
+ return 1;
+ }
+ return 0;
+}
+
+/*
* start_this_handle: Given a handle, deal with any locking or stalling
* needed to make sure that there is enough journal space for the handle
* to begin. Attach the handle to a transaction and set up the
@@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
gfp_t gfp_mask)
{
transaction_t *transaction, *new_transaction = NULL;
- tid_t tid;
- int needed, need_to_start;
- int nblocks = handle->h_buffer_credits;
+ int blocks = handle->h_buffer_credits;
+ int rsv_blocks = 0;
unsigned long ts = jiffies;
- if (nblocks > journal->j_max_transaction_buffers) {
+ /*
+ * 1/2 of transaction can be reserved so we can practically handle
+ * only 1/2 of maximum transaction size per operation
+ */
+ if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
- current->comm, nblocks,
- journal->j_max_transaction_buffers);
+ current->comm, blocks,
+ journal->j_max_transaction_buffers / 2);
return -ENOSPC;
}
+ if (handle->h_rsv_handle)
+ rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+
alloc_transaction:
if (!journal->j_running_transaction) {
new_transaction = kmem_cache_zalloc(transaction_cache,
@@ -199,8 +312,12 @@ repeat:
return -EROFS;
}
- /* Wait on the journal's transaction barrier if necessary */
- if (journal->j_barrier_count) {
+ /*
+ * Wait on the journal's transaction barrier if necessary. Specifically
+ * we allow reserved handles to proceed because otherwise commit could
+ * deadlock on page writeback not being able to complete.
+ */
+ if (!handle->h_reserved && journal->j_barrier_count) {
read_unlock(&journal->j_state_lock);
wait_event(journal->j_wait_transaction_locked,
journal->j_barrier_count == 0);
@@ -213,7 +330,7 @@ repeat:
goto alloc_transaction;
write_lock(&journal->j_state_lock);
if (!journal->j_running_transaction &&
- !journal->j_barrier_count) {
+ (handle->h_reserved || !journal->j_barrier_count)) {
jbd2_get_transaction(journal, new_transaction);
new_transaction = NULL;
}
@@ -223,85 +340,18 @@ repeat:
transaction = journal->j_running_transaction;
- /*
- * If the current transaction is locked down for commit, wait for the
- * lock to be released.
- */
- if (transaction->t_state == T_LOCKED) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&journal->j_wait_transaction_locked,
- &wait, TASK_UNINTERRUPTIBLE);
- read_unlock(&journal->j_state_lock);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * If there is not enough space left in the log to write all potential
- * buffers requested by this operation, we need to stall pending a log
- * checkpoint to free some more log space.
- */
- needed = atomic_add_return(nblocks,
- &transaction->t_outstanding_credits);
-
- if (needed > journal->j_max_transaction_buffers) {
+ if (!handle->h_reserved) {
+ /* We may have dropped j_state_lock - restart in that case */
+ if (add_transaction_credits(journal, blocks, rsv_blocks))
+ goto repeat;
+ } else {
/*
- * If the current transaction is already too large, then start
- * to commit it: we can then go back and attach this handle to
- * a new transaction.
+ * We have handle reserved so we are allowed to join T_LOCKED
+ * transaction and we don't have to check for transaction size
+ * and journal space.
*/
- DEFINE_WAIT(wait);
-
- jbd_debug(2, "Handle %p starting new commit...\n", handle);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
- TASK_UNINTERRUPTIBLE);
- tid = transaction->t_tid;
- need_to_start = !tid_geq(journal->j_commit_request, tid);
- read_unlock(&journal->j_state_lock);
- if (need_to_start)
- jbd2_log_start_commit(journal, tid);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * The commit code assumes that it can get enough log space
- * without forcing a checkpoint. This is *critical* for
- * correctness: a checkpoint of a buffer which is also
- * associated with a committing transaction creates a deadlock,
- * so commit simply cannot force through checkpoints.
- *
- * We must therefore ensure the necessary space in the journal
- * *before* starting to dirty potentially checkpointed buffers
- * in the new transaction.
- *
- * The worst part is, any transaction currently committing can
- * reduce the free space arbitrarily. Be careful to account for
- * those buffers when checkpointing.
- */
-
- /*
- * @@@ AKPM: This seems rather over-defensive. We're giving commit
- * a _lot_ of headroom: 1/4 of the journal plus the size of
- * the committing transaction. Really, we only need to give it
- * committing_transaction->t_outstanding_credits plus "enough" for
- * the log control blocks.
- * Also, this test is inconsistent with the matching one in
- * jbd2_journal_extend().
- */
- if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
- jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- read_unlock(&journal->j_state_lock);
- write_lock(&journal->j_state_lock);
- if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
- __jbd2_log_wait_for_space(journal);
- write_unlock(&journal->j_state_lock);
- goto repeat;
+ sub_reserved_credits(journal, blocks);
+ handle->h_reserved = 0;
}
/* OK, account for the buffers that this operation expects to
@@ -309,15 +359,16 @@ repeat:
*/
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
- handle->h_requested_credits = nblocks;
+ handle->h_requested_credits = blocks;
handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
- jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
- handle, nblocks,
+ jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+ handle, blocks,
atomic_read(&transaction->t_outstanding_credits),
- __jbd2_log_space_left(journal));
+ jbd2_log_space_left(journal));
read_unlock(&journal->j_state_lock);
+ current->journal_info = handle;
lock_map_acquire(&handle->h_lockdep_map);
jbd2_journal_free_transaction(new_transaction);
@@ -348,16 +399,21 @@ static handle_t *new_handle(int nblocks)
*
* We make sure that the transaction can guarantee at least nblocks of
* modified buffers in the log. We block until the log can guarantee
- * that much space.
- *
- * This function is visible to journal users (like ext3fs), so is not
- * called with the journal already locked.
+ * that much space. Additionally, if rsv_blocks > 0, we also create another
+ * handle with rsv_blocks reserved blocks in the journal. This handle is
+ * is stored in h_rsv_handle. It is not attached to any particular transaction
+ * and thus doesn't block transaction commit. If the caller uses this reserved
+ * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
+ * on the parent handle will dispose the reserved one. Reserved handle has to
+ * be converted to a normal handle using jbd2_journal_start_reserved() before
+ * it can be used.
*
* Return a pointer to a newly allocated handle, or an ERR_PTR() value
* on failure.
*/
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
- unsigned int type, unsigned int line_no)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
+ gfp_t gfp_mask, unsigned int type,
+ unsigned int line_no)
{
handle_t *handle = journal_current_handle();
int err;
@@ -374,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
+ if (rsv_blocks) {
+ handle_t *rsv_handle;
- current->journal_info = handle;
+ rsv_handle = new_handle(rsv_blocks);
+ if (!rsv_handle) {
+ jbd2_free_handle(handle);
+ return ERR_PTR(-ENOMEM);
+ }
+ rsv_handle->h_reserved = 1;
+ rsv_handle->h_journal = journal;
+ handle->h_rsv_handle = rsv_handle;
+ }
err = start_this_handle(journal, handle, gfp_mask);
if (err < 0) {
+ if (handle->h_rsv_handle)
+ jbd2_free_handle(handle->h_rsv_handle);
jbd2_free_handle(handle);
- current->journal_info = NULL;
return ERR_PTR(err);
}
handle->h_type = type;
@@ -395,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start);
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
- return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0);
+ return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+ journal_t *journal = handle->h_journal;
+
+ WARN_ON(!handle->h_reserved);
+ sub_reserved_credits(journal, handle->h_buffer_credits);
+ jbd2_free_handle(handle);
+}
+EXPORT_SYMBOL(jbd2_journal_free_reserved);
+
+/**
+ * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
+ * @handle: handle to start
+ *
+ * Start handle that has been previously reserved with jbd2_journal_reserve().
+ * This attaches @handle to the running transaction (or creates one if there's
+ * not transaction running). Unlike jbd2_journal_start() this function cannot
+ * block on journal commit, checkpointing, or similar stuff. It can block on
+ * memory allocation or frozen journal though.
+ *
+ * Return 0 on success, non-zero on error - handle is freed in that case.
+ */
+int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
+ unsigned int line_no)
+{
+ journal_t *journal = handle->h_journal;
+ int ret = -EIO;
+
+ if (WARN_ON(!handle->h_reserved)) {
+ /* Someone passed in normal handle? Just stop it. */
+ jbd2_journal_stop(handle);
+ return ret;
+ }
+ /*
+ * Usefulness of mixing of reserved and unreserved handles is
+ * questionable. So far nobody seems to need it so just error out.
+ */
+ if (WARN_ON(current->journal_info)) {
+ jbd2_journal_free_reserved(handle);
+ return ret;
+ }
+
+ handle->h_journal = NULL;
+ /*
+ * GFP_NOFS is here because callers are likely from writeback or
+ * similarly constrained call sites
+ */
+ ret = start_this_handle(journal, handle, GFP_NOFS);
+ if (ret < 0)
+ jbd2_journal_free_reserved(handle);
+ handle->h_type = type;
+ handle->h_line_no = line_no;
+ return ret;
+}
+EXPORT_SYMBOL(jbd2_journal_start_reserved);
/**
* int jbd2_journal_extend() - extend buffer credits.
@@ -423,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start);
int jbd2_journal_extend(handle_t *handle, int nblocks)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
int result;
int wanted;
- result = -EIO;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- goto out;
+ return -EROFS;
+ journal = transaction->t_journal;
result = 1;
read_lock(&journal->j_state_lock);
/* Don't extend a locked-down transaction! */
- if (handle->h_transaction->t_state != T_RUNNING) {
+ if (transaction->t_state != T_RUNNING) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction not running\n", handle, nblocks);
goto error_out;
}
spin_lock(&transaction->t_handle_lock);
- wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
+ wanted = atomic_add_return(nblocks,
+ &transaction->t_outstanding_credits);
if (wanted > journal->j_max_transaction_buffers) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
+ atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto unlock;
}
- if (wanted > __jbd2_log_space_left(journal)) {
+ if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
+ jbd2_log_space_left(journal)) {
jbd_debug(3, "denied handle %p %d blocks: "
"insufficient log space\n", handle, nblocks);
+ atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto unlock;
}
trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
- handle->h_transaction->t_tid,
+ transaction->t_tid,
handle->h_type, handle->h_line_no,
handle->h_buffer_credits,
nblocks);
handle->h_buffer_credits += nblocks;
handle->h_requested_credits += nblocks;
- atomic_add(nblocks, &transaction->t_outstanding_credits);
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -473,7 +599,6 @@ unlock:
spin_unlock(&transaction->t_handle_lock);
error_out:
read_unlock(&journal->j_state_lock);
-out:
return result;
}
@@ -490,19 +615,22 @@ out:
* to a running handle, a call to jbd2_journal_restart will commit the
* handle's transaction so far and reattach the handle to a new
* transaction capabable of guaranteeing the requested number of
- * credits.
+ * credits. We preserve reserved handle if there's any attached to the
+ * passed in handle.
*/
int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
tid_t tid;
int need_to_start, ret;
+ WARN_ON(!transaction);
/* If we've had an abort of any type, don't even think about
* actually doing the restart! */
if (is_handle_aborted(handle))
return 0;
+ journal = transaction->t_journal;
/*
* First unlink the handle from its current transaction, and start the
@@ -515,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
spin_lock(&transaction->t_handle_lock);
atomic_sub(handle->h_buffer_credits,
&transaction->t_outstanding_credits);
+ if (handle->h_rsv_handle) {
+ sub_reserved_credits(journal,
+ handle->h_rsv_handle->h_buffer_credits);
+ }
if (atomic_dec_and_test(&transaction->t_updates))
wake_up(&journal->j_wait_updates);
+ tid = transaction->t_tid;
spin_unlock(&transaction->t_handle_lock);
+ handle->h_transaction = NULL;
+ current->journal_info = NULL;
jbd_debug(2, "restarting handle %p\n", handle);
- tid = transaction->t_tid;
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
if (need_to_start)
@@ -557,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal)
write_lock(&journal->j_state_lock);
++journal->j_barrier_count;
+ /* Wait until there are no reserved handles */
+ if (atomic_read(&journal->j_reserved_credits)) {
+ write_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) == 0);
+ write_lock(&journal->j_state_lock);
+ }
+
/* Wait until there are no running updates */
while (1) {
transaction_t *transaction = journal->j_running_transaction;
@@ -619,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}
+static int sleep_on_shadow_bh(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
/*
* If the buffer is already part of the current transaction, then there
* is nothing we need to do. If it is already part of a prior
@@ -634,17 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
int force_copy)
{
struct buffer_head *bh;
- transaction_t *transaction;
+ transaction_t *transaction = handle->h_transaction;
journal_t *journal;
int error;
char *frozen_buffer = NULL;
int need_copy = 0;
unsigned long start_lock, time_lock;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
return -EROFS;
-
- transaction = handle->h_transaction;
journal = transaction->t_journal;
jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -754,41 +901,29 @@ repeat:
* journaled. If the primary copy is already going to
* disk then we cannot do copy-out here. */
- if (jh->b_jlist == BJ_Shadow) {
- DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
- wait_queue_head_t *wqh;
-
- wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
+ if (buffer_shadow(bh)) {
JBUFFER_TRACE(jh, "on shadow: sleep");
jbd_unlock_bh_state(bh);
- /* commit wakes up all shadow buffers after IO */
- for ( ; ; ) {
- prepare_to_wait(wqh, &wait.wait,
- TASK_UNINTERRUPTIBLE);
- if (jh->b_jlist != BJ_Shadow)
- break;
- schedule();
- }
- finish_wait(wqh, &wait.wait);
+ wait_on_bit(&bh->b_state, BH_Shadow,
+ sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
goto repeat;
}
- /* Only do the copy if the currently-owning transaction
- * still needs it. If it is on the Forget list, the
- * committing transaction is past that stage. The
- * buffer had better remain locked during the kmalloc,
- * but that should be true --- we hold the journal lock
- * still and the buffer is already on the BUF_JOURNAL
- * list so won't be flushed.
+ /*
+ * Only do the copy if the currently-owning transaction still
+ * needs it. If buffer isn't on BJ_Metadata list, the
+ * committing transaction is past that stage (here we use the
+ * fact that BH_Shadow is set under bh_state lock together with
+ * refiling to BJ_Shadow list and at this point we know the
+ * buffer doesn't have BH_Shadow set).
*
* Subtle point, though: if this is a get_undo_access,
* then we will be relying on the frozen_data to contain
* the new value of the committed_data record after the
* transaction, so we HAVE to force the frozen_data copy
- * in that case. */
-
- if (jh->b_jlist != BJ_Forget || force_copy) {
+ * in that case.
+ */
+ if (jh->b_jlist == BJ_Metadata || force_copy) {
JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
@@ -915,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh = jbd2_journal_add_journal_head(bh);
int err;
jbd_debug(5, "journal_head %p\n", jh);
+ WARN_ON(!transaction);
err = -EROFS;
if (is_handle_aborted(handle))
goto out;
+ journal = transaction->t_journal;
err = 0;
JBUFFER_TRACE(jh, "entry");
@@ -1128,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh;
int ret = 0;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- goto out;
+ return -EROFS;
+ journal = transaction->t_journal;
jh = jbd2_journal_grab_journal_head(bh);
if (!jh) {
ret = -EUCLEAN;
@@ -1227,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "file as BJ_Metadata");
spin_lock(&journal->j_list_lock);
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
jbd_unlock_bh_state(bh);
@@ -1258,12 +1397,17 @@ out:
int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh;
int drop_reserve = 0;
int err = 0;
int was_modified = 0;
+ WARN_ON(!transaction);
+ if (is_handle_aborted(handle))
+ return -EROFS;
+ journal = transaction->t_journal;
+
BUFFER_TRACE(bh, "entry");
jbd_lock_bh_state(bh);
@@ -1290,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
*/
jh->b_modified = 0;
- if (jh->b_transaction == handle->h_transaction) {
+ if (jh->b_transaction == transaction) {
J_ASSERT_JH(jh, !jh->b_frozen_data);
/* If we are forgetting a buffer which is already part
@@ -1385,19 +1529,21 @@ drop:
int jbd2_journal_stop(handle_t *handle)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- int err, wait_for_commit = 0;
+ journal_t *journal;
+ int err = 0, wait_for_commit = 0;
tid_t tid;
pid_t pid;
+ if (!transaction)
+ goto free_and_exit;
+ journal = transaction->t_journal;
+
J_ASSERT(journal_current_handle() == handle);
if (is_handle_aborted(handle))
err = -EIO;
- else {
+ else
J_ASSERT(atomic_read(&transaction->t_updates) > 0);
- err = 0;
- }
if (--handle->h_ref > 0) {
jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1407,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle)
jbd_debug(4, "Handle %p going down\n", handle);
trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
- handle->h_transaction->t_tid,
+ transaction->t_tid,
handle->h_type, handle->h_line_no,
jiffies - handle->h_start_jiffies,
handle->h_sync, handle->h_requested_credits,
@@ -1518,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle)
lock_map_release(&handle->h_lockdep_map);
+ if (handle->h_rsv_handle)
+ jbd2_journal_free_reserved(handle->h_rsv_handle);
+free_and_exit:
jbd2_free_handle(handle);
return err;
}
-/**
- * int jbd2_journal_force_commit() - force any uncommitted transactions
- * @journal: journal to force
- *
- * For synchronous operations: force any uncommitted transactions
- * to disk. May seem kludgy, but it reuses all the handle batching
- * code in a very simple manner.
- */
-int jbd2_journal_force_commit(journal_t *journal)
-{
- handle_t *handle;
- int ret;
-
- handle = jbd2_journal_start(journal, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- } else {
- handle->h_sync = 1;
- ret = jbd2_journal_stop(handle);
- }
- return ret;
-}
-
/*
*
* List management code snippets: various functions for manipulating the
@@ -1601,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
- * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
- * of these pointers, it could go bad. Generally the caller needs to re-read
- * the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
+ * t_reserved_list. If the caller is holding onto a copy of one of these
+ * pointers, it could go bad. Generally the caller needs to re-read the
+ * pointer from the transaction_t.
*
* Called under j_list_lock.
*/
@@ -1634,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Forget:
list = &transaction->t_forget;
break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
@@ -2034,18 +2154,23 @@ zap_buffer_unlocked:
* void jbd2_journal_invalidatepage()
* @journal: journal to use for flush...
* @page: page to flush
- * @offset: length of page to invalidate.
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
*
- * Reap page buffers containing data after offset in page. Can return -EBUSY
- * if buffers are part of the committing transaction and the page is straddling
- * i_size. Caller then has to wait for current commit and try again.
+ * Reap page buffers containing data after in the specified range in page.
+ * Can return -EBUSY if buffers are part of the committing transaction and
+ * the page is straddling i_size. Caller then has to wait for current commit
+ * and try again.
*/
int jbd2_journal_invalidatepage(journal_t *journal,
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
+ unsigned int stop = offset + length;
unsigned int curr_off = 0;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int may_free = 1;
int ret = 0;
@@ -2054,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
if (!page_has_buffers(page))
return 0;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
@@ -2063,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal,
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ return 0;
+
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
- ret = journal_unmap_buffer(journal, bh, offset > 0);
+ ret = journal_unmap_buffer(journal, bh, partial_page);
unlock_buffer(bh);
if (ret < 0)
return ret;
@@ -2077,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
} while (bh != head);
- if (!offset) {
+ if (!partial_page) {
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
@@ -2138,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Forget:
list = &transaction->t_forget;
break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
@@ -2248,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- return -EIO;
+ return -EROFS;
+ journal = transaction->t_journal;
jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
transaction->t_tid);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 6740d34cd82b..9e3aaff11f89 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
return ret;
}
-static void metapage_invalidatepage(struct page *page, unsigned long offset)
+static void metapage_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- BUG_ON(offset);
+ BUG_ON(offset || length < PAGE_CACHE_SIZE);
BUG_ON(PageWriteback(page));
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index c2219a6dd3c8..57914fc32b62 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
return __logfs_writepage(page);
}
-static void logfs_invalidatepage(struct page *page, unsigned long offset)
+static void logfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct logfs_block *block = logfs_block(page);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 038da0991794..d448a777166b 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb)
return area;
}
-static void map_invalidatepage(struct page *page, unsigned long l)
+static void map_invalidatepage(struct page *page, unsigned int o,
+ unsigned int l)
{
return;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a87a44f84113..6b4a79f4ad1d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
* - Called if either PG_private or PG_fscache is set on the page
* - Caller holds page lock
*/
-static void nfs_invalidate_page(struct page *page, unsigned long offset)
+static void nfs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
- dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
+ dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
+ page, offset, length);
- if (offset != 0)
+ if (offset != 0 || length < PAGE_CACHE_SIZE)
return;
/* Cancel any unstarted writes on this page */
nfs_wb_page_cancel(page_file_mapping(page)->host, page);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f97af4..d267ea6aa1a0 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1372,7 +1372,7 @@ retry_writepage:
* The page may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
- block_invalidatepage(page, 0);
+ block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
unlock_page(page);
ntfs_debug("Write outside i_size - truncated?");
return 0;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 20dfec72e903..79736a28d84f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
* from ext3. PageChecked() bits have been removed as OCFS2 does not
* do journalled data.
*/
-static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
+static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
- jbd2_journal_invalidatepage(journal, page, offset);
+ jbd2_journal_invalidatepage(journal, page, offset, length);
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f844533792ee..0048cc16a6a8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2975,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
}
/* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
+static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
struct inode *inode = page->mapping->host;
unsigned int curr_off = 0;
+ unsigned int stop = offset + length;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int ret = 1;
BUG_ON(!PageLocked(page));
- if (offset == 0)
+ if (!partial_page)
ClearPageChecked(page);
if (!page_has_buffers(page))
@@ -2996,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ goto out;
+
/*
* is this block fully invalidated?
*/
@@ -3014,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
* The get_block cached value has been unconditionally invalidated,
* so real IO is not possible anymore.
*/
- if (!offset && ret) {
+ if (!partial_page && ret) {
ret = try_to_release_page(page, 0);
/* maybe should BUG_ON(!ret); - neilb */
}
diff --git a/fs/splice.c b/fs/splice.c
index cc53bd04be8f..3b7ee656f3aa 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1269,6 +1269,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
* @in: file to splice from
* @ppos: input file offset
* @out: file to splice to
+ * @opos: output file offset
* @len: number of bytes to splice
* @flags: splice modifier flags
*
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4cfd742d260d..e068e744dbdd 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -74,7 +74,7 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left,
}
/**
- * sysfs_link_subling - link sysfs_dirent into sibling rbtree
+ * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
* @sd: sysfs_dirent of interest
*
* Link @sd into its sibling rbtree which starts from
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 602f56db0442..d2bb7ed8fa74 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -449,10 +449,12 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
- od = sd->s_attr.open;
- if (od) {
- atomic_inc(&od->event);
- wake_up_interruptible(&od->poll);
+ if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
+ od = sd->s_attr.open;
+ if (od) {
+ atomic_inc(&od->event);
+ wake_up_interruptible(&od->poll);
+ }
}
spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0ce3ccf7f401..3e2837a633ed 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -24,8 +24,6 @@
#include <linux/security.h>
#include "sysfs.h"
-extern struct super_block * sysfs_sb;
-
static const struct address_space_operations sysfs_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 14374530784c..123c79b7261e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1277,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
}
-static void ubifs_invalidatepage(struct page *page, unsigned long offset)
+static void ubifs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
ubifs_assert(PagePrivate(page));
- if (offset)
+ if (offset || length < PAGE_CACHE_SIZE)
/* Partial page remains dirty */
return;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 41a695048be7..596ec71da00e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -843,10 +843,12 @@ xfs_cluster_write(
STATIC void
xfs_vm_invalidatepage(
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
- trace_xfs_invalidatepage(page->mapping->host, page, offset);
- block_invalidatepage(page, offset);
+ trace_xfs_invalidatepage(page->mapping->host, page, offset,
+ length);
+ block_invalidatepage(page, offset, length);
}
/*
@@ -910,7 +912,7 @@ next_buffer:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
- xfs_vm_invalidatepage(page, 0);
+ xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
return;
}
@@ -940,7 +942,7 @@ xfs_vm_writepage(
int count = 0;
int nonblocking = 0;
- trace_xfs_writepage(inode, page, 0);
+ trace_xfs_writepage(inode, page, 0, 0);
ASSERT(page_has_buffers(page));
@@ -1171,7 +1173,7 @@ xfs_vm_releasepage(
{
int delalloc, unwritten;
- trace_xfs_releasepage(page->mapping->host, page, 0);
+ trace_xfs_releasepage(page->mapping->host, page, 0, 0);
xfs_count_page_state(page, &delalloc, &unwritten);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aa4db3307d36..a04701de6bbd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -974,14 +974,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read);
DEFINE_RW_EVENT(xfs_file_splice_write);
DECLARE_EVENT_CLASS(xfs_page_class,
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
- TP_ARGS(inode, page, off),
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+ unsigned int len),
+ TP_ARGS(inode, page, off, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(pgoff_t, pgoff)
__field(loff_t, size)
__field(unsigned long, offset)
+ __field(unsigned int, length)
__field(int, delalloc)
__field(int, unwritten)
),
@@ -995,24 +997,27 @@ DECLARE_EVENT_CLASS(xfs_page_class,
__entry->pgoff = page_offset(page);
__entry->size = i_size_read(inode);
__entry->offset = off;
+ __entry->length = len;
__entry->delalloc = delalloc;
__entry->unwritten = unwritten;
),
TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
- "delalloc %d unwritten %d",
+ "length %x delalloc %d unwritten %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pgoff,
__entry->size,
__entry->offset,
+ __entry->length,
__entry->delalloc,
__entry->unwritten)
)
#define DEFINE_PAGE_EVENT(name) \
DEFINE_EVENT(xfs_page_class, name, \
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
- TP_ARGS(inode, page, off))
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+ unsigned int len), \
+ TP_ARGS(inode, page, off, len))
DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);