aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/dir.c10
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/bio.c22
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/btrfs/backref.c48
-rw-r--r--fs/btrfs/ctree.c1
-rw-r--r--fs/btrfs/dev-replace.c2
-rw-r--r--fs/btrfs/extent-tree.c27
-rw-r--r--fs/btrfs/extent_io.c9
-rw-r--r--fs/btrfs/file.c64
-rw-r--r--fs/btrfs/inode.c54
-rw-r--r--fs/btrfs/scrub.c2
-rw-r--r--fs/btrfs/send.c2
-rw-r--r--fs/btrfs/transaction.c8
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-log.c5
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/cachefiles/interface.c26
-rw-r--r--fs/cachefiles/internal.h1
-rw-r--r--fs/cachefiles/xattr.c36
-rw-r--r--fs/ceph/Kconfig9
-rw-r--r--fs/ceph/Makefile1
-rw-r--r--fs/ceph/addr.c116
-rw-r--r--fs/ceph/cache.c398
-rw-r--r--fs/ceph/cache.h159
-rw-r--r--fs/ceph/caps.c87
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/file.c299
-rw-r--r--fs/ceph/inode.c46
-rw-r--r--fs/ceph/ioctl.c12
-rw-r--r--fs/ceph/mds_client.c34
-rw-r--r--fs/ceph/super.c35
-rw-r--r--fs/ceph/super.h17
-rw-r--r--fs/cifs/cifsencrypt.c14
-rw-r--r--fs/cifs/cifsfs.c11
-rw-r--r--fs/cifs/cifsglob.h4
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/connect.c7
-rw-r--r--fs/cifs/file.c3
-rw-r--r--fs/cifs/link.c84
-rw-r--r--fs/cifs/readdir.c8
-rw-r--r--fs/cifs/sess.c6
-rw-r--r--fs/cifs/smb1ops.c1
-rw-r--r--fs/cifs/smb2transport.c9
-rw-r--r--fs/dcache.c536
-rw-r--r--fs/debugfs/inode.c69
-rw-r--r--fs/direct-io.c127
-rw-r--r--fs/dlm/ast.c5
-rw-r--r--fs/dlm/user.c25
-rw-r--r--fs/efs/inode.c2
-rw-r--r--fs/eventpoll.c31
-rw-r--r--fs/exec.c4
-rw-r--r--fs/ext3/dir.c2
-rw-r--r--fs/ext3/namei.c2
-rw-r--r--fs/ext3/super.c43
-rw-r--r--fs/ext4/balloc.c28
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h70
-rw-r--r--fs/ext4/ext4_extents.h6
-rw-r--r--fs/ext4/ext4_jbd2.c8
-rw-r--r--fs/ext4/ext4_jbd2.h2
-rw-r--r--fs/ext4/extents.c319
-rw-r--r--fs/ext4/extents_status.c196
-rw-r--r--fs/ext4/extents_status.h51
-rw-r--r--fs/ext4/file.c23
-rw-r--r--fs/ext4/ialloc.c100
-rw-r--r--fs/ext4/indirect.c1
-rw-r--r--fs/ext4/inode.c416
-rw-r--r--fs/ext4/ioctl.c10
-rw-r--r--fs/ext4/mballoc.c60
-rw-r--r--fs/ext4/migrate.c4
-rw-r--r--fs/ext4/move_extent.c2
-rw-r--r--fs/ext4/namei.c37
-rw-r--r--fs/ext4/page-io.c65
-rw-r--r--fs/ext4/super.c101
-rw-r--r--fs/f2fs/checkpoint.c24
-rw-r--r--fs/f2fs/data.c28
-rw-r--r--fs/f2fs/debug.c34
-rw-r--r--fs/f2fs/dir.c19
-rw-r--r--fs/f2fs/f2fs.h106
-rw-r--r--fs/f2fs/file.c25
-rw-r--r--fs/f2fs/gc.c58
-rw-r--r--fs/f2fs/gc.h38
-rw-r--r--fs/f2fs/inode.c15
-rw-r--r--fs/f2fs/namei.c33
-rw-r--r--fs/f2fs/node.c100
-rw-r--r--fs/f2fs/node.h44
-rw-r--r--fs/f2fs/recovery.c29
-rw-r--r--fs/f2fs/segment.c41
-rw-r--r--fs/f2fs/segment.h6
-rw-r--r--fs/f2fs/super.c209
-rw-r--r--fs/f2fs/xattr.c289
-rw-r--r--fs/f2fs/xattr.h15
-rw-r--r--fs/fcntl.c4
-rw-r--r--fs/file_table.c6
-rw-r--r--fs/fscache/cookie.c71
-rw-r--r--fs/fscache/internal.h6
-rw-r--r--fs/fscache/page.c71
-rw-r--r--fs/fuse/cuse.c13
-rw-r--r--fs/fuse/dev.c6
-rw-r--r--fs/fuse/dir.c133
-rw-r--r--fs/fuse/file.c11
-rw-r--r--fs/fuse/fuse_i.h2
-rw-r--r--fs/fuse/inode.c3
-rw-r--r--fs/gfs2/aops.c44
-rw-r--r--fs/gfs2/dentry.c9
-rw-r--r--fs/gfs2/dir.c2
-rw-r--r--fs/gfs2/file.c4
-rw-r--r--fs/gfs2/glock.c21
-rw-r--r--fs/gfs2/glops.c18
-rw-r--r--fs/gfs2/inode.c6
-rw-r--r--fs/gfs2/lops.c18
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/meta_io.c18
-rw-r--r--fs/gfs2/meta_io.h26
-rw-r--r--fs/gfs2/ops_fstype.c53
-rw-r--r--fs/hostfs/hostfs_kern.c9
-rw-r--r--fs/hugetlbfs/inode.c18
-rw-r--r--fs/inode.c2
-rw-r--r--fs/internal.h1
-rw-r--r--fs/isofs/inode.c16
-rw-r--r--fs/jbd/commit.c2
-rw-r--r--fs/jbd/journal.c18
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jbd2/journal.c5
-rw-r--r--fs/jbd2/recovery.c24
-rw-r--r--fs/jfs/jfs_dtree.c31
-rw-r--r--fs/lockd/clntlock.c13
-rw-r--r--fs/lockd/clntproc.c5
-rw-r--r--fs/lockd/svclock.c4
-rw-r--r--fs/namei.c289
-rw-r--r--fs/namespace.c136
-rw-r--r--fs/nfs/Makefile9
-rw-r--r--fs/nfs/callback_proc.c11
-rw-r--r--fs/nfs/client.c6
-rw-r--r--fs/nfs/delegation.c3
-rw-r--r--fs/nfs/dir.c90
-rw-r--r--fs/nfs/file.c20
-rw-r--r--fs/nfs/idmap.c195
-rw-r--r--fs/nfs/inode.c43
-rw-r--r--fs/nfs/internal.h31
-rw-r--r--fs/nfs/nfs3proc.c10
-rw-r--r--fs/nfs/nfs4_fs.h98
-rw-r--r--fs/nfs/nfs4client.c240
-rw-r--r--fs/nfs/nfs4filelayout.c39
-rw-r--r--fs/nfs/nfs4getroot.c4
-rw-r--r--fs/nfs/nfs4namespace.c21
-rw-r--r--fs/nfs/nfs4proc.c801
-rw-r--r--fs/nfs/nfs4session.c86
-rw-r--r--fs/nfs/nfs4session.h37
-rw-r--r--fs/nfs/nfs4state.c174
-rw-r--r--fs/nfs/nfs4super.c2
-rw-r--r--fs/nfs/nfs4trace.c17
-rw-r--r--fs/nfs/nfs4trace.h1148
-rw-r--r--fs/nfs/nfs4xdr.c217
-rw-r--r--fs/nfs/nfstrace.c9
-rw-r--r--fs/nfs/nfstrace.h729
-rw-r--r--fs/nfs/pagelist.c22
-rw-r--r--fs/nfs/pnfs.c3
-rw-r--r--fs/nfs/proc.c6
-rw-r--r--fs/nfs/read.c7
-rw-r--r--fs/nfs/super.c103
-rw-r--r--fs/nfs/unlink.c38
-rw-r--r--fs/nfs/write.c57
-rw-r--r--fs/nfsd/nfs4proc.c4
-rw-r--r--fs/nfsd/nfs4state.c2
-rw-r--r--fs/nfsd/nfs4xdr.c19
-rw-r--r--fs/nfsd/nfsd.h1
-rw-r--r--fs/nfsd/nfssvc.c13
-rw-r--r--fs/nfsd/vfs.c5
-rw-r--r--fs/nilfs2/segbuf.c5
-rw-r--r--fs/nilfs2/super.c26
-rw-r--r--fs/ocfs2/aops.c10
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/file.c6
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/refcounttree.c58
-rw-r--r--fs/ocfs2/refcounttree.h6
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/ocfs2/xattr.h2
-rw-r--r--fs/open.c17
-rw-r--r--fs/pnode.h5
-rw-r--r--fs/proc/fd.c2
-rw-r--r--fs/proc/generic.c2
-rw-r--r--fs/proc/inode.c16
-rw-r--r--fs/proc/root.c10
-rw-r--r--fs/proc/task_mmu.c31
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/pstore/Kconfig2
-rw-r--r--fs/pstore/inode.c10
-rw-r--r--fs/pstore/internal.h5
-rw-r--r--fs/pstore/platform.c212
-rw-r--r--fs/pstore/ram.c47
-rw-r--r--fs/quota/dquot.c46
-rw-r--r--fs/reiserfs/bitmap.c22
-rw-r--r--fs/reiserfs/dir.c7
-rw-r--r--fs/reiserfs/fix_node.c26
-rw-r--r--fs/reiserfs/inode.c114
-rw-r--r--fs/reiserfs/ioctl.c7
-rw-r--r--fs/reiserfs/journal.c104
-rw-r--r--fs/reiserfs/lock.c43
-rw-r--r--fs/reiserfs/namei.c24
-rw-r--r--fs/reiserfs/prints.c5
-rw-r--r--fs/reiserfs/procfs.c99
-rw-r--r--fs/reiserfs/reiserfs.h36
-rw-r--r--fs/reiserfs/resize.c10
-rw-r--r--fs/reiserfs/stree.c74
-rw-r--r--fs/reiserfs/super.c78
-rw-r--r--fs/reiserfs/xattr.c46
-rw-r--r--fs/reiserfs/xattr_acl.c16
-rw-r--r--fs/stat.c11
-rw-r--r--fs/super.c43
-rw-r--r--fs/sysfs/bin.c13
-rw-r--r--fs/sysfs/dir.c61
-rw-r--r--fs/sysfs/file.c82
-rw-r--r--fs/sysfs/group.c158
-rw-r--r--fs/sysfs/inode.c21
-rw-r--r--fs/sysfs/mount.c13
-rw-r--r--fs/sysfs/symlink.c18
-rw-r--r--fs/sysfs/sysfs.h18
-rw-r--r--fs/udf/super.c342
-rw-r--r--fs/xfs/xfs_aops.c28
-rw-r--r--fs/xfs/xfs_aops.h3
224 files changed, 9254 insertions, 3395 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 34494fbead0a..0b74d3176ab7 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -685,16 +685,12 @@ not_found:
spin_unlock(&dentry->d_lock);
out_bad:
- if (dentry->d_inode) {
- /* don't unhash if we have submounts */
- if (have_submounts(dentry))
- goto out_skip;
- }
+ /* don't unhash if we have submounts */
+ if (check_submounts_and_drop(dentry) != 0)
+ goto out_skip;
_debug("dropping dentry %s/%s",
parent->d_name.name, dentry->d_name.name);
- shrink_dcache_parent(dentry);
- d_drop(dentry);
dput(parent);
key_put(key);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 5e376bb93419..8defc6b3f9a2 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -40,7 +40,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
int block, off;
inode = iget_locked(sb, ino);
- if (IS_ERR(inode))
+ if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
diff --git a/fs/bio.c b/fs/bio.c
index 94bbc04dba77..b3b20ed9510e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1045,12 +1045,22 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
int bio_uncopy_user(struct bio *bio)
{
struct bio_map_data *bmd = bio->bi_private;
- int ret = 0;
+ struct bio_vec *bvec;
+ int ret = 0, i;
- if (!bio_flagged(bio, BIO_NULL_MAPPED))
- ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
- bmd->nr_sgvecs, bio_data_dir(bio) == READ,
- 0, bmd->is_our_pages);
+ if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
+ /*
+ * if we're in a workqueue, the request is orphaned, so
+ * don't copy into a random user address space, just free.
+ */
+ if (current->mm)
+ ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
+ bmd->nr_sgvecs, bio_data_dir(bio) == READ,
+ 0, bmd->is_our_pages);
+ else if (bmd->is_our_pages)
+ bio_for_each_segment_all(bvec, bio, i)
+ __free_page(bvec->bv_page);
+ }
bio_free_map_data(bmd);
bio_put(bio);
return ret;
@@ -1946,7 +1956,7 @@ int bio_associate_current(struct bio *bio)
/* associate blkcg if exists */
rcu_read_lock();
- css = task_subsys_state(current, blkio_subsys_id);
+ css = task_css(current, blkio_subsys_id);
if (css && css_tryget(css))
bio->bi_css = css;
rcu_read_unlock();
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c7bda5cd3da7..1173a4ee0830 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1519,7 +1519,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index eaf133384a8f..8bc5e8ccb091 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -36,16 +36,23 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
u64 extent_item_pos,
struct extent_inode_elem **eie)
{
- u64 data_offset;
- u64 data_len;
+ u64 offset = 0;
struct extent_inode_elem *e;
- data_offset = btrfs_file_extent_offset(eb, fi);
- data_len = btrfs_file_extent_num_bytes(eb, fi);
+ if (!btrfs_file_extent_compression(eb, fi) &&
+ !btrfs_file_extent_encryption(eb, fi) &&
+ !btrfs_file_extent_other_encoding(eb, fi)) {
+ u64 data_offset;
+ u64 data_len;
- if (extent_item_pos < data_offset ||
- extent_item_pos >= data_offset + data_len)
- return 1;
+ data_offset = btrfs_file_extent_offset(eb, fi);
+ data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+ if (extent_item_pos < data_offset ||
+ extent_item_pos >= data_offset + data_len)
+ return 1;
+ offset = extent_item_pos - data_offset;
+ }
e = kmalloc(sizeof(*e), GFP_NOFS);
if (!e)
@@ -53,7 +60,7 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
e->next = *eie;
e->inum = key->objectid;
- e->offset = key->offset + (extent_item_pos - data_offset);
+ e->offset = key->offset + offset;
*eie = e;
return 0;
@@ -189,7 +196,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
struct extent_buffer *eb;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
- struct extent_inode_elem *eie = NULL;
+ struct extent_inode_elem *eie = NULL, *old = NULL;
u64 disk_byte;
if (level != 0) {
@@ -223,6 +230,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (disk_byte == wanted_disk_byte) {
eie = NULL;
+ old = NULL;
if (extent_item_pos) {
ret = check_extent_in_eb(&key, eb, fi,
*extent_item_pos,
@@ -230,18 +238,20 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (ret < 0)
break;
}
- if (!ret) {
- ret = ulist_add(parents, eb->start,
- (uintptr_t)eie, GFP_NOFS);
- if (ret < 0)
- break;
- if (!extent_item_pos) {
- ret = btrfs_next_old_leaf(root, path,
- time_seq);
- continue;
- }
+ if (ret > 0)
+ goto next;
+ ret = ulist_add_merge(parents, eb->start,
+ (uintptr_t)eie,
+ (u64 *)&old, GFP_NOFS);
+ if (ret < 0)
+ break;
+ if (!ret && extent_item_pos) {
+ while (old->next)
+ old = old->next;
+ old->next = eie;
}
}
+next:
ret = btrfs_next_old_item(root, path, time_seq);
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5bf4c39e2ad6..ed504607d8ec 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1271,7 +1271,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
BUG_ON(!eb_rewin);
}
- extent_buffer_get(eb_rewin);
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 4253ad580e39..5f8f3341c099 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -747,7 +747,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
WARN_ON(atomic_xchg(
&fs_info->mutually_exclusive_operation_running, 1));
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
- return PTR_RET(task);
+ return PTR_ERR_OR_ZERO(task);
}
static int btrfs_dev_replace_kthread(void *data)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0236de711989..1204c8ef6f32 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7466,6 +7466,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int err = 0;
int ret;
int level;
+ bool root_dropped = false;
path = btrfs_alloc_path();
if (!path) {
@@ -7523,6 +7524,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
while (1) {
btrfs_tree_lock(path->nodes[level]);
btrfs_set_lock_blocking(path->nodes[level]);
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, root,
path->nodes[level]->start,
@@ -7538,6 +7540,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
break;
btrfs_tree_unlock(path->nodes[level]);
+ path->locks[level] = 0;
WARN_ON(wc->refs[level] != 1);
level--;
}
@@ -7552,11 +7555,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
- if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
- pr_debug("btrfs: drop snapshot early exit\n");
- err = -EAGAIN;
- goto out_end_trans;
- }
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
@@ -7584,7 +7582,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
BUG_ON(wc->level == 0);
- if (btrfs_should_end_transaction(trans, tree_root)) {
+ if (btrfs_should_end_transaction(trans, tree_root) ||
+ (!for_reloc && btrfs_need_cleaner_sleep(root))) {
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
@@ -7595,6 +7594,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
btrfs_end_transaction_throttle(trans, tree_root);
+ if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+ pr_debug("btrfs: drop snapshot early exit\n");
+ err = -EAGAIN;
+ goto out_free;
+ }
+
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
@@ -7639,12 +7644,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
free_extent_buffer(root->commit_root);
btrfs_put_fs_root(root);
}
+ root_dropped = true;
out_end_trans:
btrfs_end_transaction_throttle(trans, tree_root);
out_free:
kfree(wc);
btrfs_free_path(path);
out:
+ /*
+ * So if we need to stop dropping the snapshot for whatever reason we
+ * need to make sure to add it back to the dead root list so that we
+ * keep trying to do the work later. This also cleans up roots if we
+ * don't have it in the radix (like when we recover after a power fail
+ * or unmount) so we don't leak memory.
+ */
+ if (root_dropped == false)
+ btrfs_add_dead_root(root);
if (err)
btrfs_std_error(root->fs_info, err);
return err;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 583d98bd065e..fe443fece851 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4048,7 +4048,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
while (!end) {
- u64 offset_in_extent;
+ u64 offset_in_extent = 0;
/* break if the extent we found is outside the range */
if (em->start >= max || extent_map_end(em) < off)
@@ -4064,9 +4064,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
/*
* record the offset from the start of the extent
- * for adjusting the disk offset below
+ * for adjusting the disk offset below. Only do this if the
+ * extent isn't compressed since our in ram offset may be past
+ * what we have actually allocated on disk.
*/
- offset_in_extent = em_start - em->start;
+ if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ offset_in_extent = em_start - em->start;
em_end = extent_map_end(em);
em_len = em_end - em_start;
emflags = em->flags;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a005fe2c072a..4d2eb6417145 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -596,20 +596,29 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
if (no_splits)
goto next;
- if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- em->start < start) {
+ if (em->start < start) {
split->start = em->start;
split->len = start - em->start;
- split->orig_start = em->orig_start;
- split->block_start = em->block_start;
- if (compressed)
- split->block_len = em->block_len;
- else
- split->block_len = split->len;
- split->ram_bytes = em->ram_bytes;
- split->orig_block_len = max(split->block_len,
- em->orig_block_len);
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_start = em->orig_start;
+ split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+ split->orig_block_len = max(split->block_len,
+ em->orig_block_len);
+ split->ram_bytes = em->ram_bytes;
+ } else {
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->block_start = em->block_start;
+ split->orig_block_len = 0;
+ split->ram_bytes = split->len;
+ }
+
split->generation = gen;
split->bdev = em->bdev;
split->flags = flags;
@@ -620,8 +629,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split = split2;
split2 = NULL;
}
- if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- testend && em->start + em->len > start + len) {
+ if (testend && em->start + em->len > start + len) {
u64 diff = start + len - em->start;
split->start = start + len;
@@ -630,18 +638,28 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->flags = flags;
split->compress_type = em->compress_type;
split->generation = gen;
- split->orig_block_len = max(em->block_len,
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_block_len = max(em->block_len,
em->orig_block_len);
- split->ram_bytes = em->ram_bytes;
- if (compressed) {
- split->block_len = em->block_len;
- split->block_start = em->block_start;
- split->orig_start = em->orig_start;
+ split->ram_bytes = em->ram_bytes;
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->block_start = em->block_start;
+ split->orig_start = em->orig_start;
+ } else {
+ split->block_len = split->len;
+ split->block_start = em->block_start
+ + diff;
+ split->orig_start = em->orig_start;
+ }
} else {
- split->block_len = split->len;
- split->block_start = em->block_start + diff;
- split->orig_start = em->orig_start;
+ split->ram_bytes = split->len;
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->block_start = em->block_start;
+ split->orig_block_len = 0;
}
ret = add_extent_mapping(em_tree, split, modified);
@@ -1709,7 +1727,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
*/
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
BTRFS_I(inode)->last_sub_trans = root->log_transid;
- if (num_written > 0 || num_written == -EIOCBQUEUED) {
+ if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
if (err < 0 && num_written > 0)
num_written = err;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6d1b93c8aafb..7bdc83d04d54 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2166,16 +2166,23 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
continue;
- extent_offset = btrfs_file_extent_offset(leaf, extent);
- if (key.offset - extent_offset != offset)
+ /*
+ * 'offset' refers to the exact key.offset,
+ * NOT the 'offset' field in btrfs_extent_data_ref, ie.
+ * (key.offset - extent_offset).
+ */
+ if (key.offset != offset)
continue;
+ extent_offset = btrfs_file_extent_offset(leaf, extent);
num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
+
if (extent_offset >= old->extent_offset + old->offset +
old->len || extent_offset + num_bytes <=
old->extent_offset + old->offset)
continue;
+ ret = 0;
break;
}
@@ -2187,7 +2194,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
backref->root_id = root_id;
backref->inum = inum;
- backref->file_pos = offset + extent_offset;
+ backref->file_pos = offset;
backref->num_bytes = num_bytes;
backref->extent_offset = extent_offset;
backref->generation = btrfs_file_extent_generation(leaf, extent);
@@ -2210,7 +2217,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path,
new->path = path;
list_for_each_entry_safe(old, tmp, &new->head, list) {
- ret = iterate_inodes_from_logical(old->bytenr, fs_info,
+ ret = iterate_inodes_from_logical(old->bytenr +
+ old->extent_offset, fs_info,
path, record_one_backref,
old);
BUG_ON(ret < 0 && ret != -ENOENT);
@@ -3166,7 +3174,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
- ret = PTR_RET(inode);
+ ret = PTR_ERR_OR_ZERO(inode);
if (ret && ret != -ESTALE)
goto out;
@@ -4391,9 +4399,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
int mask = attr->ia_valid;
int ret;
- if (newsize == oldsize)
- return 0;
-
/*
* The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
* special case where we need to update the times despite not having
@@ -5165,14 +5170,31 @@ next:
}
/* Reached end of directory/root. Bump pos past the last item. */
- if (key_type == BTRFS_DIR_INDEX_KEY)
- /*
- * 32-bit glibc will use getdents64, but then strtol -
- * so the last number we can serve is this.
- */
- ctx->pos = 0x7fffffff;
- else
- ctx->pos++;
+ ctx->pos++;
+
+ /*
+ * Stop new entries from being returned after we return the last
+ * entry.
+ *
+ * New directory entries are assigned a strictly increasing
+ * offset. This means that new entries created during readdir
+ * are *guaranteed* to be seen in the future by that readdir.
+ * This has broken buggy programs which operate on names as
+ * they're returned by readdir. Until we re-use freed offsets
+ * we have this hack to stop new entries from being returned
+ * under the assumption that they'll never reach this huge
+ * offset.
+ *
+ * This is being careful not to overflow 32bit loff_t unless the
+ * last entry requires it because doing so has broken 32bit apps
+ * in the past.
+ */
+ if (key_type == BTRFS_DIR_INDEX_KEY) {
+ if (ctx->pos >= INT_MAX)
+ ctx->pos = LLONG_MAX;
+ else
+ ctx->pos = INT_MAX;
+ }
nopos:
ret = 0;
err:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4ba2a69a60ad..64a157becbe5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2495,7 +2495,7 @@ again:
ret = scrub_extent(sctx, extent_logical, extent_len,
extent_physical, extent_dev, flags,
generation, extent_mirror_num,
- extent_physical);
+ extent_logical - logical + physical);
if (ret)
goto out;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d3f3b43cae0b..2e14fd89a8b4 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -219,7 +219,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
len = PAGE_ALIGN(len);
if (p->buf == p->inline_buf) {
- tmp_buf = kmalloc(len, GFP_NOFS);
+ tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN);
if (!tmp_buf) {
tmp_buf = vmalloc(len);
if (!tmp_buf)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d58cce77fc6c..af1931a5960d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -983,12 +983,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
* a dirty root struct and adds it into the list of dead roots that need to
* be deleted
*/
-int btrfs_add_dead_root(struct btrfs_root *root)
+void btrfs_add_dead_root(struct btrfs_root *root)
{
spin_lock(&root->fs_info->trans_lock);
- list_add_tail(&root->root_list, &root->fs_info->dead_roots);
+ if (list_empty(&root->root_list))
+ list_add_tail(&root->root_list, &root->fs_info->dead_roots);
spin_unlock(&root->fs_info->trans_lock);
- return 0;
}
/*
@@ -1925,7 +1925,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
}
root = list_first_entry(&fs_info->dead_roots,
struct btrfs_root, root_list);
- list_del(&root->root_list);
+ list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
pr_debug("btrfs: cleaner removing %llu\n",
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 005b0375d18c..defbc4269897 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -143,7 +143,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_add_dead_root(struct btrfs_root *root);
+void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2c6791493637..ff60d8978ae2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3746,8 +3746,9 @@ next_slot:
}
log_extents:
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
if (fast_search) {
- btrfs_release_path(dst_path);
ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
if (ret) {
err = ret;
@@ -3764,8 +3765,6 @@ log_extents:
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
- btrfs_release_path(path);
- btrfs_release_path(dst_path);
ret = log_directory_changes(trans, root, inode, path, dst_path);
if (ret) {
err = ret;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 78b871753cb6..67a085381845 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3302,7 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
}
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
- return PTR_RET(tsk);
+ return PTR_ERR_OR_ZERO(tsk);
}
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index d4c1206af9fc..43eb5592cdea 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -378,6 +378,31 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
}
/*
+ * check if the backing cache is updated to FS-Cache
+ * - called by FS-Cache when evaluates if need to invalidate the cache
+ */
+static bool cachefiles_check_consistency(struct fscache_operation *op)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ const struct cred *saved_cred;
+ int ret;
+
+ _enter("{OBJ%x}", op->object->debug_id);
+
+ object = container_of(op->object, struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = cachefiles_check_auxdata(object);
+ cachefiles_end_secure(cache, saved_cred);
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
* notification the attributes on an object have changed
* - called with reads/writes excluded by FS-Cache
*/
@@ -522,4 +547,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
.write_page = cachefiles_write_page,
.uncache_page = cachefiles_uncache_page,
.dissociate_pages = cachefiles_dissociate_pages,
+ .check_consistency = cachefiles_check_consistency,
};
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 49382519907a..5349473df1b1 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -235,6 +235,7 @@ extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
struct cachefiles_xattr *auxdata);
extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
struct cachefiles_xattr *auxdata);
+extern int cachefiles_check_auxdata(struct cachefiles_object *object);
extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
struct cachefiles_xattr *auxdata);
extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 2476e5162609..34c88b83e39f 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -157,6 +157,42 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
}
/*
+ * check the consistency between the backing cache and the FS-Cache cookie
+ */
+int cachefiles_check_auxdata(struct cachefiles_object *object)
+{
+ struct cachefiles_xattr *auxbuf;
+ struct dentry *dentry = object->dentry;
+ unsigned int dlen;
+ int ret;
+
+ ASSERT(dentry);
+ ASSERT(dentry->d_inode);
+ ASSERT(object->fscache.cookie->def->check_aux);
+
+ auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
+ if (!auxbuf)
+ return -ENOMEM;
+
+ auxbuf->len = vfs_getxattr(dentry, cachefiles_xattr_cache,
+ &auxbuf->type, 512 + 1);
+ if (auxbuf->len < 1)
+ return -ESTALE;
+
+ if (auxbuf->type != object->fscache.cookie->def->type)
+ return -ESTALE;
+
+ dlen = auxbuf->len - 1;
+ ret = fscache_check_aux(&object->fscache, &auxbuf->data, dlen);
+
+ kfree(auxbuf);
+ if (ret != FSCACHE_CHECKAUX_OKAY)
+ return -ESTALE;
+
+ return 0;
+}
+
+/*
* check the state xattr on a cache file
* - return -ESTALE if the object should be deleted
*/
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 49bc78243db9..ac9a2ef5bb9b 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -16,3 +16,12 @@ config CEPH_FS
If unsure, say N.
+if CEPH_FS
+config CEPH_FSCACHE
+ bool "Enable Ceph client caching support"
+ depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
+ help
+ Choose Y here to enable persistent, read-only local
+ caching support for Ceph clients using FS-Cache
+
+endif
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index bd352125e829..32e30106a2f0 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -9,3 +9,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
debugfs.o
+ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5318a3b704f6..6df8bd481425 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -11,6 +11,7 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
#include <linux/ceph/osd_client.h>
/*
@@ -70,15 +71,16 @@ static int ceph_set_page_dirty(struct page *page)
struct address_space *mapping = page->mapping;
struct inode *inode;
struct ceph_inode_info *ci;
- int undo = 0;
struct ceph_snap_context *snapc;
+ int ret;
if (unlikely(!mapping))
return !TestSetPageDirty(page);
- if (TestSetPageDirty(page)) {
+ if (PageDirty(page)) {
dout("%p set_page_dirty %p idx %lu -- already dirty\n",
mapping->host, page, page->index);
+ BUG_ON(!PagePrivate(page));
return 0;
}
@@ -107,35 +109,19 @@ static int ceph_set_page_dirty(struct page *page)
snapc, snapc->seq, snapc->num_snaps);
spin_unlock(&ci->i_ceph_lock);
- /* now adjust page */
- spin_lock_irq(&mapping->tree_lock);
- if (page->mapping) { /* Race with truncate? */
- WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, page->mapping);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
-
- /*
- * Reference snap context in page->private. Also set
- * PagePrivate so that we get invalidatepage callback.
- */
- page->private = (unsigned long)snapc;
- SetPagePrivate(page);
- } else {
- dout("ANON set_page_dirty %p (raced truncate?)\n", page);
- undo = 1;
- }
-
- spin_unlock_irq(&mapping->tree_lock);
-
- if (undo)
- /* whoops, we failed to dirty the page */
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ /*
+ * Reference snap context in page->private. Also set
+ * PagePrivate so that we get invalidatepage callback.
+ */
+ BUG_ON(PagePrivate(page));
+ page->private = (unsigned long)snapc;
+ SetPagePrivate(page);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ ret = __set_page_dirty_nobuffers(page);
+ WARN_ON(!PageLocked(page));
+ WARN_ON(!page->mapping);
- BUG_ON(!PageDirty(page));
- return 1;
+ return ret;
}
/*
@@ -150,11 +136,19 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc = page_snap_context(page);
- BUG_ON(!PageLocked(page));
- BUG_ON(!PagePrivate(page));
- BUG_ON(!page->mapping);
-
inode = page->mapping->host;
+ ci = ceph_inode(inode);
+
+ if (offset != 0 || length != PAGE_CACHE_SIZE) {
+ dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
+ inode, page, page->index, offset, length);
+ return;
+ }
+
+ ceph_invalidate_fscache_page(inode, page);
+
+ if (!PagePrivate(page))
+ return;
/*
* We can get non-dirty pages here due to races between
@@ -164,31 +158,28 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
if (!PageDirty(page))
pr_err("%p invalidatepage %p page not dirty\n", inode, page);
- if (offset == 0 && length == PAGE_CACHE_SIZE)
- ClearPageChecked(page);
+ ClearPageChecked(page);
- ci = ceph_inode(inode);
- if (offset == 0 && length == PAGE_CACHE_SIZE) {
- dout("%p invalidatepage %p idx %lu full dirty page\n",
- inode, page, page->index);
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
- ceph_put_snap_context(snapc);
- page->private = 0;
- ClearPagePrivate(page);
- } else {
- dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
- inode, page, page->index, offset, length);
- }
+ dout("%p invalidatepage %p idx %lu full dirty page\n",
+ inode, page, page->index);
+
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ page->private = 0;
+ ClearPagePrivate(page);
}
-/* just a sanity check */
static int ceph_releasepage(struct page *page, gfp_t g)
{
struct inode *inode = page->mapping ? page->mapping->host : NULL;
dout("%p releasepage %p idx %lu\n", inode, page, page->index);
WARN_ON(PageDirty(page));
- WARN_ON(PagePrivate(page));
- return 0;
+
+ /* Can we release the page from the cache? */
+ if (!ceph_release_fscache_page(page, g))
+ return 0;
+
+ return !PagePrivate(page);
}
/*
@@ -198,11 +189,16 @@ static int readpage_nounlock(struct file *filp, struct page *page)
{
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc =
+ struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
int err = 0;
u64 len = PAGE_CACHE_SIZE;
+ err = ceph_readpage_from_fscache(inode, page);
+
+ if (err == 0)
+ goto out;
+
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
@@ -220,6 +216,9 @@ static int readpage_nounlock(struct file *filp, struct page *page)
}
SetPageUptodate(page);
+ if (err == 0)
+ ceph_readpage_to_fscache(inode, page);
+
out:
return err < 0 ? err : 0;
}
@@ -262,6 +261,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
page->index);
flush_dcache_page(page);
SetPageUptodate(page);
+ ceph_readpage_to_fscache(inode, page);
unlock_page(page);
page_cache_release(page);
bytes -= PAGE_CACHE_SIZE;
@@ -331,11 +331,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
page = list_entry(page_list->prev, struct page, lru);
BUG_ON(PageLocked(page));
list_del(&page->lru);
-
+
dout("start_read %p adding %p idx %lu\n", inode, page,
page->index);
if (add_to_page_cache_lru(page, &inode->i_data, page->index,
GFP_NOFS)) {
+ ceph_fscache_uncache_page(inode, page);
page_cache_release(page);
dout("start_read %p add_to_page_cache failed %p\n",
inode, page);
@@ -378,6 +379,12 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
int rc = 0;
int max = 0;
+ rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
+ &nr_pages);
+
+ if (rc == 0)
+ goto out;
+
if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT;
@@ -392,6 +399,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
BUG_ON(rc == 0);
}
out:
+ ceph_fscache_readpages_cancel(inode, page_list);
+
dout("readpages %p file %p ret %d\n", inode, file, rc);
return rc;
}
@@ -497,6 +506,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
+ ceph_readpage_to_fscache(inode, page);
+
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
&ci->i_layout, snapc,
@@ -552,7 +563,6 @@ static void ceph_release_pages(struct page **pages, int num)
pagevec_release(&pvec);
}
-
/*
* async writeback completion handler.
*
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
new file mode 100644
index 000000000000..6bfe65e0b038
--- /dev/null
+++ b/fs/ceph/cache.c
@@ -0,0 +1,398 @@
+/*
+ * Ceph cache definitions.
+ *
+ * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ * Written by Milosz Tanski (milosz@adfin.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#include "super.h"
+#include "cache.h"
+
+struct ceph_aux_inode {
+ struct timespec mtime;
+ loff_t size;
+};
+
+struct fscache_netfs ceph_cache_netfs = {
+ .name = "ceph",
+ .version = 0,
+};
+
+static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t maxbuf)
+{
+ const struct ceph_fs_client* fsc = cookie_netfs_data;
+ uint16_t klen;
+
+ klen = sizeof(fsc->client->fsid);
+ if (klen > maxbuf)
+ return 0;
+
+ memcpy(buffer, &fsc->client->fsid, klen);
+ return klen;
+}
+
+static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
+ .name = "CEPH.fsid",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = ceph_fscache_session_get_key,
+};
+
+int ceph_fscache_register(void)
+{
+ return fscache_register_netfs(&ceph_cache_netfs);
+}
+
+void ceph_fscache_unregister(void)
+{
+ fscache_unregister_netfs(&ceph_cache_netfs);
+}
+
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
+{
+ fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
+ &ceph_fscache_fsid_object_def,
+ fsc);
+
+ if (fsc->fscache == NULL) {
+ pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
+ return 0;
+ }
+
+ fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1);
+ if (fsc->revalidate_wq == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t maxbuf)
+{
+ const struct ceph_inode_info* ci = cookie_netfs_data;
+ uint16_t klen;
+
+ /* use ceph virtual inode (id + snaphot) */
+ klen = sizeof(ci->i_vino);
+ if (klen > maxbuf)
+ return 0;
+
+ memcpy(buffer, &ci->i_vino, klen);
+ return klen;
+}
+
+static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ struct ceph_aux_inode aux;
+ const struct ceph_inode_info* ci = cookie_netfs_data;
+ const struct inode* inode = &ci->vfs_inode;
+
+ memset(&aux, 0, sizeof(aux));
+ aux.mtime = inode->i_mtime;
+ aux.size = inode->i_size;
+
+ memcpy(buffer, &aux, sizeof(aux));
+
+ return sizeof(aux);
+}
+
+static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
+ uint64_t *size)
+{
+ const struct ceph_inode_info* ci = cookie_netfs_data;
+ const struct inode* inode = &ci->vfs_inode;
+
+ *size = inode->i_size;
+}
+
+static enum fscache_checkaux ceph_fscache_inode_check_aux(
+ void *cookie_netfs_data, const void *data, uint16_t dlen)
+{
+ struct ceph_aux_inode aux;
+ struct ceph_inode_info* ci = cookie_netfs_data;
+ struct inode* inode = &ci->vfs_inode;
+
+ if (dlen != sizeof(aux))
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ memset(&aux, 0, sizeof(aux));
+ aux.mtime = inode->i_mtime;
+ aux.size = inode->i_size;
+
+ if (memcmp(data, &aux, sizeof(aux)) != 0)
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ dout("ceph inode 0x%p cached okay", ci);
+ return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
+{
+ struct ceph_inode_info* ci = cookie_netfs_data;
+ struct pagevec pvec;
+ pgoff_t first;
+ int loop, nr_pages;
+
+ pagevec_init(&pvec, 0);
+ first = 0;
+
+ dout("ceph inode 0x%p now uncached", ci);
+
+ while (1) {
+ nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
+ PAGEVEC_SIZE - pagevec_count(&pvec));
+
+ if (!nr_pages)
+ break;
+
+ for (loop = 0; loop < nr_pages; loop++)
+ ClearPageFsCache(pvec.pages[loop]);
+
+ first = pvec.pages[nr_pages - 1]->index + 1;
+
+ pvec.nr = nr_pages;
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
+ .name = "CEPH.inode",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .get_key = ceph_fscache_inode_get_key,
+ .get_attr = ceph_fscache_inode_get_attr,
+ .get_aux = ceph_fscache_inode_get_aux,
+ .check_aux = ceph_fscache_inode_check_aux,
+ .now_uncached = ceph_fscache_inode_now_uncached,
+};
+
+void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
+ struct ceph_inode_info* ci)
+{
+ struct inode* inode = &ci->vfs_inode;
+
+ /* No caching for filesystem */
+ if (fsc->fscache == NULL)
+ return;
+
+ /* Only cache for regular files that are read only */
+ if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
+ return;
+
+ /* Avoid multiple racing open requests */
+ mutex_lock(&inode->i_mutex);
+
+ if (ci->fscache)
+ goto done;
+
+ ci->fscache = fscache_acquire_cookie(fsc->fscache,
+ &ceph_fscache_inode_object_def,
+ ci);
+done:
+ mutex_unlock(&inode->i_mutex);
+
+}
+
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+ struct fscache_cookie* cookie;
+
+ if ((cookie = ci->fscache) == NULL)
+ return;
+
+ ci->fscache = NULL;
+
+ fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
+ fscache_relinquish_cookie(cookie, 0);
+}
+
+static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
+{
+ if (!error)
+ SetPageUptodate(page);
+}
+
+static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
+{
+ if (!error)
+ SetPageUptodate(page);
+
+ unlock_page(page);
+}
+
+static inline int cache_valid(struct ceph_inode_info *ci)
+{
+ return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
+ (ci->i_fscache_gen == ci->i_rdcache_gen));
+}
+
+
+/* Atempt to read from the fscache,
+ *
+ * This function is called from the readpage_nounlock context. DO NOT attempt to
+ * unlock the page here (or in the callback).
+ */
+int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ if (!cache_valid(ci))
+ return -ENOBUFS;
+
+ ret = fscache_read_or_alloc_page(ci->fscache, page,
+ ceph_vfs_readpage_complete, NULL,
+ GFP_KERNEL);
+
+ switch (ret) {
+ case 0: /* Page found */
+ dout("page read submitted\n");
+ return 0;
+ case -ENOBUFS: /* Pages were not found, and can't be */
+ case -ENODATA: /* Pages were not found */
+ dout("page/inode not in cache\n");
+ return ret;
+ default:
+ dout("%s: unknown error ret = %i\n", __func__, ret);
+ return ret;
+ }
+}
+
+int ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ if (!cache_valid(ci))
+ return -ENOBUFS;
+
+ ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
+ ceph_vfs_readpage_complete_unlock,
+ NULL, mapping_gfp_mask(mapping));
+
+ switch (ret) {
+ case 0: /* All pages found */
+ dout("all-page read submitted\n");
+ return 0;
+ case -ENOBUFS: /* Some pages were not found, and can't be */
+ case -ENODATA: /* some pages were not found */
+ dout("page/inode not in cache\n");
+ return ret;
+ default:
+ dout("%s: unknown error ret = %i\n", __func__, ret);
+ return ret;
+ }
+}
+
+void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ if (!PageFsCache(page))
+ return;
+
+ if (!cache_valid(ci))
+ return;
+
+ ret = fscache_write_page(ci->fscache, page, GFP_KERNEL);
+ if (ret)
+ fscache_uncache_page(ci->fscache, page);
+}
+
+void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ fscache_wait_on_page_write(ci->fscache, page);
+ fscache_uncache_page(ci->fscache, page);
+}
+
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+{
+ if (fsc->revalidate_wq)
+ destroy_workqueue(fsc->revalidate_wq);
+
+ fscache_relinquish_cookie(fsc->fscache, 0);
+ fsc->fscache = NULL;
+}
+
+static void ceph_revalidate_work(struct work_struct *work)
+{
+ int issued;
+ u32 orig_gen;
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_revalidate_work);
+ struct inode *inode = &ci->vfs_inode;
+
+ spin_lock(&ci->i_ceph_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ orig_gen = ci->i_rdcache_gen;
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (!(issued & CEPH_CAP_FILE_CACHE)) {
+ dout("revalidate_work lost cache before validation %p\n",
+ inode);
+ goto out;
+ }
+
+ if (!fscache_check_consistency(ci->fscache))
+ fscache_invalidate(ci->fscache);
+
+ spin_lock(&ci->i_ceph_lock);
+ /* Update the new valid generation (backwards sanity check too) */
+ if (orig_gen > ci->i_fscache_gen) {
+ ci->i_fscache_gen = orig_gen;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+out:
+ iput(&ci->vfs_inode);
+}
+
+void ceph_queue_revalidate(struct inode *inode)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (fsc->revalidate_wq == NULL || ci->fscache == NULL)
+ return;
+
+ ihold(inode);
+
+ if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq,
+ &ci->i_revalidate_work)) {
+ dout("ceph_queue_revalidate %p\n", inode);
+ } else {
+ dout("ceph_queue_revalidate %p failed\n)", inode);
+ iput(inode);
+ }
+}
+
+void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+{
+ ci->fscache = NULL;
+ /* The first load is verifed cookie open time */
+ ci->i_fscache_gen = 1;
+ INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work);
+}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
new file mode 100644
index 000000000000..ba949408a336
--- /dev/null
+++ b/fs/ceph/cache.h
@@ -0,0 +1,159 @@
+/*
+ * Ceph cache definitions.
+ *
+ * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ * Written by Milosz Tanski (milosz@adfin.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#ifndef _CEPH_CACHE_H
+#define _CEPH_CACHE_H
+
+#ifdef CONFIG_CEPH_FSCACHE
+
+extern struct fscache_netfs ceph_cache_netfs;
+
+int ceph_fscache_register(void);
+void ceph_fscache_unregister(void);
+
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc);
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
+
+void ceph_fscache_inode_init(struct ceph_inode_info *ci);
+void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
+ struct ceph_inode_info* ci);
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
+
+int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
+int ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages);
+void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
+void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
+void ceph_queue_revalidate(struct inode *inode);
+
+static inline void ceph_fscache_invalidate(struct inode *inode)
+{
+ fscache_invalidate(ceph_inode(inode)->fscache);
+}
+
+static inline void ceph_fscache_uncache_page(struct inode *inode,
+ struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return fscache_uncache_page(ci->fscache, page);
+}
+
+static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+{
+ struct inode* inode = page->mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return fscache_maybe_release_page(ci->fscache, page, gfp);
+}
+
+static inline void ceph_fscache_readpages_cancel(struct inode *inode,
+ struct list_head *pages)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return fscache_readpages_cancel(ci->fscache, pages);
+}
+
+#else
+
+static inline int ceph_fscache_register(void)
+{
+ return 0;
+}
+
+static inline void ceph_fscache_unregister(void)
+{
+}
+
+static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
+{
+ return 0;
+}
+
+static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+{
+}
+
+static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+{
+}
+
+static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
+ struct ceph_inode_info* ci)
+{
+}
+
+static inline void ceph_fscache_uncache_page(struct inode *inode,
+ struct page *pages)
+{
+}
+
+static inline int ceph_readpage_from_fscache(struct inode* inode,
+ struct page *page)
+{
+ return -ENOBUFS;
+}
+
+static inline int ceph_readpages_from_fscache(struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ return -ENOBUFS;
+}
+
+static inline void ceph_readpage_to_fscache(struct inode *inode,
+ struct page *page)
+{
+}
+
+static inline void ceph_fscache_invalidate(struct inode *inode)
+{
+}
+
+static inline void ceph_invalidate_fscache_page(struct inode *inode,
+ struct page *page)
+{
+}
+
+static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+}
+
+static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
+{
+ return 1;
+}
+
+static inline void ceph_fscache_readpages_cancel(struct inode *inode,
+ struct list_head *pages)
+{
+}
+
+static inline void ceph_queue_revalidate(struct inode *inode)
+{
+}
+
+#endif
+
+#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 25442b40c25a..13976c33332e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -10,6 +10,7 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
#include <linux/ceph/decode.h>
#include <linux/ceph/messenger.h>
@@ -479,8 +480,9 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* i_rdcache_gen.
*/
if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
- (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
+ (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
+ }
/*
* if we are newly issued FILE_SHARED, mark dir not complete; we
@@ -2072,19 +2074,17 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
- if (!(need & CEPH_CAP_FILE_WR))
- mutex_lock(&inode->i_mutex);
__ceph_do_pending_vmtruncate(inode);
- if (!(need & CEPH_CAP_FILE_WR))
- mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock);
}
- if (need & CEPH_CAP_FILE_WR) {
+ have = __ceph_caps_issued(ci, &implemented);
+
+ if (have & need & CEPH_CAP_FILE_WR) {
if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
- if (endoff > ci->i_wanted_max_size) {
+ if (endoff > ci->i_requested_max_size) {
*check_max = 1;
ret = 1;
}
@@ -2099,7 +2099,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
goto out;
}
}
- have = __ceph_caps_issued(ci, &implemented);
if ((have & need) == need) {
/*
@@ -2141,14 +2140,17 @@ static void check_max_size(struct inode *inode, loff_t endoff)
/* do we need to explicitly request a larger max_size? */
spin_lock(&ci->i_ceph_lock);
- if ((endoff >= ci->i_max_size ||
- endoff > (inode->i_size << 1)) &&
- endoff > ci->i_wanted_max_size) {
+ if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
dout("write %p at large endoff %llu, req max_size\n",
inode, endoff);
ci->i_wanted_max_size = endoff;
- check = 1;
}
+ /* duplicate ceph_check_caps()'s logic */
+ if (ci->i_auth_cap &&
+ (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
+ ci->i_wanted_max_size > ci->i_max_size &&
+ ci->i_wanted_max_size > ci->i_requested_max_size)
+ check = 1;
spin_unlock(&ci->i_ceph_lock);
if (check)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
@@ -2334,6 +2336,38 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
}
/*
+ * Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
+ */
+static void invalidate_aliases(struct inode *inode)
+{
+ struct dentry *dn, *prev = NULL;
+
+ dout("invalidate_aliases inode %p\n", inode);
+ d_prune_aliases(inode);
+ /*
+ * For non-directory inode, d_find_alias() only returns
+ * connected dentry. After calling d_invalidate(), the
+ * dentry become disconnected.
+ *
+ * For directory inode, d_find_alias() can return
+ * disconnected dentry. But directory inode should have
+ * one alias at most.
+ */
+ while ((dn = d_find_alias(inode))) {
+ if (dn == prev) {
+ dput(dn);
+ break;
+ }
+ d_invalidate(dn);
+ if (prev)
+ dput(prev);
+ prev = dn;
+ }
+ if (prev)
+ dput(prev);
+}
+
+/*
* Handle a cap GRANT message from the MDS. (Note that a GRANT may
* actually be a revocation if it specifies a smaller cap set.)
*
@@ -2361,8 +2395,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
int check_caps = 0;
int wake = 0;
int writeback = 0;
- int revoked_rdcache = 0;
int queue_invalidate = 0;
+ int deleted_inode = 0;
+ int queue_revalidate = 0;
dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
inode, cap, mds, seq, ceph_cap_string(newcaps));
@@ -2377,9 +2412,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!ci->i_wrbuffer_ref) {
- if (try_nonblocking_invalidate(inode) == 0) {
- revoked_rdcache = 1;
- } else {
+ if (try_nonblocking_invalidate(inode)) {
/* there were locked pages.. invalidate later
in a separate thread. */
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
@@ -2387,6 +2420,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
}
+
+ ceph_fscache_invalidate(inode);
}
/* side effects now are allowed */
@@ -2407,8 +2442,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
from_kgid(&init_user_ns, inode->i_gid));
}
- if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+ if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
set_nlink(inode, le32_to_cpu(grant->nlink));
+ if (inode->i_nlink == 0 &&
+ (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
+ deleted_inode = 1;
+ }
if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
int len = le32_to_cpu(grant->xattr_len);
@@ -2424,6 +2463,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
}
}
+ /* Do we need to revalidate our fscache cookie. Don't bother on the
+ * first cache cap as we already validate at cookie creation time. */
+ if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
+ queue_revalidate = 1;
+
/* size/ctime/mtime/atime? */
ceph_fill_file_size(inode, issued,
le32_to_cpu(grant->truncate_seq),
@@ -2508,6 +2552,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
BUG_ON(cap->issued & ~cap->implemented);
spin_unlock(&ci->i_ceph_lock);
+
if (writeback)
/*
* queue inode for writeback: we can't actually call
@@ -2517,6 +2562,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_queue_writeback(inode);
if (queue_invalidate)
ceph_queue_invalidate(inode);
+ if (deleted_inode)
+ invalidate_aliases(inode);
+ if (queue_revalidate)
+ ceph_queue_revalidate(inode);
if (wake)
wake_up_all(&ci->i_cap_wq);
@@ -2673,8 +2722,10 @@ static void handle_cap_trunc(struct inode *inode,
truncate_seq, truncate_size, size);
spin_unlock(&ci->i_ceph_lock);
- if (queue_trunc)
+ if (queue_trunc) {
ceph_queue_vmtruncate(inode);
+ ceph_fscache_invalidate(inode);
+ }
}
/*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a40ceda47a32..868b61d56cac 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -793,6 +793,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
req->r_locked_dir = dir;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ /* release LINK_SHARED on source inode (mds will lock it) */
+ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
err = ceph_mdsc_do_request(mdsc, dir, req);
if (err) {
d_drop(dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2ddf061c1c4a..3de89829e2a1 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -8,9 +8,11 @@
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/aio.h>
+#include <linux/falloc.h>
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
/*
* Ceph file operations
@@ -68,9 +70,23 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
struct ceph_file_info *cf;
int ret = 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
+ /* First file open request creates the cookie, we want to keep
+ * this cookie around for the filetime of the inode as not to
+ * have to worry about fscache register / revoke / operation
+ * races.
+ *
+ * Also, if we know the operation is going to invalidate data
+ * (non readonly) just nuke the cache right away.
+ */
+ ceph_fscache_register_inode_cookie(mdsc->fsc, ci);
+ if ((fmode & CEPH_FILE_MODE_WR))
+ ceph_fscache_invalidate(inode);
case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
@@ -181,6 +197,7 @@ int ceph_open(struct inode *inode, struct file *file)
spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
+
spin_unlock(&ci->i_ceph_lock);
dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
@@ -191,6 +208,7 @@ int ceph_open(struct inode *inode, struct file *file)
}
req->r_inode = inode;
ihold(inode);
+
req->r_num_caps = 1;
if (flags & (O_CREAT|O_TRUNC))
parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
@@ -313,9 +331,9 @@ static int striped_read(struct inode *inode,
{
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- u64 pos, this_len;
+ u64 pos, this_len, left;
int io_align, page_align;
- int left, pages_left;
+ int pages_left;
int read;
struct page **page_pos;
int ret;
@@ -346,47 +364,40 @@ more:
ret = 0;
hit_stripe = this_len < left;
was_short = ret >= 0 && ret < this_len;
- dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
+ dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
- if (ret > 0) {
- int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
-
- if (read < pos - off) {
- dout(" zero gap %llu to %llu\n", off + read, pos);
- ceph_zero_page_vector_range(page_align + read,
- pos - off - read, pages);
+ if (ret >= 0) {
+ int didpages;
+ if (was_short && (pos + ret < inode->i_size)) {
+ u64 tmp = min(this_len - ret,
+ inode->i_size - pos - ret);
+ dout(" zero gap %llu to %llu\n",
+ pos + ret, pos + ret + tmp);
+ ceph_zero_page_vector_range(page_align + read + ret,
+ tmp, pages);
+ ret += tmp;
}
+
+ didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
pos += ret;
read = pos - off;
left -= ret;
page_pos += didpages;
pages_left -= didpages;
- /* hit stripe? */
- if (left && hit_stripe)
+ /* hit stripe and need continue*/
+ if (left && hit_stripe && pos < inode->i_size)
goto more;
}
- if (was_short) {
+ if (read > 0) {
+ ret = read;
/* did we bounce off eof? */
if (pos + left > inode->i_size)
*checkeof = 1;
-
- /* zero trailing bytes (inside i_size) */
- if (left > 0 && pos < inode->i_size) {
- if (pos + left > inode->i_size)
- left = inode->i_size - pos;
-
- dout("zero tail %d\n", left);
- ceph_zero_page_vector_range(page_align + read, left,
- pages);
- read += left;
- }
}
- if (ret >= 0)
- ret = read;
dout("striped_read returns %d\n", ret);
return ret;
}
@@ -618,6 +629,8 @@ out:
if (check_caps)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
NULL);
+ } else if (ret != -EOLDSNAPC && written > 0) {
+ ret = written;
}
return ret;
}
@@ -659,7 +672,6 @@ again:
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
(fi->flags & CEPH_F_SYNC))
/* hmm, this isn't really async... */
ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
@@ -711,13 +723,11 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
&ceph_sb_to_client(inode->i_sb)->client->osdc;
ssize_t count, written = 0;
int err, want, got;
- bool hold_mutex;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
mutex_lock(&inode->i_mutex);
- hold_mutex = true;
err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
if (err)
@@ -763,18 +773,31 @@ retry_snap:
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
(fi->flags & CEPH_F_SYNC)) {
mutex_unlock(&inode->i_mutex);
written = ceph_sync_write(file, iov->iov_base, count,
pos, &iocb->ki_pos);
+ if (written == -EOLDSNAPC) {
+ dout("aio_write %p %llx.%llx %llu~%u"
+ "got EOLDSNAPC, retrying\n",
+ inode, ceph_vinop(inode),
+ pos, (unsigned)iov->iov_len);
+ mutex_lock(&inode->i_mutex);
+ goto retry_snap;
+ }
} else {
+ /*
+ * No need to acquire the i_truncate_mutex. Because
+ * the MDS revokes Fwb caps before sending truncate
+ * message to us. We can't get Fwb cap while there
+ * are pending vmtruncate. So write and vmtruncate
+ * can not run at the same time
+ */
written = generic_file_buffered_write(iocb, iov, nr_segs,
pos, &iocb->ki_pos,
count, 0);
mutex_unlock(&inode->i_mutex);
}
- hold_mutex = false;
if (written >= 0) {
int dirty;
@@ -798,18 +821,12 @@ retry_snap:
written = err;
}
- if (written == -EOLDSNAPC) {
- dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
- mutex_lock(&inode->i_mutex);
- hold_mutex = true;
- goto retry_snap;
- }
+ goto out_unlocked;
+
out:
- if (hold_mutex)
- mutex_unlock(&inode->i_mutex);
+ mutex_unlock(&inode->i_mutex);
+out_unlocked:
current->backing_dev_info = NULL;
-
return written ? written : err;
}
@@ -822,7 +839,6 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
int ret;
mutex_lock(&inode->i_mutex);
- __ceph_do_pending_vmtruncate(inode);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
@@ -871,6 +887,204 @@ out:
return offset;
}
+static inline void ceph_zero_partial_page(
+ struct inode *inode, loff_t offset, unsigned size)
+{
+ struct page *page;
+ pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+
+ page = find_lock_page(inode->i_mapping, index);
+ if (page) {
+ wait_on_page_writeback(page);
+ zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+}
+
+static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
+ loff_t length)
+{
+ loff_t nearly = round_up(offset, PAGE_CACHE_SIZE);
+ if (offset < nearly) {
+ loff_t size = nearly - offset;
+ if (length < size)
+ size = length;
+ ceph_zero_partial_page(inode, offset, size);
+ offset += size;
+ length -= size;
+ }
+ if (length >= PAGE_CACHE_SIZE) {
+ loff_t size = round_down(length, PAGE_CACHE_SIZE);
+ truncate_pagecache_range(inode, offset, offset + size - 1);
+ offset += size;
+ length -= size;
+ }
+ if (length)
+ ceph_zero_partial_page(inode, offset, length);
+}
+
+static int ceph_zero_partial_object(struct inode *inode,
+ loff_t offset, loff_t *length)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_request *req;
+ int ret = 0;
+ loff_t zero = 0;
+ int op;
+
+ if (!length) {
+ op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
+ length = &zero;
+ } else {
+ op = CEPH_OSD_OP_ZERO;
+ }
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ ceph_vino(inode),
+ offset, length,
+ 1, op,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ NULL, 0, 0, false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ goto out;
+ }
+
+ ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
+ &inode->i_mtime);
+
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!ret) {
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ if (ret == -ENOENT)
+ ret = 0;
+ }
+ ceph_osdc_put_request(req);
+
+out:
+ return ret;
+}
+
+static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
+{
+ int ret = 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
+ s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+ u64 object_set_size = object_size * stripe_count;
+ u64 nearly, t;
+
+ /* round offset up to next period boundary */
+ nearly = offset + object_set_size - 1;
+ t = nearly;
+ nearly -= do_div(t, object_set_size);
+
+ while (length && offset < nearly) {
+ loff_t size = length;
+ ret = ceph_zero_partial_object(inode, offset, &size);
+ if (ret < 0)
+ return ret;
+ offset += size;
+ length -= size;
+ }
+ while (length >= object_set_size) {
+ int i;
+ loff_t pos = offset;
+ for (i = 0; i < stripe_count; ++i) {
+ ret = ceph_zero_partial_object(inode, pos, NULL);
+ if (ret < 0)
+ return ret;
+ pos += stripe_unit;
+ }
+ offset += object_set_size;
+ length -= object_set_size;
+ }
+ while (length) {
+ loff_t size = length;
+ ret = ceph_zero_partial_object(inode, offset, &size);
+ if (ret < 0)
+ return ret;
+ offset += size;
+ length -= size;
+ }
+ return ret;
+}
+
+static long ceph_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t length)
+{
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc =
+ &ceph_inode_to_client(inode)->client->osdc;
+ int want, got = 0;
+ int dirty;
+ int ret = 0;
+ loff_t endoff = 0;
+ loff_t size;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ ret = -EROFS;
+ goto unlock;
+ }
+
+ if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) &&
+ !(mode & FALLOC_FL_PUNCH_HOLE)) {
+ ret = -ENOSPC;
+ goto unlock;
+ }
+
+ size = i_size_read(inode);
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ endoff = offset + length;
+
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
+ if (ret < 0)
+ goto unlock;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (offset < size)
+ ceph_zero_pagecache_range(inode, offset, length);
+ ret = ceph_zero_objects(inode, offset, length);
+ } else if (endoff > size) {
+ truncate_pagecache_range(inode, size, -1);
+ if (ceph_inode_set_size(inode, endoff))
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY, NULL);
+ }
+
+ if (!ret) {
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+
+ ceph_put_cap_refs(ci, got);
+unlock:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
@@ -887,5 +1101,6 @@ const struct file_operations ceph_file_fops = {
.splice_write = generic_file_splice_write,
.unlocked_ioctl = ceph_ioctl,
.compat_ioctl = ceph_ioctl,
+ .fallocate = ceph_fallocate,
};
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f3a2abf28a77..8549a48115f7 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -12,6 +12,7 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
#include <linux/ceph/decode.h>
/*
@@ -344,6 +345,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
ci->i_nr_by_mode[i] = 0;
+ mutex_init(&ci->i_truncate_mutex);
ci->i_truncate_seq = 0;
ci->i_truncate_size = 0;
ci->i_truncate_pending = 0;
@@ -377,6 +379,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
+ ceph_fscache_inode_init(ci);
+
return &ci->vfs_inode;
}
@@ -396,6 +400,8 @@ void ceph_destroy_inode(struct inode *inode)
dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ ceph_fscache_unregister_inode_cookie(ci);
+
ceph_queue_caps_release(inode);
/*
@@ -430,7 +436,6 @@ void ceph_destroy_inode(struct inode *inode)
call_rcu(&inode->i_rcu, ceph_i_callback);
}
-
/*
* Helpers to fill in size, ctime, mtime, and atime. We have to be
* careful because either the client or MDS may have more up to date
@@ -455,16 +460,20 @@ int ceph_fill_file_size(struct inode *inode, int issued,
dout("truncate_seq %u -> %u\n",
ci->i_truncate_seq, truncate_seq);
ci->i_truncate_seq = truncate_seq;
+
+ /* the MDS should have revoked these caps */
+ WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_RD |
+ CEPH_CAP_FILE_WR |
+ CEPH_CAP_FILE_LAZYIO));
/*
* If we hold relevant caps, or in the case where we're
* not the only client referencing this file and we
* don't hold those caps, then we need to check whether
* the file is either opened or mmaped
*/
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
- CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
- CEPH_CAP_FILE_EXCL|
- CEPH_CAP_FILE_LAZYIO)) ||
+ if ((issued & (CEPH_CAP_FILE_CACHE|
+ CEPH_CAP_FILE_BUFFER)) ||
mapping_mapped(inode->i_mapping) ||
__ceph_caps_file_wanted(ci)) {
ci->i_truncate_pending++;
@@ -478,6 +487,10 @@ int ceph_fill_file_size(struct inode *inode, int issued,
truncate_size);
ci->i_truncate_size = truncate_size;
}
+
+ if (queue_trunc)
+ ceph_fscache_invalidate(inode);
+
return queue_trunc;
}
@@ -1066,7 +1079,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
* complete.
*/
ceph_set_dentry_offset(req->r_old_dentry);
- dout("dn %p gets new offset %lld\n", req->r_old_dentry,
+ dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
dn = req->r_old_dentry; /* use old_dentry */
@@ -1419,18 +1432,20 @@ static void ceph_invalidate_work(struct work_struct *work)
u32 orig_gen;
int check = 0;
+ mutex_lock(&ci->i_truncate_mutex);
spin_lock(&ci->i_ceph_lock);
dout("invalidate_pages %p gen %d revoking %d\n", inode,
ci->i_rdcache_gen, ci->i_rdcache_revoking);
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
/* nevermind! */
spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
goto out;
}
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages(inode->i_mapping, 0);
spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
@@ -1445,6 +1460,7 @@ static void ceph_invalidate_work(struct work_struct *work)
ci->i_rdcache_revoking);
}
spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
if (check)
ceph_check_caps(ci, 0, NULL);
@@ -1465,9 +1481,7 @@ static void ceph_vmtruncate_work(struct work_struct *work)
struct inode *inode = &ci->vfs_inode;
dout("vmtruncate_work %p\n", inode);
- mutex_lock(&inode->i_mutex);
__ceph_do_pending_vmtruncate(inode);
- mutex_unlock(&inode->i_mutex);
iput(inode);
}
@@ -1480,6 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode);
ihold(inode);
+
if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
&ci->i_vmtruncate_work)) {
dout("ceph_queue_vmtruncate %p\n", inode);
@@ -1500,11 +1515,13 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
u64 to;
int wrbuffer_refs, finish = 0;
+ mutex_lock(&ci->i_truncate_mutex);
retry:
spin_lock(&ci->i_ceph_lock);
if (ci->i_truncate_pending == 0) {
dout("__do_pending_vmtruncate %p none pending\n", inode);
spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
return;
}
@@ -1521,6 +1538,9 @@ retry:
goto retry;
}
+ /* there should be no reader or writer */
+ WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
+
to = ci->i_truncate_size;
wrbuffer_refs = ci->i_wrbuffer_ref;
dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
@@ -1538,13 +1558,14 @@ retry:
if (!finish)
goto retry;
+ mutex_unlock(&ci->i_truncate_mutex);
+
if (wrbuffer_refs == 0)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
wake_up_all(&ci->i_cap_wq);
}
-
/*
* symlinks
*/
@@ -1586,8 +1607,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- __ceph_do_pending_vmtruncate(inode);
-
err = inode_change_ok(inode, attr);
if (err != 0)
return err;
@@ -1768,7 +1787,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
- __ceph_do_pending_vmtruncate(inode);
+ if (mask & CEPH_SETATTR_SIZE)
+ __ceph_do_pending_vmtruncate(inode);
return err;
out:
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index e0b4ef31d3c8..669622fd1ae3 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -196,8 +196,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
&dl.object_no, &dl.object_offset,
&olen);
- if (r < 0)
+ if (r < 0) {
+ up_read(&osdc->map_sem);
return -EIO;
+ }
dl.file_offset -= dl.object_offset;
dl.object_size = ceph_file_layout_object_size(ci->i_layout);
dl.block_size = ceph_file_layout_su(ci->i_layout);
@@ -209,8 +211,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
- ceph_file_layout_pg_pool(ci->i_layout));
+ r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
+ ceph_file_layout_pg_pool(ci->i_layout));
+ if (r < 0) {
+ up_read(&osdc->map_sem);
+ return r;
+ }
dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
if (dl.osd >= 0) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 187bf214444d..b7bda5d9611d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -414,6 +414,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *s;
+ if (mds >= mdsc->mdsmap->m_max_mds)
+ return ERR_PTR(-EINVAL);
+
s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s)
return ERR_PTR(-ENOMEM);
@@ -1028,6 +1031,37 @@ static void remove_session_caps(struct ceph_mds_session *session)
{
dout("remove_session_caps on %p\n", session);
iterate_session_caps(session, remove_session_caps_cb, NULL);
+
+ spin_lock(&session->s_cap_lock);
+ if (session->s_nr_caps > 0) {
+ struct super_block *sb = session->s_mdsc->fsc->sb;
+ struct inode *inode;
+ struct ceph_cap *cap, *prev = NULL;
+ struct ceph_vino vino;
+ /*
+ * iterate_session_caps() skips inodes that are being
+ * deleted, we need to wait until deletions are complete.
+ * __wait_on_freeing_inode() is designed for the job,
+ * but it is not exported, so use lookup inode function
+ * to access it.
+ */
+ while (!list_empty(&session->s_caps)) {
+ cap = list_entry(session->s_caps.next,
+ struct ceph_cap, session_caps);
+ if (cap == prev)
+ break;
+ prev = cap;
+ vino = cap->ci->i_vino;
+ spin_unlock(&session->s_cap_lock);
+
+ inode = ceph_find_inode(sb, vino);
+ iput(inode);
+
+ spin_lock(&session->s_cap_lock);
+ }
+ }
+ spin_unlock(&session->s_cap_lock);
+
BUG_ON(session->s_nr_caps > 0);
BUG_ON(!list_empty(&session->s_cap_flushing));
cleanup_cap_releases(session);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6627b26a800c..6a0951e43044 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -17,6 +17,7 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/decode.h>
@@ -142,6 +143,8 @@ enum {
Opt_nodcache,
Opt_ino32,
Opt_noino32,
+ Opt_fscache,
+ Opt_nofscache
};
static match_table_t fsopt_tokens = {
@@ -167,6 +170,8 @@ static match_table_t fsopt_tokens = {
{Opt_nodcache, "nodcache"},
{Opt_ino32, "ino32"},
{Opt_noino32, "noino32"},
+ {Opt_fscache, "fsc"},
+ {Opt_nofscache, "nofsc"},
{-1, NULL}
};
@@ -260,6 +265,12 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_noino32:
fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
break;
+ case Opt_fscache:
+ fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
+ break;
+ case Opt_nofscache:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
+ break;
default:
BUG_ON(token);
}
@@ -422,6 +433,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",dcache");
else
seq_puts(m, ",nodcache");
+ if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
+ seq_puts(m, ",fsc");
+ else
+ seq_puts(m, ",nofsc");
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
@@ -530,11 +545,18 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc->wb_pagevec_pool)
goto fail_trunc_wq;
+ /* setup fscache */
+ if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) &&
+ (ceph_fscache_register_fs(fsc) != 0))
+ goto fail_fscache;
+
/* caps */
fsc->min_caps = fsopt->max_readdir;
return fsc;
+fail_fscache:
+ ceph_fscache_unregister_fs(fsc);
fail_trunc_wq:
destroy_workqueue(fsc->trunc_wq);
fail_pg_inv_wq:
@@ -554,6 +576,8 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
{
dout("destroy_fs_client %p\n", fsc);
+ ceph_fscache_unregister_fs(fsc);
+
destroy_workqueue(fsc->wb_wq);
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
@@ -588,6 +612,8 @@ static void ceph_inode_init_once(void *foo)
static int __init init_caches(void)
{
+ int error = -ENOMEM;
+
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
sizeof(struct ceph_inode_info),
__alignof__(struct ceph_inode_info),
@@ -611,15 +637,17 @@ static int __init init_caches(void)
if (ceph_file_cachep == NULL)
goto bad_file;
- return 0;
+ if ((error = ceph_fscache_register()))
+ goto bad_file;
+ return 0;
bad_file:
kmem_cache_destroy(ceph_dentry_cachep);
bad_dentry:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
- return -ENOMEM;
+ return error;
}
static void destroy_caches(void)
@@ -629,10 +657,13 @@ static void destroy_caches(void)
* destroy cache.
*/
rcu_barrier();
+
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
+
+ ceph_fscache_unregister();
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cbded572345e..6014b0a3c405 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -16,6 +16,10 @@
#include <linux/ceph/libceph.h>
+#ifdef CONFIG_CEPH_FSCACHE
+#include <linux/fscache.h>
+#endif
+
/* f_type in struct statfs */
#define CEPH_SUPER_MAGIC 0x00c36400
@@ -29,6 +33,7 @@
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
+#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
@@ -90,6 +95,11 @@ struct ceph_fs_client {
struct dentry *debugfs_bdi;
struct dentry *debugfs_mdsc, *debugfs_mdsmap;
#endif
+
+#ifdef CONFIG_CEPH_FSCACHE
+ struct fscache_cookie *fscache;
+ struct workqueue_struct *revalidate_wq;
+#endif
};
@@ -288,6 +298,7 @@ struct ceph_inode_info {
int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
+ struct mutex i_truncate_mutex;
u32 i_truncate_seq; /* last truncate to smaller size */
u64 i_truncate_size; /* and the size we last truncated down to */
int i_truncate_pending; /* still need to call vmtruncate */
@@ -319,6 +330,12 @@ struct ceph_inode_info {
struct work_struct i_vmtruncate_work;
+#ifdef CONFIG_CEPH_FSCACHE
+ struct fscache_cookie *fscache;
+ u32 i_fscache_gen; /* sequence, for delayed fscache validate */
+ struct work_struct i_revalidate_work;
+#endif
+
struct inode vfs_inode; /* at end */
};
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 45e57cc38200..fc6f4f3a1a9d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -43,17 +43,18 @@ cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
if (IS_ERR(server->secmech.md5)) {
cifs_dbg(VFS, "could not allocate crypto md5\n");
- return PTR_ERR(server->secmech.md5);
+ rc = PTR_ERR(server->secmech.md5);
+ server->secmech.md5 = NULL;
+ return rc;
}
size = sizeof(struct shash_desc) +
crypto_shash_descsize(server->secmech.md5);
server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
if (!server->secmech.sdescmd5) {
- rc = -ENOMEM;
crypto_free_shash(server->secmech.md5);
server->secmech.md5 = NULL;
- return rc;
+ return -ENOMEM;
}
server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
server->secmech.sdescmd5->shash.flags = 0x0;
@@ -421,7 +422,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
if (blobptr + attrsize > blobend)
break;
if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
- if (!attrsize)
+ if (!attrsize || attrsize >= CIFS_MAX_DOMAINNAME_LEN)
break;
if (!ses->domainName) {
ses->domainName =
@@ -591,6 +592,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server)
{
+ int rc;
unsigned int size;
/* check if already allocated */
@@ -600,7 +602,9 @@ static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server)
server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
if (IS_ERR(server->secmech.hmacmd5)) {
cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
- return PTR_ERR(server->secmech.hmacmd5);
+ rc = PTR_ERR(server->secmech.hmacmd5);
+ server->secmech.hmacmd5 = NULL;
+ return rc;
}
size = sizeof(struct shash_desc) +
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 4bdd547dbf6f..85ea98d139fc 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -147,18 +147,17 @@ cifs_read_super(struct super_block *sb)
goto out_no_root;
}
+ if (cifs_sb_master_tcon(cifs_sb)->nocase)
+ sb->s_d_op = &cifs_ci_dentry_ops;
+ else
+ sb->s_d_op = &cifs_dentry_ops;
+
sb->s_root = d_make_root(inode);
if (!sb->s_root) {
rc = -ENOMEM;
goto out_no_root;
}
- /* do that *after* d_make_root() - we want NULL ->d_op for root here */
- if (cifs_sb_master_tcon(cifs_sb)->nocase)
- sb->s_d_op = &cifs_ci_dentry_ops;
- else
- sb->s_d_op = &cifs_dentry_ops;
-
#ifdef CONFIG_CIFS_NFSD_EXPORT
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cifs_dbg(FYI, "export ops supported\n");
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1fdc37041057..52ca861ed35e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -44,6 +44,7 @@
#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1)
#define MAX_SERVER_SIZE 15
#define MAX_SHARE_SIZE 80
+#define CIFS_MAX_DOMAINNAME_LEN 256 /* max domain name length */
#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */
#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
@@ -369,6 +370,9 @@ struct smb_version_operations {
void (*generate_signingkey)(struct TCP_Server_Info *server);
int (*calc_signature)(struct smb_rqst *rqst,
struct TCP_Server_Info *server);
+ int (*query_mf_symlink)(const unsigned char *path, char *pbuf,
+ unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+ unsigned int xid);
};
struct smb_version_values {
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index f7e584d047e2..b29a012bed33 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -497,5 +497,7 @@ void cifs_writev_complete(struct work_struct *work);
struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
work_func_t complete);
void cifs_writedata_release(struct kref *refcount);
-
+int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
+ unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+ unsigned int xid);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index fa68813396b5..d67c550c4980 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1675,7 +1675,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
if (string == NULL)
goto out_nomem;
- if (strnlen(string, 256) == 256) {
+ if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN)
+ == CIFS_MAX_DOMAINNAME_LEN) {
printk(KERN_WARNING "CIFS: domain name too"
" long\n");
goto cifs_parse_mount_err;
@@ -2276,8 +2277,8 @@ cifs_put_smb_ses(struct cifs_ses *ses)
#ifdef CONFIG_KEYS
-/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */
-#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1)
+/* strlen("cifs:a:") + CIFS_MAX_DOMAINNAME_LEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + CIFS_MAX_DOMAINNAME_LEN + 1)
/* Populate username and pw fields from keyring if possible */
static int
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1e57f36ea1b2..9d0dd952ad79 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -647,6 +647,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
oflags, &oplock, &cfile->fid.netfid, xid);
if (rc == 0) {
cifs_dbg(FYI, "posix reopen succeeded\n");
+ oparms.reconnect = true;
goto reopen_success;
}
/*
@@ -2552,7 +2553,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
mutex_unlock(&inode->i_mutex);
}
- if (rc > 0 || rc == -EIOCBQUEUED) {
+ if (rc > 0) {
ssize_t err;
err = generic_write_sync(file, pos, rc);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index b83c3f5646bd..562044f700e5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -305,67 +305,89 @@ CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
}
int
-CIFSCheckMFSymlink(struct cifs_fattr *fattr,
- const unsigned char *path,
- struct cifs_sb_info *cifs_sb, unsigned int xid)
+open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
+ unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+ unsigned int xid)
{
int rc;
int oplock = 0;
__u16 netfid = 0;
struct tcon_link *tlink;
- struct cifs_tcon *pTcon;
+ struct cifs_tcon *ptcon;
struct cifs_io_parms io_parms;
- u8 *buf;
- char *pbuf;
- unsigned int bytes_read = 0;
int buf_type = CIFS_NO_BUFFER;
- unsigned int link_len = 0;
FILE_ALL_INFO file_info;
- if (!CIFSCouldBeMFSymlink(fattr))
- /* it's not a symlink */
- return 0;
-
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
+ ptcon = tlink_tcon(tlink);
- rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+ rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ,
CREATE_NOT_DIR, &netfid, &oplock, &file_info,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc != 0)
- goto out;
+ if (rc != 0) {
+ cifs_put_tlink(tlink);
+ return rc;
+ }
if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
- CIFSSMBClose(xid, pTcon, netfid);
+ CIFSSMBClose(xid, ptcon, netfid);
+ cifs_put_tlink(tlink);
/* it's not a symlink */
- goto out;
+ return rc;
}
- buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
- if (!buf) {
- rc = -ENOMEM;
- goto out;
- }
- pbuf = buf;
io_parms.netfid = netfid;
io_parms.pid = current->tgid;
- io_parms.tcon = pTcon;
+ io_parms.tcon = ptcon;
io_parms.offset = 0;
io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
- rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
- CIFSSMBClose(xid, pTcon, netfid);
- if (rc != 0) {
- kfree(buf);
+ rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
+ CIFSSMBClose(xid, ptcon, netfid);
+ cifs_put_tlink(tlink);
+ return rc;
+}
+
+
+int
+CIFSCheckMFSymlink(struct cifs_fattr *fattr,
+ const unsigned char *path,
+ struct cifs_sb_info *cifs_sb, unsigned int xid)
+{
+ int rc = 0;
+ u8 *buf = NULL;
+ unsigned int link_len = 0;
+ unsigned int bytes_read = 0;
+ struct cifs_tcon *ptcon;
+
+ if (!CIFSCouldBeMFSymlink(fattr))
+ /* it's not a symlink */
+ return 0;
+
+ buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+ if (!buf) {
+ rc = -ENOMEM;
goto out;
}
+ ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb));
+ if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink))
+ rc = ptcon->ses->server->ops->query_mf_symlink(path, buf,
+ &bytes_read, cifs_sb, xid);
+ else
+ goto out;
+
+ if (rc != 0)
+ goto out;
+
+ if (bytes_read == 0) /* not a symlink */
+ goto out;
+
rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
- kfree(buf);
if (rc == -EINVAL) {
/* it's not a symlink */
rc = 0;
@@ -381,7 +403,7 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
fattr->cf_dtype = DT_LNK;
out:
- cifs_put_tlink(tlink);
+ kfree(buf);
return rc;
}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ab8778469394..69d2c826a23b 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -111,6 +111,14 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
return;
}
+ /*
+ * If we know that the inode will need to be revalidated immediately,
+ * then don't create a new dentry for it. We'll end up doing an on
+ * the wire call either way and this spares us an invalidation.
+ */
+ if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
+ return;
+
dentry = d_alloc(parent, name);
if (!dentry)
return;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 79358e341fd2..08dd37bb23aa 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -197,7 +197,7 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
bytes_ret = 0;
} else
bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
- 256, nls_cp);
+ CIFS_MAX_DOMAINNAME_LEN, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* account for null terminator */
@@ -255,8 +255,8 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
/* copy domain */
if (ses->domainName != NULL) {
- strncpy(bcc_ptr, ses->domainName, 256);
- bcc_ptr += strnlen(ses->domainName, 256);
+ strncpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+ bcc_ptr += strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
} /* else we will send a null domain name
so the server will default to its own domain */
*bcc_ptr = 0;
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 6457690731a2..60943978aec3 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -944,6 +944,7 @@ struct smb_version_operations smb1_operations = {
.mand_lock = cifs_mand_lock,
.mand_unlock_range = cifs_unlock_range,
.push_mand_locks = cifs_push_mandatory_locks,
+ .query_mf_symlink = open_query_close_cifs_symlink,
};
struct smb_version_values smb1_values = {
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 301b191270b9..4f2300d020c7 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -42,6 +42,7 @@
static int
smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
{
+ int rc;
unsigned int size;
if (server->secmech.sdeschmacsha256 != NULL)
@@ -50,7 +51,9 @@ smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
if (IS_ERR(server->secmech.hmacsha256)) {
cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
- return PTR_ERR(server->secmech.hmacsha256);
+ rc = PTR_ERR(server->secmech.hmacsha256);
+ server->secmech.hmacsha256 = NULL;
+ return rc;
}
size = sizeof(struct shash_desc) +
@@ -87,7 +90,9 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
server->secmech.sdeschmacsha256 = NULL;
crypto_free_shash(server->secmech.hmacsha256);
server->secmech.hmacsha256 = NULL;
- return PTR_ERR(server->secmech.cmacaes);
+ rc = PTR_ERR(server->secmech.cmacaes);
+ server->secmech.cmacaes = NULL;
+ return rc;
}
size = sizeof(struct shash_desc) +
diff --git a/fs/dcache.c b/fs/dcache.c
index 87bdb5329c3c..ca8e9cd60f87 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -229,7 +229,7 @@ static void __d_free(struct rcu_head *head)
*/
static void d_free(struct dentry *dentry)
{
- BUG_ON(dentry->d_count);
+ BUG_ON((int)dentry->d_lockref.count > 0);
this_cpu_dec(nr_dentry);
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
@@ -308,8 +308,9 @@ static void dentry_unlink_inode(struct dentry * dentry)
*/
static void dentry_lru_add(struct dentry *dentry)
{
- if (list_empty(&dentry->d_lru)) {
+ if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) {
spin_lock(&dcache_lru_lock);
+ dentry->d_flags |= DCACHE_LRU_LIST;
list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
dentry->d_sb->s_nr_dentry_unused++;
dentry_stat.nr_unused++;
@@ -320,7 +321,7 @@ static void dentry_lru_add(struct dentry *dentry)
static void __dentry_lru_del(struct dentry *dentry)
{
list_del_init(&dentry->d_lru);
- dentry->d_flags &= ~DCACHE_SHRINK_LIST;
+ dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
dentry->d_sb->s_nr_dentry_unused--;
dentry_stat.nr_unused--;
}
@@ -341,6 +342,7 @@ static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
{
spin_lock(&dcache_lru_lock);
if (list_empty(&dentry->d_lru)) {
+ dentry->d_flags |= DCACHE_LRU_LIST;
list_add_tail(&dentry->d_lru, list);
dentry->d_sb->s_nr_dentry_unused++;
dentry_stat.nr_unused++;
@@ -443,7 +445,7 @@ EXPORT_SYMBOL(d_drop);
* If ref is non-zero, then decrement the refcount too.
* Returns dentry requiring refcount drop, or NULL if we're done.
*/
-static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
+static inline struct dentry *dentry_kill(struct dentry *dentry)
__releases(dentry->d_lock)
{
struct inode *inode;
@@ -466,13 +468,16 @@ relock:
goto relock;
}
- if (ref)
- dentry->d_count--;
+ /*
+ * The dentry is now unrecoverably dead to the world.
+ */
+ lockref_mark_dead(&dentry->d_lockref);
+
/*
* inform the fs via d_prune that this dentry is about to be
* unhashed and destroyed.
*/
- if (dentry->d_flags & DCACHE_OP_PRUNE)
+ if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
dentry->d_op->d_prune(dentry);
dentry_lru_del(dentry);
@@ -509,38 +514,31 @@ relock:
*/
void dput(struct dentry *dentry)
{
- if (!dentry)
+ if (unlikely(!dentry))
return;
repeat:
- if (dentry->d_count == 1)
- might_sleep();
- spin_lock(&dentry->d_lock);
- BUG_ON(!dentry->d_count);
- if (dentry->d_count > 1) {
- dentry->d_count--;
- spin_unlock(&dentry->d_lock);
+ if (lockref_put_or_lock(&dentry->d_lockref))
return;
- }
- if (dentry->d_flags & DCACHE_OP_DELETE) {
+ /* Unreachable? Get rid of it */
+ if (unlikely(d_unhashed(dentry)))
+ goto kill_it;
+
+ if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
if (dentry->d_op->d_delete(dentry))
goto kill_it;
}
- /* Unreachable? Get rid of it */
- if (d_unhashed(dentry))
- goto kill_it;
-
dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
- dentry->d_count--;
+ dentry->d_lockref.count--;
spin_unlock(&dentry->d_lock);
return;
kill_it:
- dentry = dentry_kill(dentry, 1);
+ dentry = dentry_kill(dentry);
if (dentry)
goto repeat;
}
@@ -590,7 +588,7 @@ int d_invalidate(struct dentry * dentry)
* We also need to leave mountpoints alone,
* directory or not.
*/
- if (dentry->d_count > 1 && dentry->d_inode) {
+ if (dentry->d_lockref.count > 1 && dentry->d_inode) {
if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
spin_unlock(&dentry->d_lock);
return -EBUSY;
@@ -606,20 +604,33 @@ EXPORT_SYMBOL(d_invalidate);
/* This must be called with d_lock held */
static inline void __dget_dlock(struct dentry *dentry)
{
- dentry->d_count++;
+ dentry->d_lockref.count++;
}
static inline void __dget(struct dentry *dentry)
{
- spin_lock(&dentry->d_lock);
- __dget_dlock(dentry);
- spin_unlock(&dentry->d_lock);
+ lockref_get(&dentry->d_lockref);
}
struct dentry *dget_parent(struct dentry *dentry)
{
+ int gotref;
struct dentry *ret;
+ /*
+ * Do optimistic parent lookup without any
+ * locking.
+ */
+ rcu_read_lock();
+ ret = ACCESS_ONCE(dentry->d_parent);
+ gotref = lockref_get_not_zero(&ret->d_lockref);
+ rcu_read_unlock();
+ if (likely(gotref)) {
+ if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
+ return ret;
+ dput(ret);
+ }
+
repeat:
/*
* Don't need rcu_dereference because we re-check it was correct under
@@ -634,8 +645,8 @@ repeat:
goto repeat;
}
rcu_read_unlock();
- BUG_ON(!ret->d_count);
- ret->d_count++;
+ BUG_ON(!ret->d_lockref.count);
+ ret->d_lockref.count++;
spin_unlock(&ret->d_lock);
return ret;
}
@@ -718,7 +729,15 @@ restart:
spin_lock(&inode->i_lock);
hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
spin_lock(&dentry->d_lock);
- if (!dentry->d_count) {
+ if (!dentry->d_lockref.count) {
+ /*
+ * inform the fs via d_prune that this dentry
+ * is about to be unhashed and destroyed.
+ */
+ if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+ !d_unhashed(dentry))
+ dentry->d_op->d_prune(dentry);
+
__dget_dlock(dentry);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
@@ -744,7 +763,7 @@ static void try_prune_one_dentry(struct dentry *dentry)
{
struct dentry *parent;
- parent = dentry_kill(dentry, 0);
+ parent = dentry_kill(dentry);
/*
* If dentry_kill returns NULL, we have nothing more to do.
* if it returns the same dentry, trylocks failed. In either
@@ -763,13 +782,9 @@ static void try_prune_one_dentry(struct dentry *dentry)
/* Prune ancestors. */
dentry = parent;
while (dentry) {
- spin_lock(&dentry->d_lock);
- if (dentry->d_count > 1) {
- dentry->d_count--;
- spin_unlock(&dentry->d_lock);
+ if (lockref_put_or_lock(&dentry->d_lockref))
return;
- }
- dentry = dentry_kill(dentry, 1);
+ dentry = dentry_kill(dentry);
}
}
@@ -793,7 +808,7 @@ static void shrink_dentry_list(struct list_head *list)
* the LRU because of laziness during lookup. Do not free
* it - just keep it off the LRU list.
*/
- if (dentry->d_count) {
+ if (dentry->d_lockref.count) {
dentry_lru_del(dentry);
spin_unlock(&dentry->d_lock);
continue;
@@ -907,13 +922,14 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
* inform the fs that this dentry is about to be
* unhashed and destroyed.
*/
- if (dentry->d_flags & DCACHE_OP_PRUNE)
+ if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+ !d_unhashed(dentry))
dentry->d_op->d_prune(dentry);
dentry_lru_del(dentry);
__d_shrink(dentry);
- if (dentry->d_count != 0) {
+ if (dentry->d_lockref.count != 0) {
printk(KERN_ERR
"BUG: Dentry %p{i=%lx,n=%s}"
" still in use (%d)"
@@ -922,7 +938,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
dentry->d_inode ?
dentry->d_inode->i_ino : 0UL,
dentry->d_name.name,
- dentry->d_count,
+ dentry->d_lockref.count,
dentry->d_sb->s_type->name,
dentry->d_sb->s_id);
BUG();
@@ -933,7 +949,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
list_del(&dentry->d_u.d_child);
} else {
parent = dentry->d_parent;
- parent->d_count--;
+ parent->d_lockref.count--;
list_del(&dentry->d_u.d_child);
}
@@ -981,7 +997,7 @@ void shrink_dcache_for_umount(struct super_block *sb)
dentry = sb->s_root;
sb->s_root = NULL;
- dentry->d_count--;
+ dentry->d_lockref.count--;
shrink_dcache_for_umount_subtree(dentry);
while (!hlist_bl_empty(&sb->s_anon)) {
@@ -1018,34 +1034,56 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
return new;
}
+/**
+ * enum d_walk_ret - action to talke during tree walk
+ * @D_WALK_CONTINUE: contrinue walk
+ * @D_WALK_QUIT: quit walk
+ * @D_WALK_NORETRY: quit when retry is needed
+ * @D_WALK_SKIP: skip this dentry and its children
+ */
+enum d_walk_ret {
+ D_WALK_CONTINUE,
+ D_WALK_QUIT,
+ D_WALK_NORETRY,
+ D_WALK_SKIP,
+};
-/*
- * Search for at least 1 mount point in the dentry's subdirs.
- * We descend to the next level whenever the d_subdirs
- * list is non-empty and continue searching.
- */
-
/**
- * have_submounts - check for mounts over a dentry
- * @parent: dentry to check.
+ * d_walk - walk the dentry tree
+ * @parent: start of walk
+ * @data: data passed to @enter() and @finish()
+ * @enter: callback when first entering the dentry
+ * @finish: callback when successfully finished the walk
*
- * Return true if the parent or its subdirectories contain
- * a mount point
+ * The @enter() and @finish() callbacks are called with d_lock held.
*/
-int have_submounts(struct dentry *parent)
+static void d_walk(struct dentry *parent, void *data,
+ enum d_walk_ret (*enter)(void *, struct dentry *),
+ void (*finish)(void *))
{
struct dentry *this_parent;
struct list_head *next;
unsigned seq;
int locked = 0;
+ enum d_walk_ret ret;
+ bool retry = true;
seq = read_seqbegin(&rename_lock);
again:
this_parent = parent;
-
- if (d_mountpoint(parent))
- goto positive;
spin_lock(&this_parent->d_lock);
+
+ ret = enter(data, this_parent);
+ switch (ret) {
+ case D_WALK_CONTINUE:
+ break;
+ case D_WALK_QUIT:
+ case D_WALK_SKIP:
+ goto out_unlock;
+ case D_WALK_NORETRY:
+ retry = false;
+ break;
+ }
repeat:
next = this_parent->d_subdirs.next;
resume:
@@ -1055,12 +1093,22 @@ resume:
next = tmp->next;
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- /* Have we found a mount point ? */
- if (d_mountpoint(dentry)) {
+
+ ret = enter(data, dentry);
+ switch (ret) {
+ case D_WALK_CONTINUE:
+ break;
+ case D_WALK_QUIT:
spin_unlock(&dentry->d_lock);
- spin_unlock(&this_parent->d_lock);
- goto positive;
+ goto out_unlock;
+ case D_WALK_NORETRY:
+ retry = false;
+ break;
+ case D_WALK_SKIP:
+ spin_unlock(&dentry->d_lock);
+ continue;
}
+
if (!list_empty(&dentry->d_subdirs)) {
spin_unlock(&this_parent->d_lock);
spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
@@ -1081,29 +1129,97 @@ resume:
next = child->d_u.d_child.next;
goto resume;
}
- spin_unlock(&this_parent->d_lock);
- if (!locked && read_seqretry(&rename_lock, seq))
- goto rename_retry;
- if (locked)
- write_sequnlock(&rename_lock);
- return 0; /* No mount points found in tree */
-positive:
- if (!locked && read_seqretry(&rename_lock, seq))
+ if (!locked && read_seqretry(&rename_lock, seq)) {
+ spin_unlock(&this_parent->d_lock);
goto rename_retry;
+ }
+ if (finish)
+ finish(data);
+
+out_unlock:
+ spin_unlock(&this_parent->d_lock);
if (locked)
write_sequnlock(&rename_lock);
- return 1;
+ return;
rename_retry:
+ if (!retry)
+ return;
if (locked)
goto again;
locked = 1;
write_seqlock(&rename_lock);
goto again;
}
+
+/*
+ * Search for at least 1 mount point in the dentry's subdirs.
+ * We descend to the next level whenever the d_subdirs
+ * list is non-empty and continue searching.
+ */
+
+/**
+ * have_submounts - check for mounts over a dentry
+ * @parent: dentry to check.
+ *
+ * Return true if the parent or its subdirectories contain
+ * a mount point
+ */
+
+static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
+{
+ int *ret = data;
+ if (d_mountpoint(dentry)) {
+ *ret = 1;
+ return D_WALK_QUIT;
+ }
+ return D_WALK_CONTINUE;
+}
+
+int have_submounts(struct dentry *parent)
+{
+ int ret = 0;
+
+ d_walk(parent, &ret, check_mount, NULL);
+
+ return ret;
+}
EXPORT_SYMBOL(have_submounts);
/*
+ * Called by mount code to set a mountpoint and check if the mountpoint is
+ * reachable (e.g. NFS can unhash a directory dentry and then the complete
+ * subtree can become unreachable).
+ *
+ * Only one of check_submounts_and_drop() and d_set_mounted() must succeed. For
+ * this reason take rename_lock and d_lock on dentry and ancestors.
+ */
+int d_set_mounted(struct dentry *dentry)
+{
+ struct dentry *p;
+ int ret = -ENOENT;
+ write_seqlock(&rename_lock);
+ for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
+ /* Need exclusion wrt. check_submounts_and_drop() */
+ spin_lock(&p->d_lock);
+ if (unlikely(d_unhashed(p))) {
+ spin_unlock(&p->d_lock);
+ goto out;
+ }
+ spin_unlock(&p->d_lock);
+ }
+ spin_lock(&dentry->d_lock);
+ if (!d_unlinked(dentry)) {
+ dentry->d_flags |= DCACHE_MOUNTED;
+ ret = 0;
+ }
+ spin_unlock(&dentry->d_lock);
+out:
+ write_sequnlock(&rename_lock);
+ return ret;
+}
+
+/*
* Search the dentry child list of the specified parent,
* and move any unused dentries to the end of the unused
* list for prune_dcache(). We descend to the next level
@@ -1117,93 +1233,46 @@ EXPORT_SYMBOL(have_submounts);
* drop the lock and return early due to latency
* constraints.
*/
-static int select_parent(struct dentry *parent, struct list_head *dispose)
-{
- struct dentry *this_parent;
- struct list_head *next;
- unsigned seq;
- int found = 0;
- int locked = 0;
-
- seq = read_seqbegin(&rename_lock);
-again:
- this_parent = parent;
- spin_lock(&this_parent->d_lock);
-repeat:
- next = this_parent->d_subdirs.next;
-resume:
- while (next != &this_parent->d_subdirs) {
- struct list_head *tmp = next;
- struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
- next = tmp->next;
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+struct select_data {
+ struct dentry *start;
+ struct list_head dispose;
+ int found;
+};
- /*
- * move only zero ref count dentries to the dispose list.
- *
- * Those which are presently on the shrink list, being processed
- * by shrink_dentry_list(), shouldn't be moved. Otherwise the
- * loop in shrink_dcache_parent() might not make any progress
- * and loop forever.
- */
- if (dentry->d_count) {
- dentry_lru_del(dentry);
- } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
- dentry_lru_move_list(dentry, dispose);
- dentry->d_flags |= DCACHE_SHRINK_LIST;
- found++;
- }
- /*
- * We can return to the caller if we have found some (this
- * ensures forward progress). We'll be coming back to find
- * the rest.
- */
- if (found && need_resched()) {
- spin_unlock(&dentry->d_lock);
- goto out;
- }
+static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
+{
+ struct select_data *data = _data;
+ enum d_walk_ret ret = D_WALK_CONTINUE;
- /*
- * Descend a level if the d_subdirs list is non-empty.
- */
- if (!list_empty(&dentry->d_subdirs)) {
- spin_unlock(&this_parent->d_lock);
- spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
- this_parent = dentry;
- spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
- goto repeat;
- }
+ if (data->start == dentry)
+ goto out;
- spin_unlock(&dentry->d_lock);
- }
/*
- * All done at this level ... ascend and resume the search.
+ * move only zero ref count dentries to the dispose list.
+ *
+ * Those which are presently on the shrink list, being processed
+ * by shrink_dentry_list(), shouldn't be moved. Otherwise the
+ * loop in shrink_dcache_parent() might not make any progress
+ * and loop forever.
*/
- if (this_parent != parent) {
- struct dentry *child = this_parent;
- this_parent = try_to_ascend(this_parent, locked, seq);
- if (!this_parent)
- goto rename_retry;
- next = child->d_u.d_child.next;
- goto resume;
+ if (dentry->d_lockref.count) {
+ dentry_lru_del(dentry);
+ } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
+ dentry_lru_move_list(dentry, &data->dispose);
+ dentry->d_flags |= DCACHE_SHRINK_LIST;
+ data->found++;
+ ret = D_WALK_NORETRY;
}
+ /*
+ * We can return to the caller if we have found some (this
+ * ensures forward progress). We'll be coming back to find
+ * the rest.
+ */
+ if (data->found && need_resched())
+ ret = D_WALK_QUIT;
out:
- spin_unlock(&this_parent->d_lock);
- if (!locked && read_seqretry(&rename_lock, seq))
- goto rename_retry;
- if (locked)
- write_sequnlock(&rename_lock);
- return found;
-
-rename_retry:
- if (found)
- return found;
- if (locked)
- goto again;
- locked = 1;
- write_seqlock(&rename_lock);
- goto again;
+ return ret;
}
/**
@@ -1212,18 +1281,90 @@ rename_retry:
*
* Prune the dcache to remove unused children of the parent dentry.
*/
-void shrink_dcache_parent(struct dentry * parent)
+void shrink_dcache_parent(struct dentry *parent)
{
- LIST_HEAD(dispose);
- int found;
+ for (;;) {
+ struct select_data data;
+
+ INIT_LIST_HEAD(&data.dispose);
+ data.start = parent;
+ data.found = 0;
- while ((found = select_parent(parent, &dispose)) != 0) {
- shrink_dentry_list(&dispose);
+ d_walk(parent, &data, select_collect, NULL);
+ if (!data.found)
+ break;
+
+ shrink_dentry_list(&data.dispose);
cond_resched();
}
}
EXPORT_SYMBOL(shrink_dcache_parent);
+static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
+{
+ struct select_data *data = _data;
+
+ if (d_mountpoint(dentry)) {
+ data->found = -EBUSY;
+ return D_WALK_QUIT;
+ }
+
+ return select_collect(_data, dentry);
+}
+
+static void check_and_drop(void *_data)
+{
+ struct select_data *data = _data;
+
+ if (d_mountpoint(data->start))
+ data->found = -EBUSY;
+ if (!data->found)
+ __d_drop(data->start);
+}
+
+/**
+ * check_submounts_and_drop - prune dcache, check for submounts and drop
+ *
+ * All done as a single atomic operation relative to has_unlinked_ancestor().
+ * Returns 0 if successfully unhashed @parent. If there were submounts then
+ * return -EBUSY.
+ *
+ * @dentry: dentry to prune and drop
+ */
+int check_submounts_and_drop(struct dentry *dentry)
+{
+ int ret = 0;
+
+ /* Negative dentries can be dropped without further checks */
+ if (!dentry->d_inode) {
+ d_drop(dentry);
+ goto out;
+ }
+
+ for (;;) {
+ struct select_data data;
+
+ INIT_LIST_HEAD(&data.dispose);
+ data.start = dentry;
+ data.found = 0;
+
+ d_walk(dentry, &data, check_and_collect, check_and_drop);
+ ret = data.found;
+
+ if (!list_empty(&data.dispose))
+ shrink_dentry_list(&data.dispose);
+
+ if (ret <= 0)
+ break;
+
+ cond_resched();
+ }
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(check_submounts_and_drop);
+
/**
* __d_alloc - allocate a dcache entry
* @sb: filesystem it will belong to
@@ -1269,7 +1410,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
smp_wmb();
dentry->d_name.name = dname;
- dentry->d_count = 1;
+ dentry->d_lockref.count = 1;
dentry->d_flags = 0;
spin_lock_init(&dentry->d_lock);
seqcount_init(&dentry->d_seq);
@@ -1782,7 +1923,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
* without taking d_lock and checking d_seq sequence count against @seq
* returned here.
*
- * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
+ * A refcount may be taken on the found dentry with the d_rcu_to_refcount
* function.
*
* Alternatively, __d_lookup_rcu may be called again to look up the child of
@@ -1970,7 +2111,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
goto next;
}
- dentry->d_count++;
+ dentry->d_lockref.count++;
found = dentry;
spin_unlock(&dentry->d_lock);
break;
@@ -2069,7 +2210,7 @@ again:
spin_lock(&dentry->d_lock);
inode = dentry->d_inode;
isdir = S_ISDIR(inode->i_mode);
- if (dentry->d_count == 1) {
+ if (dentry->d_lockref.count == 1) {
if (!spin_trylock(&inode->i_lock)) {
spin_unlock(&dentry->d_lock);
cpu_relax();
@@ -2724,6 +2865,17 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
return memcpy(buffer, temp, sz);
}
+char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+ char *end = buffer + buflen;
+ /* these dentries are never renamed, so d_lock is not needed */
+ if (prepend(&end, &buflen, " (deleted)", 11) ||
+ prepend_name(&end, &buflen, &dentry->d_name) ||
+ prepend(&end, &buflen, "/", 1))
+ end = ERR_PTR(-ENAMETOOLONG);
+ return end;
+}
+
/*
* Write full pathname from the root of the filesystem into the buffer.
*/
@@ -2904,68 +3056,24 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
return result;
}
-void d_genocide(struct dentry *root)
+static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
{
- struct dentry *this_parent;
- struct list_head *next;
- unsigned seq;
- int locked = 0;
-
- seq = read_seqbegin(&rename_lock);
-again:
- this_parent = root;
- spin_lock(&this_parent->d_lock);
-repeat:
- next = this_parent->d_subdirs.next;
-resume:
- while (next != &this_parent->d_subdirs) {
- struct list_head *tmp = next;
- struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
- next = tmp->next;
+ struct dentry *root = data;
+ if (dentry != root) {
+ if (d_unhashed(dentry) || !dentry->d_inode)
+ return D_WALK_SKIP;
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (d_unhashed(dentry) || !dentry->d_inode) {
- spin_unlock(&dentry->d_lock);
- continue;
- }
- if (!list_empty(&dentry->d_subdirs)) {
- spin_unlock(&this_parent->d_lock);
- spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
- this_parent = dentry;
- spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
- goto repeat;
- }
if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
dentry->d_flags |= DCACHE_GENOCIDE;
- dentry->d_count--;
- }
- spin_unlock(&dentry->d_lock);
- }
- if (this_parent != root) {
- struct dentry *child = this_parent;
- if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
- this_parent->d_flags |= DCACHE_GENOCIDE;
- this_parent->d_count--;
+ dentry->d_lockref.count--;
}
- this_parent = try_to_ascend(this_parent, locked, seq);
- if (!this_parent)
- goto rename_retry;
- next = child->d_u.d_child.next;
- goto resume;
}
- spin_unlock(&this_parent->d_lock);
- if (!locked && read_seqretry(&rename_lock, seq))
- goto rename_retry;
- if (locked)
- write_sequnlock(&rename_lock);
- return;
+ return D_WALK_CONTINUE;
+}
-rename_retry:
- if (locked)
- goto again;
- locked = 1;
- write_seqlock(&rename_lock);
- goto again;
+void d_genocide(struct dentry *parent)
+{
+ d_walk(parent, parent, d_genocide_kill, NULL);
}
void d_tmpfile(struct dentry *dentry, struct inode *inode)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 4888cb3fdef7..c7c83ff0f752 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -533,8 +533,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
*/
void debugfs_remove_recursive(struct dentry *dentry)
{
- struct dentry *child;
- struct dentry *parent;
+ struct dentry *child, *next, *parent;
if (IS_ERR_OR_NULL(dentry))
return;
@@ -544,61 +543,37 @@ void debugfs_remove_recursive(struct dentry *dentry)
return;
parent = dentry;
+ down:
mutex_lock(&parent->d_inode->i_mutex);
+ list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) {
+ if (!debugfs_positive(child))
+ continue;
- while (1) {
- /*
- * When all dentries under "parent" has been removed,
- * walk up the tree until we reach our starting point.
- */
- if (list_empty(&parent->d_subdirs)) {
- mutex_unlock(&parent->d_inode->i_mutex);
- if (parent == dentry)
- break;
- parent = parent->d_parent;
- mutex_lock(&parent->d_inode->i_mutex);
- }
- child = list_entry(parent->d_subdirs.next, struct dentry,
- d_u.d_child);
- next_sibling:
-
- /*
- * If "child" isn't empty, walk down the tree and
- * remove all its descendants first.
- */
+ /* perhaps simple_empty(child) makes more sense */
if (!list_empty(&child->d_subdirs)) {
mutex_unlock(&parent->d_inode->i_mutex);
parent = child;
- mutex_lock(&parent->d_inode->i_mutex);
- continue;
+ goto down;
}
- __debugfs_remove(child, parent);
- if (parent->d_subdirs.next == &child->d_u.d_child) {
- /*
- * Try the next sibling.
- */
- if (child->d_u.d_child.next != &parent->d_subdirs) {
- child = list_entry(child->d_u.d_child.next,
- struct dentry,
- d_u.d_child);
- goto next_sibling;
- }
-
- /*
- * Avoid infinite loop if we fail to remove
- * one dentry.
- */
- mutex_unlock(&parent->d_inode->i_mutex);
- break;
- }
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ up:
+ if (!__debugfs_remove(child, parent))
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
- parent = dentry->d_parent;
+ mutex_unlock(&parent->d_inode->i_mutex);
+ child = parent;
+ parent = parent->d_parent;
mutex_lock(&parent->d_inode->i_mutex);
- __debugfs_remove(dentry, parent);
+
+ if (child != dentry) {
+ next = list_entry(child->d_u.d_child.next, struct dentry,
+ d_u.d_child);
+ goto up;
+ }
+
+ if (!__debugfs_remove(child, parent))
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
mutex_unlock(&parent->d_inode->i_mutex);
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7ab90f5081ee..0e04142d5962 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -127,6 +127,7 @@ struct dio {
spinlock_t bio_lock; /* protects BIO fields below */
int page_errors; /* errno from get_user_pages() */
int is_async; /* is IO async ? */
+ bool defer_completion; /* defer AIO completion to workqueue? */
int io_error; /* IO error in completion path */
unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
@@ -141,7 +142,10 @@ struct dio {
* allocation time. Don't add new fields after pages[] unless you
* wish that they not be zeroed.
*/
- struct page *pages[DIO_PAGES]; /* page buffer */
+ union {
+ struct page *pages[DIO_PAGES]; /* page buffer */
+ struct work_struct complete_work;/* deferred AIO completion */
+ };
} ____cacheline_aligned_in_smp;
static struct kmem_cache *dio_cache __read_mostly;
@@ -221,16 +225,16 @@ static inline struct page *dio_get_page(struct dio *dio,
* dio_complete() - called when all DIO BIO I/O has been completed
* @offset: the byte offset in the file of the completed operation
*
- * This releases locks as dictated by the locking type, lets interested parties
- * know that a DIO operation has completed, and calculates the resulting return
- * code for the operation.
+ * This drops i_dio_count, lets interested parties know that a DIO operation
+ * has completed, and calculates the resulting return code for the operation.
*
* It lets the filesystem know if it registered an interest earlier via
* get_block. Pass the private field of the map buffer_head so that
* filesystems can use it to hold additional state between get_block calls and
* dio_complete.
*/
-static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
+ bool is_async)
{
ssize_t transferred = 0;
@@ -258,19 +262,36 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
if (ret == 0)
ret = transferred;
- if (dio->end_io && dio->result) {
- dio->end_io(dio->iocb, offset, transferred,
- dio->private, ret, is_async);
- } else {
- inode_dio_done(dio->inode);
- if (is_async)
- aio_complete(dio->iocb, ret, 0);
+ if (dio->end_io && dio->result)
+ dio->end_io(dio->iocb, offset, transferred, dio->private);
+
+ inode_dio_done(dio->inode);
+ if (is_async) {
+ if (dio->rw & WRITE) {
+ int err;
+
+ err = generic_write_sync(dio->iocb->ki_filp, offset,
+ transferred);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+
+ aio_complete(dio->iocb, ret, 0);
}
+ kmem_cache_free(dio_cache, dio);
return ret;
}
+static void dio_aio_complete_work(struct work_struct *work)
+{
+ struct dio *dio = container_of(work, struct dio, complete_work);
+
+ dio_complete(dio, dio->iocb->ki_pos, 0, true);
+}
+
static int dio_bio_complete(struct dio *dio, struct bio *bio);
+
/*
* Asynchronous IO callback.
*/
@@ -290,8 +311,13 @@ static void dio_bio_end_aio(struct bio *bio, int error)
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (remaining == 0) {
- dio_complete(dio, dio->iocb->ki_pos, 0, true);
- kmem_cache_free(dio_cache, dio);
+ if (dio->result && dio->defer_completion) {
+ INIT_WORK(&dio->complete_work, dio_aio_complete_work);
+ queue_work(dio->inode->i_sb->s_dio_done_wq,
+ &dio->complete_work);
+ } else {
+ dio_complete(dio, dio->iocb->ki_pos, 0, true);
+ }
}
}
@@ -511,6 +537,42 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
}
/*
+ * Create workqueue for deferred direct IO completions. We allocate the
+ * workqueue when it's first needed. This avoids creating workqueue for
+ * filesystems that don't need it and also allows us to create the workqueue
+ * late enough so the we can include s_id in the name of the workqueue.
+ */
+static int sb_init_dio_done_wq(struct super_block *sb)
+{
+ struct workqueue_struct *old;
+ struct workqueue_struct *wq = alloc_workqueue("dio/%s",
+ WQ_MEM_RECLAIM, 0,
+ sb->s_id);
+ if (!wq)
+ return -ENOMEM;
+ /*
+ * This has to be atomic as more DIOs can race to create the workqueue
+ */
+ old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
+ /* Someone created workqueue before us? Free ours... */
+ if (old)
+ destroy_workqueue(wq);
+ return 0;
+}
+
+static int dio_set_defer_completion(struct dio *dio)
+{
+ struct super_block *sb = dio->inode->i_sb;
+
+ if (dio->defer_completion)
+ return 0;
+ dio->defer_completion = true;
+ if (!sb->s_dio_done_wq)
+ return sb_init_dio_done_wq(sb);
+ return 0;
+}
+
+/*
* Call into the fs to map some more disk blocks. We record the current number
* of available blocks at sdio->blocks_available. These are in units of the
* fs blocksize, (1 << inode->i_blkbits).
@@ -581,6 +643,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
/* Store for completion */
dio->private = map_bh->b_private;
+
+ if (ret == 0 && buffer_defer_completion(map_bh))
+ ret = dio_set_defer_completion(dio);
}
return ret;
}
@@ -1129,11 +1194,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
}
/*
- * Will be decremented at I/O completion time.
- */
- atomic_inc(&inode->i_dio_count);
-
- /*
* For file extending writes updating i_size before data
* writeouts complete can expose uninitialized blocks. So
* even for AIO, we need to wait for i/o to complete before
@@ -1141,11 +1201,33 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
*/
dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
(end > i_size_read(inode)));
-
- retval = 0;
-
dio->inode = inode;
dio->rw = rw;
+
+ /*
+ * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
+ * so that we can call ->fsync.
+ */
+ if (dio->is_async && (rw & WRITE) &&
+ ((iocb->ki_filp->f_flags & O_DSYNC) ||
+ IS_SYNC(iocb->ki_filp->f_mapping->host))) {
+ retval = dio_set_defer_completion(dio);
+ if (retval) {
+ /*
+ * We grab i_mutex only for reads so we don't have
+ * to release it here
+ */
+ kmem_cache_free(dio_cache, dio);
+ goto out;
+ }
+ }
+
+ /*
+ * Will be decremented at I/O completion time.
+ */
+ atomic_inc(&inode->i_dio_count);
+
+ retval = 0;
sdio.blkbits = blkbits;
sdio.blkfactor = i_blkbits - blkbits;
sdio.block_in_file = offset >> blkbits;
@@ -1269,7 +1351,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (drop_refcount(dio) == 0) {
retval = dio_complete(dio, offset, retval, false);
- kmem_cache_free(dio_cache, dio);
} else
BUG_ON(retval != -EIOCBQUEUED);
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 27a6ba9aaeec..0e90f0c91b93 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -267,10 +267,7 @@ void dlm_callback_work(struct work_struct *work)
int dlm_callback_start(struct dlm_ls *ls)
{
ls->ls_callback_wq = alloc_workqueue("dlm_callback",
- WQ_UNBOUND |
- WQ_MEM_RECLAIM |
- WQ_NON_REENTRANT,
- 0);
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
if (!ls->ls_callback_wq) {
log_print("can't start dlm_callback workqueue");
return -ENOMEM;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 911649a47dd5..142e21655eed 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -493,7 +493,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
{
struct dlm_user_proc *proc = file->private_data;
struct dlm_write_request *kbuf;
- sigset_t tmpsig, allsigs;
int error;
#ifdef CONFIG_COMPAT
@@ -557,9 +556,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
goto out_free;
}
- sigfillset(&allsigs);
- sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
-
error = -EINVAL;
switch (kbuf->cmd)
@@ -567,7 +563,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
case DLM_USER_LOCK:
if (!proc) {
log_print("no locking on control device");
- goto out_sig;
+ goto out_free;
}
error = device_user_lock(proc, &kbuf->i.lock);
break;
@@ -575,7 +571,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
case DLM_USER_UNLOCK:
if (!proc) {
log_print("no locking on control device");
- goto out_sig;
+ goto out_free;
}
error = device_user_unlock(proc, &kbuf->i.lock);
break;
@@ -583,7 +579,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
case DLM_USER_DEADLOCK:
if (!proc) {
log_print("no locking on control device");
- goto out_sig;
+ goto out_free;
}
error = device_user_deadlock(proc, &kbuf->i.lock);
break;
@@ -591,7 +587,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
case DLM_USER_CREATE_LOCKSPACE:
if (proc) {
log_print("create/remove only on control device");
- goto out_sig;
+ goto out_free;
}
error = device_create_lockspace(&kbuf->i.lspace);
break;
@@ -599,7 +595,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
case DLM_USER_REMOVE_LOCKSPACE:
if (proc) {
log_print("create/remove only on control device");
- goto out_sig;
+ goto out_free;
}
error = device_remove_lockspace(&kbuf->i.lspace);
break;
@@ -607,7 +603,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
case DLM_USER_PURGE:
if (!proc) {
log_print("no locking on control device");
- goto out_sig;
+ goto out_free;
}
error = device_user_purge(proc, &kbuf->i.purge);
break;
@@ -617,8 +613,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
kbuf->cmd);
}
- out_sig:
- sigprocmask(SIG_SETMASK, &tmpsig, NULL);
out_free:
kfree(kbuf);
return error;
@@ -659,15 +653,11 @@ static int device_close(struct inode *inode, struct file *file)
{
struct dlm_user_proc *proc = file->private_data;
struct dlm_ls *ls;
- sigset_t tmpsig, allsigs;
ls = dlm_find_lockspace_local(proc->lockspace);
if (!ls)
return -ENOENT;
- sigfillset(&allsigs);
- sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
-
set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
dlm_clear_proc_locks(ls, proc);
@@ -685,9 +675,6 @@ static int device_close(struct inode *inode, struct file *file)
/* FIXME: AUTOFREE: if this ls is no longer used do
device_remove_lockspace() */
- sigprocmask(SIG_SETMASK, &tmpsig, NULL);
- recalc_sigpending();
-
return 0;
}
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index f3913eb2c474..d15ccf20f1b3 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -57,7 +57,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
struct inode *inode;
inode = iget_locked(super, ino);
- if (IS_ERR(inode))
+ if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9ad17b15b454..293f86741ddb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1792,7 +1792,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
{
int error;
int did_lock_epmutex = 0;
- struct file *file, *tfile;
+ struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
@@ -1802,20 +1802,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto error_return;
- /* Get the "struct file *" for the eventpoll file */
error = -EBADF;
- file = fget(epfd);
- if (!file)
+ f = fdget(epfd);
+ if (!f.file)
goto error_return;
/* Get the "struct file *" for the target file */
- tfile = fget(fd);
- if (!tfile)
+ tf = fdget(fd);
+ if (!tf.file)
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
- if (!tfile->f_op || !tfile->f_op->poll)
+ if (!tf.file->f_op || !tf.file->f_op->poll)
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
@@ -1828,14 +1827,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
- if (file == tfile || !is_file_epoll(file))
+ if (f.file == tf.file || !is_file_epoll(f.file))
goto error_tgt_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = file->private_data;
+ ep = f.file->private_data;
/*
* When we insert an epoll file descriptor, inside another epoll file
@@ -1854,14 +1853,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
did_lock_epmutex = 1;
}
if (op == EPOLL_CTL_ADD) {
- if (is_file_epoll(tfile)) {
+ if (is_file_epoll(tf.file)) {
error = -ELOOP;
- if (ep_loop_check(ep, tfile) != 0) {
+ if (ep_loop_check(ep, tf.file) != 0) {
clear_tfile_check_list();
goto error_tgt_fput;
}
} else
- list_add(&tfile->f_tfile_llink, &tfile_check_list);
+ list_add(&tf.file->f_tfile_llink, &tfile_check_list);
}
mutex_lock_nested(&ep->mtx, 0);
@@ -1871,14 +1870,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
- epi = ep_find(ep, tfile, fd);
+ epi = ep_find(ep, tf.file, fd);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= POLLERR | POLLHUP;
- error = ep_insert(ep, &epds, tfile, fd);
+ error = ep_insert(ep, &epds, tf.file, fd);
} else
error = -EEXIST;
clear_tfile_check_list();
@@ -1903,9 +1902,9 @@ error_tgt_fput:
if (did_lock_epmutex)
mutex_unlock(&epmutex);
- fput(tfile);
+ fdput(tf);
error_fput:
- fput(file);
+ fdput(f);
error_return:
return error;
diff --git a/fs/exec.c b/fs/exec.c
index 9c73def87642..fd774c7cb483 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -608,7 +608,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
return -ENOMEM;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, old_start, old_end);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
@@ -625,7 +625,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
free_pgd_range(&tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
}
- tlb_finish_mmu(&tlb, new_end, old_end);
+ tlb_finish_mmu(&tlb, old_start, old_end);
/*
* Shrink the vma to just the new range. Always succeeds.
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index f522425aaa24..bafdd48eefde 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -41,7 +41,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
/**
* Check if the given dir-inode refers to an htree-indexed directory
- * (or a directory which chould potentially get coverted to use htree
+ * (or a directory which could potentially get converted to use htree
* indexing).
*
* Return 1 if it is a dx dir, 0 if not
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 998ea111e537..1194b1f0f839 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1780,11 +1780,11 @@ retry:
inode->i_op = &ext3_file_inode_operations;
inode->i_fop = &ext3_file_operations;
ext3_set_aops(inode);
+ d_tmpfile(dentry, inode);
err = ext3_orphan_add(handle, inode);
if (err)
goto err_drop_inode;
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
unlock_new_inode(inode);
}
ext3_journal_stop(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index c47f14750722..c50c76190373 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -27,6 +27,7 @@
#include <linux/seq_file.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/namei.h>
#include <asm/uaccess.h>
@@ -819,6 +820,7 @@ enum {
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+ Opt_journal_path,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -860,6 +862,7 @@ static const match_table_t tokens = {
{Opt_journal_update, "journal=update"},
{Opt_journal_inum, "journal=%u"},
{Opt_journal_dev, "journal_dev=%u"},
+ {Opt_journal_path, "journal_path=%s"},
{Opt_abort, "abort"},
{Opt_data_journal, "data=journal"},
{Opt_data_ordered, "data=ordered"},
@@ -975,6 +978,11 @@ static int parse_options (char *options, struct super_block *sb,
int option;
kuid_t uid;
kgid_t gid;
+ char *journal_path;
+ struct inode *journal_inode;
+ struct path path;
+ int error;
+
#ifdef CONFIG_QUOTA
int qfmt;
#endif
@@ -1129,6 +1137,41 @@ static int parse_options (char *options, struct super_block *sb,
return 0;
*journal_devnum = option;
break;
+ case Opt_journal_path:
+ if (is_remount) {
+ ext3_msg(sb, KERN_ERR, "error: cannot specify "
+ "journal on remount");
+ return 0;
+ }
+
+ journal_path = match_strdup(&args[0]);
+ if (!journal_path) {
+ ext3_msg(sb, KERN_ERR, "error: could not dup "
+ "journal device string");
+ return 0;
+ }
+
+ error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ if (error) {
+ ext3_msg(sb, KERN_ERR, "error: could not find "
+ "journal device path: error %d", error);
+ kfree(journal_path);
+ return 0;
+ }
+
+ journal_inode = path.dentry->d_inode;
+ if (!S_ISBLK(journal_inode->i_mode)) {
+ ext3_msg(sb, KERN_ERR, "error: journal path %s "
+ "is not a block device", journal_path);
+ path_put(&path);
+ kfree(journal_path);
+ return 0;
+ }
+
+ *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ path_put(&path);
+ kfree(journal_path);
+ break;
case Opt_noload:
set_opt (sbi->s_mount_opt, NOLOAD);
break;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 58339393fa6e..dc5d572ebd6a 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -38,8 +38,8 @@ ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_group_t group;
if (test_opt2(sb, STD_GROUP_SIZE))
- group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
- block) >>
+ group = (block -
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
else
ext4_get_group_no_and_offset(sb, block, &group, NULL);
@@ -184,6 +184,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start, tmp;
int flex_bg = 0;
+ struct ext4_group_info *grp;
J_ASSERT_BH(bh, buffer_locked(bh));
@@ -191,11 +192,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
- ext4_free_group_clusters_set(sb, gdp, 0);
- ext4_free_inodes_set(sb, gdp, 0);
- ext4_itable_unused_set(sb, gdp, 0);
- memset(bh->b_data, 0xff, sb->s_blocksize);
- ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
+ grp = ext4_get_group_info(sb, block_group);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
memset(bh->b_data, 0, sb->s_blocksize);
@@ -305,7 +304,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
*/
static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
- unsigned int block_group,
+ ext4_group_t block_group,
struct buffer_head *bh)
{
ext4_grpblk_t offset;
@@ -352,10 +351,11 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
void ext4_validate_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
- unsigned int block_group,
+ ext4_group_t block_group,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
+ struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return;
@@ -366,12 +366,14 @@ void ext4_validate_block_bitmap(struct super_block *sb,
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
block_group, blk);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
desc, bh))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
set_buffer_verified(bh);
@@ -445,7 +447,10 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
return bh;
verify:
ext4_validate_block_bitmap(sb, desc, block_group, bh);
- return bh;
+ if (buffer_verified(bh))
+ return bh;
+ put_bh(bh);
+ return NULL;
}
/* Returns 0 on success, 1 on error */
@@ -469,7 +474,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
clear_buffer_new(bh);
/* Panic or remount fs read-only if block bitmap is invalid */
ext4_validate_block_bitmap(sb, desc, block_group, bh);
- return 0;
+ /* ...but check for error just in case errors=continue. */
+ return !buffer_verified(bh);
}
struct buffer_head *
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 3c7d288ae94c..680bb3388919 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,7 +33,7 @@ static int ext4_dx_readdir(struct file *, struct dir_context *);
/**
* Check if the given dir-inode refers to an htree-indexed directory
- * (or a directory which chould potentially get coverted to use htree
+ * (or a directory which could potentially get converted to use htree
* indexing).
*
* Return 1 if it is a dx dir, 0 if not
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b577e45425b0..af815ea9d7cc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -180,7 +180,6 @@ struct ext4_map_blocks {
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
-#define EXT4_IO_END_DIRECT 0x0002
/*
* For converting uninitialized extents on a work queue. 'handle' is used for
@@ -196,8 +195,6 @@ typedef struct ext4_io_end {
unsigned int flag; /* unwritten or not */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- struct kiocb *iocb; /* iocb struct for AIO */
- int result; /* error value for AIO */
atomic_t count; /* reference counter */
} ext4_io_end_t;
@@ -561,6 +558,18 @@ enum {
#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
/*
+ * The bit position of these flags must not overlap with any of the
+ * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
+ * read_extent_tree_block(), ext4_split_extent_at(),
+ * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
+ * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
+ * caching the extents when reading from the extent tree while a
+ * truncate or punch hole operation is in progress.
+ */
+#define EXT4_EX_NOCACHE 0x0400
+#define EXT4_EX_FORCE_CACHE 0x0800
+
+/*
* Flags used by ext4_free_blocks
*/
#define EXT4_FREE_BLOCKS_METADATA 0x0001
@@ -569,6 +578,7 @@ enum {
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
+#define EXT4_FREE_BLOCKS_RESERVE 0x0040
/*
* ioctl commands
@@ -590,6 +600,7 @@ enum {
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -900,11 +911,9 @@ struct ext4_inode_info {
* Completed IOs that need unwritten extents handling and don't have
* transaction reserved
*/
- struct list_head i_unrsv_conversion_list;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
struct work_struct i_rsv_conversion_work;
- struct work_struct i_unrsv_conversion_work;
spinlock_t i_block_reservation_lock;
@@ -1276,8 +1285,6 @@ struct ext4_sb_info {
struct flex_groups *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
- /* workqueue for unreserved extent convertions (dio) */
- struct workqueue_struct *unrsv_conversion_wq;
/* workqueue for reserved extent conversions (buffered io) */
struct workqueue_struct *rsv_conversion_wq;
@@ -1340,9 +1347,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- /* Writeback has to have coversion transaction reserved */
- WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
- !(io_end->flag & EXT4_IO_END_DIRECT));
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
@@ -1375,6 +1379,7 @@ enum {
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
+ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1915,7 +1920,7 @@ extern ext4_group_t ext4_get_group_number(struct super_block *sb,
extern void ext4_validate_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
- unsigned int block_group,
+ ext4_group_t block_group,
struct buffer_head *bh);
extern unsigned int ext4_block_group(struct super_block *sb,
ext4_fsblk_t blocknr);
@@ -2086,6 +2091,7 @@ extern int ext4_sync_inode(handle_t *, struct inode *);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
@@ -2416,16 +2422,32 @@ do { \
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif
+/* Update i_disksize. Requires i_mutex to avoid races with truncate */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
- /*
- * XXX: replace with spinlock if seen contended -bzzz
- */
+ WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
+ !mutex_is_locked(&inode->i_mutex));
down_write(&EXT4_I(inode)->i_data_sem);
if (newsize > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = newsize;
up_write(&EXT4_I(inode)->i_data_sem);
- return ;
+}
+
+/*
+ * Update i_disksize after writeback has been started. Races with truncate
+ * are avoided by checking i_size under i_data_sem.
+ */
+static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (newsize > i_size)
+ newsize = i_size;
+ if (newsize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = newsize;
+ up_write(&EXT4_I(inode)->i_data_sem);
}
struct ext4_group_info {
@@ -2448,9 +2470,15 @@ struct ext4_group_info {
#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1
+#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2
+#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \
+ (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \
+ (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
@@ -2654,6 +2682,12 @@ extern int ext4_check_blockref(const char *, unsigned int,
struct ext4_ext_path;
struct ext4_extent;
+/*
+ * Maximum number of logical blocks in a file; ext4_extent's ee_block is
+ * __le32.
+ */
+#define EXT_MAX_BLOCKS 0xffffffff
+
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
@@ -2683,7 +2717,8 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *,
struct ext4_ext_path *,
struct ext4_extent *, int);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
- struct ext4_ext_path *);
+ struct ext4_ext_path *,
+ int flags);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern int ext4_find_delalloc_range(struct inode *inode,
@@ -2692,7 +2727,7 @@ extern int ext4_find_delalloc_range(struct inode *inode,
extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
-
+extern int ext4_ext_precache(struct inode *inode);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2715,7 +2750,6 @@ extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc);
extern void ext4_end_io_rsv_work(struct work_struct *work);
-extern void ext4_end_io_unrsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 51bc821ade90..5074fe23f19e 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -134,12 +134,6 @@ struct ext4_ext_path {
*/
/*
- * Maximum number of logical blocks in a file; ext4_extent's ee_block is
- * __le32.
- */
-#define EXT_MAX_BLOCKS 0xffffffff
-
-/*
* EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
* initialized extent. This is 2^15 and not (2^16 - 1), since we use the
* MSB of ee_len field in the extent datastructure to signify if this
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 72a3600aedbd..17ac112ab101 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -255,10 +255,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
set_buffer_prio(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
- if (err) {
- /* Errors can only happen if there is a bug */
- handle->h_err = err;
- __ext4_journal_stop(where, line, handle);
+ /* Errors can only happen if there is a bug */
+ if (WARN_ON_ONCE(err)) {
+ ext4_journal_abort_handle(where, line, __func__, bh,
+ handle, err);
}
} else {
if (inode)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 2877258d9497..81cfefa9dc0c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -197,7 +197,7 @@ static inline void ext4_journal_callback_add(handle_t *handle,
* ext4_journal_callback_del: delete a registered callback
* @handle: active journal transaction handle on which callback was registered
* @jce: registered journal callback entry to unregister
- * Return true if object was sucessfully removed
+ * Return true if object was successfully removed
*/
static inline bool ext4_journal_callback_try_del(handle_t *handle,
struct ext4_journal_cb_entry *jce)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7097b0f680e6..54d52afcdb19 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -407,7 +407,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
- int depth)
+ int depth, ext4_fsblk_t pblk)
{
const char *error_msg;
int max = 0;
@@ -447,42 +447,149 @@ static int __ext4_ext_check(const char *function, unsigned int line,
corrupted:
ext4_error_inode(inode, function, line, 0,
- "bad header/extent: %s - magic %x, "
- "entries %u, max %u(%u), depth %u(%u)",
- error_msg, le16_to_cpu(eh->eh_magic),
- le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
- max, le16_to_cpu(eh->eh_depth), depth);
-
+ "pblk %llu bad header/extent: %s - magic %x, "
+ "entries %u, max %u(%u), depth %u(%u)",
+ (unsigned long long) pblk, error_msg,
+ le16_to_cpu(eh->eh_magic),
+ le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
+ max, le16_to_cpu(eh->eh_depth), depth);
return -EIO;
}
-#define ext4_ext_check(inode, eh, depth) \
- __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth, pblk) \
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
int ext4_ext_check_inode(struct inode *inode)
{
- return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+ return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}
-static int __ext4_ext_check_block(const char *function, unsigned int line,
- struct inode *inode,
- struct ext4_extent_header *eh,
- int depth,
- struct buffer_head *bh)
+static struct buffer_head *
+__read_extent_tree_block(const char *function, unsigned int line,
+ struct inode *inode, ext4_fsblk_t pblk, int depth,
+ int flags)
{
- int ret;
+ struct buffer_head *bh;
+ int err;
- if (buffer_verified(bh))
- return 0;
- ret = ext4_ext_check(inode, eh, depth);
- if (ret)
- return ret;
+ bh = sb_getblk(inode->i_sb, pblk);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
+
+ if (!bh_uptodate_or_lock(bh)) {
+ trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
+ err = bh_submit_read(bh);
+ if (err < 0)
+ goto errout;
+ }
+ if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
+ return bh;
+ err = __ext4_ext_check(function, line, inode,
+ ext_block_hdr(bh), depth, pblk);
+ if (err)
+ goto errout;
set_buffer_verified(bh);
- return ret;
+ /*
+ * If this is a leaf block, cache all of its entries
+ */
+ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
+ struct ext4_extent_header *eh = ext_block_hdr(bh);
+ struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
+ ext4_lblk_t prev = 0;
+ int i;
+
+ for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
+ unsigned int status = EXTENT_STATUS_WRITTEN;
+ ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
+ int len = ext4_ext_get_actual_len(ex);
+
+ if (prev && (prev != lblk))
+ ext4_es_cache_extent(inode, prev,
+ lblk - prev, ~0,
+ EXTENT_STATUS_HOLE);
+
+ if (ext4_ext_is_uninitialized(ex))
+ status = EXTENT_STATUS_UNWRITTEN;
+ ext4_es_cache_extent(inode, lblk, len,
+ ext4_ext_pblock(ex), status);
+ prev = lblk + len;
+ }
+ }
+ return bh;
+errout:
+ put_bh(bh);
+ return ERR_PTR(err);
+
}
-#define ext4_ext_check_block(inode, eh, depth, bh) \
- __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
+#define read_extent_tree_block(inode, pblk, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+ (depth), (flags))
+
+/*
+ * This function is called to cache a file's extent information in the
+ * extent status tree
+ */
+int ext4_ext_precache(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_ext_path *path = NULL;
+ struct buffer_head *bh;
+ int i = 0, depth, ret = 0;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return 0; /* not an extent-mapped inode */
+
+ down_read(&ei->i_data_sem);
+ depth = ext_depth(inode);
+
+ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+ GFP_NOFS);
+ if (path == NULL) {
+ up_read(&ei->i_data_sem);
+ return -ENOMEM;
+ }
+
+ /* Don't cache anything if there are no external extent blocks */
+ if (depth == 0)
+ goto out;
+ path[0].p_hdr = ext_inode_hdr(inode);
+ ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
+ if (ret)
+ goto out;
+ path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
+ while (i >= 0) {
+ /*
+ * If this is a leaf block or we've reached the end of
+ * the index block, go up
+ */
+ if ((i == depth) ||
+ path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
+ brelse(path[i].p_bh);
+ path[i].p_bh = NULL;
+ i--;
+ continue;
+ }
+ bh = read_extent_tree_block(inode,
+ ext4_idx_pblock(path[i].p_idx++),
+ depth - i - 1,
+ EXT4_EX_FORCE_CACHE);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
+ break;
+ }
+ i++;
+ path[i].p_bh = bh;
+ path[i].p_hdr = ext_block_hdr(bh);
+ path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+out:
+ up_read(&ei->i_data_sem);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return ret;
+}
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -716,7 +823,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path, int flags)
{
struct ext4_extent_header *eh;
struct buffer_head *bh;
@@ -748,20 +855,13 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = sb_getblk(inode->i_sb, path[ppos].p_block);
- if (unlikely(!bh)) {
- ret = -ENOMEM;
+ bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
+ flags);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
goto err;
}
- if (!bh_uptodate_or_lock(bh)) {
- trace_ext4_ext_load_extent(inode, block,
- path[ppos].p_block);
- ret = bh_submit_read(bh);
- if (ret < 0) {
- put_bh(bh);
- goto err;
- }
- }
+
eh = ext_block_hdr(bh);
ppos++;
if (unlikely(ppos > depth)) {
@@ -773,11 +873,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
}
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
- i--;
-
- ret = ext4_ext_check_block(inode, eh, i, bh);
- if (ret < 0)
- goto err;
}
path[ppos].p_depth = i;
@@ -1198,7 +1293,8 @@ out:
* if no free index is found, then it requests in-depth growing.
*/
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
- unsigned int flags,
+ unsigned int mb_flags,
+ unsigned int gb_flags,
struct ext4_ext_path *path,
struct ext4_extent *newext)
{
@@ -1220,7 +1316,7 @@ repeat:
if (EXT_HAS_FREE_INDEX(curp)) {
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
- err = ext4_ext_split(handle, inode, flags, path, newext, i);
+ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
if (err)
goto out;
@@ -1228,12 +1324,12 @@ repeat:
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path);
+ path, gb_flags);
if (IS_ERR(path))
err = PTR_ERR(path);
} else {
/* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, flags, newext);
+ err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
if (err)
goto out;
@@ -1241,7 +1337,7 @@ repeat:
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path);
+ path, gb_flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -1412,29 +1508,21 @@ got_index:
ix++;
block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
- bh = sb_bread(inode->i_sb, block);
- if (bh == NULL)
- return -EIO;
- eh = ext_block_hdr(bh);
/* subtract from p_depth to get proper eh_depth */
- if (ext4_ext_check_block(inode, eh,
- path->p_depth - depth, bh)) {
- put_bh(bh);
- return -EIO;
- }
+ bh = read_extent_tree_block(inode, block,
+ path->p_depth - depth, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+ eh = ext_block_hdr(bh);
ix = EXT_FIRST_INDEX(eh);
block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = sb_bread(inode->i_sb, block);
- if (bh == NULL)
- return -EIO;
+ bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
eh = ext_block_hdr(bh);
- if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
- put_bh(bh);
- return -EIO;
- }
ex = EXT_FIRST_EXTENT(eh);
found_extent:
*logical = le32_to_cpu(ex->ee_block);
@@ -1705,7 +1793,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
brelse(path[1].p_bh);
ext4_free_blocks(handle, inode, NULL, blk, 1,
- EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET |
+ EXT4_FREE_BLOCKS_RESERVE);
}
/*
@@ -1793,7 +1882,7 @@ out:
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext, int flag)
+ struct ext4_extent *newext, int gb_flags)
{
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
@@ -1802,7 +1891,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
int depth, len, err;
ext4_lblk_t next;
unsigned uninitialized = 0;
- int flags = 0;
+ int mb_flags = 0;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1817,7 +1906,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
}
/* try to insert block into found extent and return */
- if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
+ if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
/*
* Try to see whether we should rather test the extent on
@@ -1920,7 +2009,7 @@ prepend:
if (next != EXT_MAX_BLOCKS) {
ext_debug("next leaf block - %u\n", next);
BUG_ON(npath != NULL);
- npath = ext4_ext_find_extent(inode, next, NULL);
+ npath = ext4_ext_find_extent(inode, next, NULL, 0);
if (IS_ERR(npath))
return PTR_ERR(npath);
BUG_ON(npath->p_depth != path->p_depth);
@@ -1939,9 +2028,10 @@ prepend:
* There is no free space in the found leaf.
* We're gonna add a new leaf in the tree.
*/
- if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
- flags = EXT4_MB_USE_RESERVED;
- err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
+ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+ mb_flags = EXT4_MB_USE_RESERVED;
+ err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
+ path, newext);
if (err)
goto cleanup;
depth = ext_depth(inode);
@@ -2007,7 +2097,7 @@ has_space:
merge:
/* try to merge extents */
- if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
+ if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
ext4_ext_try_to_merge(handle, inode, path, nearex);
@@ -2050,7 +2140,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
path = NULL;
}
- path = ext4_ext_find_extent(inode, block, path);
+ path = ext4_ext_find_extent(inode, block, path, 0);
if (IS_ERR(path)) {
up_read(&EXT4_I(inode)->i_data_sem);
err = PTR_ERR(path);
@@ -2195,8 +2285,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
ext4_lblk_t block)
{
int depth = ext_depth(inode);
- unsigned long len;
- ext4_lblk_t lblock;
+ unsigned long len = 0;
+ ext4_lblk_t lblock = 0;
struct ext4_extent *ex;
ex = path[depth].p_ext;
@@ -2233,7 +2323,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
ext4_es_insert_extent(inode, lblock, len, ~0,
EXTENT_STATUS_HOLE);
} else {
- lblock = len = 0;
BUG();
}
@@ -2712,7 +2801,7 @@ again:
ext4_lblk_t ee_block;
/* find extent for this block */
- path = ext4_ext_find_extent(inode, end, NULL);
+ path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -2754,6 +2843,7 @@ again:
*/
err = ext4_split_extent_at(handle, inode, path,
end + 1, split_flag,
+ EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
@@ -2782,7 +2872,7 @@ again:
path[0].p_hdr = ext_inode_hdr(inode);
i = 0;
- if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
+ if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
err = -EIO;
goto out;
}
@@ -2829,21 +2919,21 @@ again:
ext_debug("move to level %d (block %llu)\n",
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
- if (!bh) {
+ bh = read_extent_tree_block(inode,
+ ext4_idx_pblock(path[i].p_idx), depth - i - 1,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(bh)) {
/* should we reset i_size? */
- err = -EIO;
+ err = PTR_ERR(bh);
break;
}
+ /* Yield here to deal with large extent trees.
+ * Should be a no-op if we did IO above. */
+ cond_resched();
if (WARN_ON(i + 1 > depth)) {
err = -EIO;
break;
}
- if (ext4_ext_check_block(inode, ext_block_hdr(bh),
- depth - i - 1, bh)) {
- err = -EIO;
- break;
- }
path[i + 1].p_bh = bh;
/* save actual number of indexes since this
@@ -2958,6 +3048,23 @@ void ext4_ext_release(struct super_block *sb)
#endif
}
+static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
+{
+ ext4_lblk_t ee_block;
+ ext4_fsblk_t ee_pblock;
+ unsigned int ee_len;
+
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ ee_pblock = ext4_ext_pblock(ex);
+
+ if (ee_len == 0)
+ return 0;
+
+ return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+ EXTENT_STATUS_WRITTEN);
+}
+
/* FIXME!! we need to try to merge to left or right after zero-out */
static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
@@ -3110,7 +3217,7 @@ static int ext4_split_extent_at(handle_t *handle,
goto fix_extent_len;
/* update extent status tree */
- err = ext4_es_zeroout(inode, &zero_ex);
+ err = ext4_zeroout_es(inode, &zero_ex);
goto out;
} else if (err)
@@ -3130,7 +3237,7 @@ fix_extent_len:
* ext4_split_extents() splits an extent and mark extent which is covered
* by @map as split_flags indicates
*
- * It may result in splitting the extent into multiple extents (upto three)
+ * It may result in splitting the extent into multiple extents (up to three)
* There are three possibilities:
* a> There is no split required
* b> Splits in two extents: Split is happening at either end of the extent
@@ -3178,7 +3285,7 @@ static int ext4_split_extent(handle_t *handle,
* result in split of original leaf or extent zeroout.
*/
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
if (IS_ERR(path))
return PTR_ERR(path);
depth = ext_depth(inode);
@@ -3461,7 +3568,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
out:
/* If we have gotten a failure, don't zero out status tree */
if (!err)
- err = ext4_es_zeroout(inode, &zero_ex);
+ err = ext4_zeroout_es(inode, &zero_ex);
return err ? err : allocated;
}
@@ -3562,7 +3669,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
if (err < 0)
goto out;
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -4049,7 +4156,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
+ path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
@@ -4261,8 +4368,8 @@ got_allocated_blocks:
/* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free() */
ext4_discard_preallocations(inode);
- ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
- ext4_ext_get_actual_len(&newex), fb_flags);
+ ext4_free_blocks(handle, inode, NULL, newblock,
+ EXT4_C2B(sbi, allocated_clusters), fb_flags);
goto out2;
}
@@ -4382,8 +4489,9 @@ out2:
}
out3:
- trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated);
-
+ trace_ext4_ext_map_blocks_exit(inode, flags, map,
+ err ? err : allocated);
+ ext4_es_lru_add(inode);
return err ? err : allocated;
}
@@ -4405,9 +4513,20 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
+retry:
err = ext4_es_remove_extent(inode, last_block,
EXT_MAX_BLOCKS - last_block);
+ if (err == -ENOMEM) {
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ if (err) {
+ ext4_std_error(inode->i_sb, err);
+ return;
+ }
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+ ext4_std_error(inode->i_sb, err);
}
static void ext4_falloc_update_inode(struct inode *inode,
@@ -4729,6 +4848,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return error;
}
+ if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ error = ext4_ext_precache(inode);
+ if (error)
+ return error;
+ }
+
/* fallback to generic here if not in extents fmt */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return generic_block_fiemap(inode, fieinfo, start, len,
@@ -4756,6 +4881,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
error = ext4_fill_fiemap_extents(inode, start_blk,
len_blks, fieinfo);
}
-
+ ext4_es_lru_add(inode);
return error;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ee018d5f397e..2d1bdbe78c04 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -13,7 +13,6 @@
#include <linux/list_sort.h>
#include "ext4.h"
#include "extents_status.h"
-#include "ext4_extents.h"
#include <trace/events/ext4.h>
@@ -148,6 +147,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end);
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
int nr_to_scan);
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei);
int __init ext4_init_es(void)
{
@@ -261,7 +262,7 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
if (tree->cache_es) {
es1 = tree->cache_es;
if (in_range(lblk, es1->es_lblk, es1->es_len)) {
- es_debug("%u cached by [%u/%u) %llu %llx\n",
+ es_debug("%u cached by [%u/%u) %llu %x\n",
lblk, es1->es_lblk, es1->es_len,
ext4_es_pblock(es1), ext4_es_status(es1));
goto out;
@@ -407,6 +408,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
}
#ifdef ES_AGGRESSIVE_TEST
+#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */
+
static void ext4_es_insert_extent_ext_check(struct inode *inode,
struct extent_status *es)
{
@@ -417,7 +420,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
unsigned short ee_len;
int depth, ee_status, es_status;
- path = ext4_ext_find_extent(inode, es->es_lblk, NULL);
+ path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
if (IS_ERR(path))
return;
@@ -439,7 +442,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
*/
if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
if (in_range(es->es_lblk, ee_block, ee_len)) {
- pr_warn("ES insert assertation failed for "
+ pr_warn("ES insert assertion failed for "
"inode: %lu we can find an extent "
"at block [%d/%d/%llu/%c], but we "
"want to add an delayed/hole extent "
@@ -458,7 +461,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
*/
if (es->es_lblk < ee_block ||
ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"ex_status [%d/%d/%llu/%c] != "
"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
ee_block, ee_len, ee_start,
@@ -468,7 +471,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
}
if (ee_status ^ es_status) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"ex_status [%d/%d/%llu/%c] != "
"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
ee_block, ee_len, ee_start,
@@ -481,7 +484,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
* that we don't want to add an written/unwritten extent.
*/
if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"can't find an extent at block %d but we want "
"to add an written/unwritten extent "
"[%d/%d/%llu/%llx]\n", inode->i_ino,
@@ -519,7 +522,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
* We want to add a delayed/hole extent but this
* block has been allocated.
*/
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"We can find blocks but we want to add a "
"delayed/hole extent [%d/%d/%llu/%llx]\n",
inode->i_ino, es->es_lblk, es->es_len,
@@ -527,13 +530,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
return;
} else if (ext4_es_is_written(es)) {
if (retval != es->es_len) {
- pr_warn("ES insert assertation failed for "
+ pr_warn("ES insert assertion failed for "
"inode: %lu retval %d != es_len %d\n",
inode->i_ino, retval, es->es_len);
return;
}
if (map.m_pblk != ext4_es_pblock(es)) {
- pr_warn("ES insert assertation failed for "
+ pr_warn("ES insert assertion failed for "
"inode: %lu m_pblk %llu != "
"es_pblk %llu\n",
inode->i_ino, map.m_pblk,
@@ -549,7 +552,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
}
} else if (retval == 0) {
if (ext4_es_is_written(es)) {
- pr_warn("ES insert assertation failed for inode: %lu "
+ pr_warn("ES insert assertion failed for inode: %lu "
"We can't find the block but we want to add "
"an written extent [%d/%d/%llu/%llx]\n",
inode->i_ino, es->es_lblk, es->es_len,
@@ -632,22 +635,20 @@ out:
}
/*
- * ext4_es_insert_extent() adds a space to a extent status tree.
- *
- * ext4_es_insert_extent is called by ext4_da_write_begin and
- * ext4_es_remove_extent.
+ * ext4_es_insert_extent() adds information to an inode's extent
+ * status tree.
*
* Return 0 on success, error code on failure.
*/
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned long long status)
+ unsigned int status)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
int err = 0;
- es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
+ es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
if (!len)
@@ -667,7 +668,13 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
err = __es_remove_extent(inode, lblk, end);
if (err != 0)
goto error;
+retry:
err = __es_insert_extent(inode, &newes);
+ if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+ EXT4_I(inode)))
+ goto retry;
+ if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+ err = 0;
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -678,6 +685,38 @@ error:
}
/*
+ * ext4_es_cache_extent() inserts information into the extent status
+ * tree if and only if there isn't information about the range in
+ * question already.
+ */
+void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status)
+{
+ struct extent_status *es;
+ struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
+
+ newes.es_lblk = lblk;
+ newes.es_len = len;
+ ext4_es_store_pblock(&newes, pblk);
+ ext4_es_store_status(&newes, status);
+ trace_ext4_es_cache_extent(inode, &newes);
+
+ if (!len)
+ return;
+
+ BUG_ON(end < lblk);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+
+ es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
+ if (!es || es->es_lblk > end)
+ __es_insert_extent(inode, &newes);
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+}
+
+/*
* ext4_es_lookup_extent() looks up an extent in extent status tree.
*
* ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
@@ -746,8 +785,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status orig_es;
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
- int err = 0;
+ int err;
+retry:
+ err = 0;
es = __es_tree_search(&tree->root, lblk);
if (!es)
goto out;
@@ -782,6 +823,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (err) {
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
+ if ((err == -ENOMEM) &&
+ __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+ EXT4_I(inode)))
+ goto retry;
goto out;
}
} else {
@@ -859,23 +904,6 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
return err;
}
-int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
-{
- ext4_lblk_t ee_block;
- ext4_fsblk_t ee_pblock;
- unsigned int ee_len;
-
- ee_block = le32_to_cpu(ex->ee_block);
- ee_len = ext4_ext_get_actual_len(ex);
- ee_pblock = ext4_ext_pblock(ex);
-
- if (ee_len == 0)
- return 0;
-
- return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
- EXTENT_STATUS_WRITTEN);
-}
-
static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
struct list_head *b)
{
@@ -883,6 +911,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
eia = list_entry(a, struct ext4_inode_info, i_es_lru);
eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+ if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return 1;
+ if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return -1;
if (eia->i_touch_when == eib->i_touch_when)
return 0;
if (time_after(eia->i_touch_when, eib->i_touch_when))
@@ -891,34 +925,18 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
return -1;
}
-static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+ struct ext4_inode_info *locked_ei)
{
- struct ext4_sb_info *sbi = container_of(shrink,
- struct ext4_sb_info, s_es_shrinker);
struct ext4_inode_info *ei;
struct list_head *cur, *tmp;
- LIST_HEAD(skiped);
- int nr_to_scan = sc->nr_to_scan;
+ LIST_HEAD(skipped);
int ret, nr_shrunk = 0;
-
- ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
- trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
-
- if (!nr_to_scan)
- return ret;
+ int retried = 0, skip_precached = 1, nr_skipped = 0;
spin_lock(&sbi->s_es_lru_lock);
- /*
- * If the inode that is at the head of LRU list is newer than
- * last_sorted time, that means that we need to sort this list.
- */
- ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
- if (sbi->s_es_last_sorted < ei->i_touch_when) {
- list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
- sbi->s_es_last_sorted = jiffies;
- }
-
+retry:
list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
/*
* If we have already reclaimed all extents from extent
@@ -929,13 +947,20 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
- /* Skip the inode that is newer than the last_sorted time */
- if (sbi->s_es_last_sorted < ei->i_touch_when) {
- list_move_tail(cur, &skiped);
+ /*
+ * Skip the inode that is newer than the last_sorted
+ * time. Normally we try hard to avoid shrinking
+ * precached inodes, but we will as a last resort.
+ */
+ if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
+ (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED))) {
+ nr_skipped++;
+ list_move_tail(cur, &skipped);
continue;
}
- if (ei->i_es_lru_nr == 0)
+ if (ei->i_es_lru_nr == 0 || ei == locked_ei)
continue;
write_lock(&ei->i_es_lock);
@@ -951,9 +976,52 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
}
/* Move the newer inodes into the tail of the LRU list. */
- list_splice_tail(&skiped, &sbi->s_es_lru);
+ list_splice_tail(&skipped, &sbi->s_es_lru);
+ INIT_LIST_HEAD(&skipped);
+
+ /*
+ * If we skipped any inodes, and we weren't able to make any
+ * forward progress, sort the list and try again.
+ */
+ if ((nr_shrunk == 0) && nr_skipped && !retried) {
+ retried++;
+ list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+ sbi->s_es_last_sorted = jiffies;
+ ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+ i_es_lru);
+ /*
+ * If there are no non-precached inodes left on the
+ * list, start releasing precached extents.
+ */
+ if (ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED))
+ skip_precached = 0;
+ goto retry;
+ }
+
spin_unlock(&sbi->s_es_lru_lock);
+ if (locked_ei && nr_shrunk == 0)
+ nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+
+ return nr_shrunk;
+}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct ext4_sb_info *sbi = container_of(shrink,
+ struct ext4_sb_info, s_es_shrinker);
+ int nr_to_scan = sc->nr_to_scan;
+ int ret, nr_shrunk;
+
+ ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+ if (!nr_to_scan)
+ return ret;
+
+ nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+
ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
return ret;
@@ -1009,10 +1077,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
struct rb_node *node;
struct extent_status *es;
int nr_shrunk = 0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
if (ei->i_es_lru_nr == 0)
return 0;
+ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+ __ratelimit(&_rs))
+ ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
node = rb_first(&tree->root);
while (node != NULL) {
es = rb_entry(node, struct extent_status, rb_node);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index e936730cc5b0..167f4ab8ecc3 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -29,16 +29,26 @@
/*
* These flags live in the high bits of extent_status.es_pblk
*/
-#define EXTENT_STATUS_WRITTEN (1ULL << 63)
-#define EXTENT_STATUS_UNWRITTEN (1ULL << 62)
-#define EXTENT_STATUS_DELAYED (1ULL << 61)
-#define EXTENT_STATUS_HOLE (1ULL << 60)
+#define ES_SHIFT 60
+
+#define EXTENT_STATUS_WRITTEN (1 << 3)
+#define EXTENT_STATUS_UNWRITTEN (1 << 2)
+#define EXTENT_STATUS_DELAYED (1 << 1)
+#define EXTENT_STATUS_HOLE (1 << 0)
#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
EXTENT_STATUS_UNWRITTEN | \
EXTENT_STATUS_DELAYED | \
EXTENT_STATUS_HOLE)
+#define ES_WRITTEN (1ULL << 63)
+#define ES_UNWRITTEN (1ULL << 62)
+#define ES_DELAYED (1ULL << 61)
+#define ES_HOLE (1ULL << 60)
+
+#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \
+ ES_DELAYED | ES_HOLE)
+
struct ext4_sb_info;
struct ext4_extent;
@@ -60,7 +70,10 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned long long status);
+ unsigned int status);
+extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_es_find_delayed_extent_range(struct inode *inode,
@@ -68,36 +81,35 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es);
-extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex);
static inline int ext4_es_is_written(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0;
+ return (es->es_pblk & ES_WRITTEN) != 0;
}
static inline int ext4_es_is_unwritten(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0;
+ return (es->es_pblk & ES_UNWRITTEN) != 0;
}
static inline int ext4_es_is_delayed(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0;
+ return (es->es_pblk & ES_DELAYED) != 0;
}
static inline int ext4_es_is_hole(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_HOLE) != 0;
+ return (es->es_pblk & ES_HOLE) != 0;
}
-static inline ext4_fsblk_t ext4_es_status(struct extent_status *es)
+static inline unsigned int ext4_es_status(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_FLAGS);
+ return es->es_pblk >> ES_SHIFT;
}
static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
{
- return (es->es_pblk & ~EXTENT_STATUS_FLAGS);
+ return es->es_pblk & ~ES_MASK;
}
static inline void ext4_es_store_pblock(struct extent_status *es,
@@ -105,19 +117,16 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
{
ext4_fsblk_t block;
- block = (pb & ~EXTENT_STATUS_FLAGS) |
- (es->es_pblk & EXTENT_STATUS_FLAGS);
+ block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK);
es->es_pblk = block;
}
static inline void ext4_es_store_status(struct extent_status *es,
- unsigned long long status)
+ unsigned int status)
{
- ext4_fsblk_t block;
-
- block = (status & EXTENT_STATUS_FLAGS) |
- (es->es_pblk & ~EXTENT_STATUS_FLAGS);
- es->es_pblk = block;
+ es->es_pblk = (((ext4_fsblk_t)
+ (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+ (es->es_pblk & ~ES_MASK));
}
extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6f4cc567c382..3da21945ff1f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -149,7 +149,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
@@ -219,7 +219,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
{
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct ext4_inode_info *ei = EXT4_I(inode);
struct vfsmount *mnt = filp->f_path.mnt;
struct path path;
char buf[64], *cp;
@@ -259,22 +258,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
*/
- if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
- struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
-
- spin_lock(&inode->i_lock);
- if (!ei->jinode) {
- if (!jinode) {
- spin_unlock(&inode->i_lock);
- return -ENOMEM;
- }
- ei->jinode = jinode;
- jbd2_journal_init_jbd_inode(ei->jinode, inode);
- jinode = NULL;
- }
- spin_unlock(&inode->i_lock);
- if (unlikely(jinode != NULL))
- jbd2_free_inode(jinode);
+ if (filp->f_mode & FMODE_WRITE) {
+ int ret = ext4_inode_attach_jinode(inode);
+ if (ret < 0)
+ return ret;
}
return dquot_file_open(inode, filp);
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f03598c6ffd3..137193ff389b 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -70,18 +70,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
+ struct ext4_group_info *grp;
J_ASSERT_BH(bh, buffer_locked(bh));
/* If checksum is bad mark all blocks and inodes use to prevent
* allocation, essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
- ext4_free_group_clusters_set(sb, gdp, 0);
- ext4_free_inodes_set(sb, gdp, 0);
- ext4_itable_unused_set(sb, gdp, 0);
- memset(bh->b_data, 0xff, sb->s_blocksize);
- ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ grp = ext4_get_group_info(sb, block_group);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return 0;
}
@@ -117,6 +115,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
struct ext4_group_desc *desc;
struct buffer_head *bh = NULL;
ext4_fsblk_t bitmap_blk;
+ struct ext4_group_info *grp;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
@@ -185,6 +184,8 @@ verify:
put_bh(bh);
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
"inode_bitmap = %llu", block_group, bitmap_blk);
+ grp = ext4_get_group_info(sb, block_group);
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return NULL;
}
ext4_unlock_group(sb, block_group);
@@ -221,6 +222,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
int fatal = 0, err, count, cleared;
+ struct ext4_group_info *grp;
if (!sb) {
printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
@@ -266,7 +268,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
- if (!bitmap_bh)
+ /* Don't bother if the inode bitmap is corrupt. */
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
goto error_return;
BUFFER_TRACE(bitmap_bh, "get_write_access");
@@ -315,8 +319,10 @@ out:
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (!fatal)
fatal = err;
- } else
+ } else {
ext4_error(sb, "bit already cleared for inode %lu", ino);
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ }
error_return:
brelse(bitmap_bh);
@@ -625,6 +631,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
}
/*
+ * In no journal mode, if an inode has recently been deleted, we want
+ * to avoid reusing it until we're reasonably sure the inode table
+ * block has been written back to disk. (Yes, these values are
+ * somewhat arbitrary...)
+ */
+#define RECENTCY_MIN 5
+#define RECENTCY_DIRTY 30
+
+static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
+{
+ struct ext4_group_desc *gdp;
+ struct ext4_inode *raw_inode;
+ struct buffer_head *bh;
+ unsigned long dtime, now;
+ int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ int offset, ret = 0, recentcy = RECENTCY_MIN;
+
+ gdp = ext4_get_group_desc(sb, group, NULL);
+ if (unlikely(!gdp))
+ return 0;
+
+ bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
+ (ino / inodes_per_block));
+ if (unlikely(!bh) || !buffer_uptodate(bh))
+ /*
+ * If the block is not in the buffer cache, then it
+ * must have been written out.
+ */
+ goto out;
+
+ offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
+ raw_inode = (struct ext4_inode *) (bh->b_data + offset);
+ dtime = le32_to_cpu(raw_inode->i_dtime);
+ now = get_seconds();
+ if (buffer_dirty(bh))
+ recentcy += RECENTCY_DIRTY;
+
+ if (dtime && (dtime < now) && (now < dtime + recentcy))
+ ret = 1;
+out:
+ brelse(bh);
+ return ret;
+}
+
+/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
@@ -652,6 +703,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
struct inode *ret;
ext4_group_t i;
ext4_group_t flex_group;
+ struct ext4_group_info *grp;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -725,25 +777,39 @@ got_group:
continue;
}
+ grp = ext4_get_group_info(sb, group);
+ /* Skip groups with already-known suspicious inode tables */
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
+
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
- if (!inode_bitmap_bh)
- goto out;
+ /* Skip groups with suspicious inode tables */
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
inode_bitmap_bh->b_data,
EXT4_INODES_PER_GROUP(sb), ino);
- if (ino >= EXT4_INODES_PER_GROUP(sb)) {
- if (++group == ngroups)
- group = 0;
- continue;
- }
+ if (ino >= EXT4_INODES_PER_GROUP(sb))
+ goto next_group;
if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
ext4_error(sb, "reserved inode found cleared - "
"inode=%lu", ino + 1);
continue;
}
+ if ((EXT4_SB(sb)->s_journal == NULL) &&
+ recently_deleted(sb, group, ino)) {
+ ino++;
+ goto next_inode;
+ }
if (!handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
@@ -767,8 +833,12 @@ repeat_in_this_group:
ino++; /* the inode bitmap is zero-based */
if (!ret2)
goto got; /* we grabbed the inode! */
+next_inode:
if (ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
+next_group:
+ if (++group == ngroups)
+ group = 0;
}
err = -ENOSPC;
goto out;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 87b30cd357e7..594009f5f523 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -23,7 +23,6 @@
#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "truncate.h"
-#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
#include <trace/events/ext4.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0188e65e1f58..c79fd7dabe79 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -465,7 +465,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
if (es_map->m_lblk != map->m_lblk ||
es_map->m_flags != map->m_flags ||
es_map->m_pblk != map->m_pblk) {
- printk("ES cache assertation failed for inode: %lu "
+ printk("ES cache assertion failed for inode: %lu "
"es_cached ex [%d/%d/%llu/%x] != "
"found ex [%d/%d/%llu/%x] retval %d flags %x\n",
inode->i_ino, es_map->m_lblk, es_map->m_len,
@@ -514,10 +514,9 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
- ext4_es_lru_add(inode);
-
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ ext4_es_lru_add(inode);
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -554,16 +553,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
}
if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
-#ifdef ES_AGGRESSIVE_TEST
- if (retval != map->m_len) {
- printk("ES len assertation failed for inode: %lu "
- "retval %d != map->m_len %d "
- "in %s (lookup)\n", inode->i_ino, retval,
- map->m_len, __func__);
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
}
-#endif
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -655,16 +653,15 @@ found:
if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
-#ifdef ES_AGGRESSIVE_TEST
- if (retval != map->m_len) {
- printk("ES len assertation failed for inode: %lu "
- "retval %d != map->m_len %d "
- "in %s (allocation)\n", inode->i_ino, retval,
- map->m_len, __func__);
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
}
-#endif
/*
* If the extent has been zeroed out, we don't need to update
@@ -730,8 +727,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret > 0) {
+ ext4_io_end_t *io_end = ext4_inode_aio(inode);
+
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
+ set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
@@ -972,7 +973,8 @@ retry_journal:
ext4_journal_stop(handle);
goto retry_grab;
}
- wait_on_page_writeback(page);
+ /* In case writeback began while the page was unlocked */
+ wait_for_stable_page(page);
if (ext4_should_dioread_nolock(inode))
ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -1529,11 +1531,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
"logical block %lu\n", inode->i_ino, map->m_len,
(unsigned long) map->m_lblk);
- ext4_es_lru_add(inode);
-
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, &es)) {
-
+ ext4_es_lru_add(inode);
if (ext4_es_is_hole(&es)) {
retval = 0;
down_read((&EXT4_I(inode)->i_data_sem));
@@ -1638,16 +1638,15 @@ add_delayed:
set_buffer_delay(bh);
} else if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
-#ifdef ES_AGGRESSIVE_TEST
- if (retval != map->m_len) {
- printk("ES len assertation failed for inode: %lu "
- "retval %d != map->m_len %d "
- "in %s (lookup)\n", inode->i_ino, retval,
- map->m_len, __func__);
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
}
-#endif
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -1896,12 +1895,32 @@ static int ext4_writepage(struct page *page,
return ret;
}
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+ int len;
+ loff_t size = i_size_read(mpd->inode);
+ int err;
+
+ BUG_ON(page->index != mpd->first_page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ clear_page_dirty_for_io(page);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+ if (!err)
+ mpd->wbc->nr_to_write--;
+ mpd->first_page++;
+
+ return err;
+}
+
#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
/*
* mballoc gives us at most this number of blocks...
* XXX: That seems to be only a limitation of ext4_mb_normalize_request().
- * The rest of mballoc seems to handle chunks upto full group size.
+ * The rest of mballoc seems to handle chunks up to full group size.
*/
#define MAX_WRITEPAGES_EXTENT_LEN 2048
@@ -1910,82 +1929,94 @@ static int ext4_writepage(struct page *page,
*
* @mpd - extent of blocks
* @lblk - logical number of the block in the file
- * @b_state - b_state of the buffer head added
+ * @bh - buffer head we want to add to the extent
*
- * the function is used to collect contig. blocks in same state
+ * The function is used to collect contig. blocks in the same state. If the
+ * buffer doesn't require mapping for writeback and we haven't started the
+ * extent of buffers to map yet, the function returns 'true' immediately - the
+ * caller can write the buffer right away. Otherwise the function returns true
+ * if the block has been added to the extent, false if the block couldn't be
+ * added.
*/
-static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
- unsigned long b_state)
+static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+ struct buffer_head *bh)
{
struct ext4_map_blocks *map = &mpd->map;
- /* Don't go larger than mballoc is willing to allocate */
- if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
- return 0;
+ /* Buffer that doesn't need mapping for writeback? */
+ if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+ (!buffer_delay(bh) && !buffer_unwritten(bh))) {
+ /* So far no extent to map => we write the buffer right away */
+ if (map->m_len == 0)
+ return true;
+ return false;
+ }
/* First block in the extent? */
if (map->m_len == 0) {
map->m_lblk = lblk;
map->m_len = 1;
- map->m_flags = b_state & BH_FLAGS;
- return 1;
+ map->m_flags = bh->b_state & BH_FLAGS;
+ return true;
}
+ /* Don't go larger than mballoc is willing to allocate */
+ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+ return false;
+
/* Can we merge the block to our big extent? */
if (lblk == map->m_lblk + map->m_len &&
- (b_state & BH_FLAGS) == map->m_flags) {
+ (bh->b_state & BH_FLAGS) == map->m_flags) {
map->m_len++;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
-static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
- struct buffer_head *head,
- struct buffer_head *bh,
- ext4_lblk_t lblk)
+/*
+ * mpage_process_page_bufs - submit page buffers for IO or add them to extent
+ *
+ * @mpd - extent of blocks for mapping
+ * @head - the first buffer in the page
+ * @bh - buffer we should start processing from
+ * @lblk - logical number of the block in the file corresponding to @bh
+ *
+ * Walk through page buffers from @bh upto @head (exclusive) and either submit
+ * the page for IO if all buffers in this page were mapped and there's no
+ * accumulated extent of buffers to map or add buffers in the page to the
+ * extent of buffers to map. The function returns 1 if the caller can continue
+ * by processing the next page, 0 if it should stop adding buffers to the
+ * extent to map because we cannot extend it anymore. It can also return value
+ * < 0 in case of error during IO submission.
+ */
+static int mpage_process_page_bufs(struct mpage_da_data *mpd,
+ struct buffer_head *head,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
{
struct inode *inode = mpd->inode;
+ int err;
ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
>> inode->i_blkbits;
do {
BUG_ON(buffer_locked(bh));
- if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
- (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
- lblk >= blocks) {
+ if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
/* Found extent to map? */
if (mpd->map.m_len)
- return false;
- if (lblk >= blocks)
- return true;
- continue;
+ return 0;
+ /* Everything mapped so far and we hit EOF */
+ break;
}
- if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
- return false;
} while (lblk++, (bh = bh->b_this_page) != head);
- return true;
-}
-
-static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
-{
- int len;
- loff_t size = i_size_read(mpd->inode);
- int err;
-
- BUG_ON(page->index != mpd->first_page);
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
- else
- len = PAGE_CACHE_SIZE;
- clear_page_dirty_for_io(page);
- err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
- if (!err)
- mpd->wbc->nr_to_write--;
- mpd->first_page++;
-
- return err;
+ /* So far everything mapped? Submit the page for IO. */
+ if (mpd->map.m_len == 0) {
+ err = mpage_submit_page(mpd, head->b_page);
+ if (err < 0)
+ return err;
+ }
+ return lblk < blocks;
}
/*
@@ -2009,8 +2040,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
struct inode *inode = mpd->inode;
struct buffer_head *head, *bh;
int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
- ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
- >> inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
sector_t pblock;
@@ -2032,7 +2061,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
if (page->index > end)
break;
- /* Upto 'end' pages must be contiguous */
+ /* Up to 'end' pages must be contiguous */
BUG_ON(page->index != start);
bh = head = page_buffers(page);
do {
@@ -2045,18 +2074,26 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
*/
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
- add_page_bufs_to_extent(mpd, head, bh,
- lblk);
+ /*
+ * FIXME: If dioread_nolock supports
+ * blocksize < pagesize, we need to make
+ * sure we add size mapped so far to
+ * io_end->size as the following call
+ * can submit the page for IO.
+ */
+ err = mpage_process_page_bufs(mpd, head,
+ bh, lblk);
pagevec_release(&pvec);
- return 0;
+ if (err > 0)
+ err = 0;
+ return err;
}
if (buffer_delay(bh)) {
clear_buffer_delay(bh);
bh->b_blocknr = pblock++;
}
clear_buffer_unwritten(bh);
- } while (++lblk < blocks &&
- (bh = bh->b_this_page) != head);
+ } while (lblk++, (bh = bh->b_this_page) != head);
/*
* FIXME: This is going to break if dioread_nolock
@@ -2163,7 +2200,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
mpd->io_submit.io_end->offset =
((loff_t)map->m_lblk) << inode->i_blkbits;
- while (map->m_len) {
+ do {
err = mpage_map_one_extent(handle, mpd);
if (err < 0) {
struct super_block *sb = inode->i_sb;
@@ -2201,16 +2238,14 @@ static int mpage_map_and_submit_extent(handle_t *handle,
err = mpage_map_and_submit_buffers(mpd);
if (err < 0)
return err;
- }
+ } while (map->m_len);
/* Update on-disk size after IO is submitted */
disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
- if (disksize > i_size_read(inode))
- disksize = i_size_read(inode);
if (disksize > EXT4_I(inode)->i_disksize) {
int err2;
- ext4_update_i_disksize(inode, disksize);
+ ext4_wb_update_i_disksize(inode, disksize);
err2 = ext4_mark_inode_dirty(handle, inode);
if (err2)
ext4_error(inode->i_sb,
@@ -2225,7 +2260,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
/*
* Calculate the total number of credits to reserve for one writepages
* iteration. This is called from ext4_writepages(). We map an extent of
- * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
* the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
* bpp - 1 blocks in bpp different extents.
*/
@@ -2325,14 +2360,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
lblk = ((ext4_lblk_t)page->index) <<
(PAGE_CACHE_SHIFT - blkbits);
head = page_buffers(page);
- if (!add_page_bufs_to_extent(mpd, head, head, lblk))
+ err = mpage_process_page_bufs(mpd, head, head, lblk);
+ if (err <= 0)
goto out;
- /* So far everything mapped? Submit the page for IO. */
- if (mpd->map.m_len == 0) {
- err = mpage_submit_page(mpd, page);
- if (err < 0)
- goto out;
- }
+ err = 0;
/*
* Accumulated enough dirty pages? This doesn't apply
@@ -2416,7 +2447,7 @@ static int ext4_writepages(struct address_space *mapping,
if (ext4_should_dioread_nolock(inode)) {
/*
- * We may need to convert upto one extent per block in
+ * We may need to convert up to one extent per block in
* the page and we may dirty the inode.
*/
rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
@@ -2652,7 +2683,7 @@ retry_journal:
goto retry_grab;
}
/* In case writeback began while the page was unlocked */
- wait_on_page_writeback(page);
+ wait_for_stable_page(page);
ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
if (ret < 0) {
@@ -2997,19 +3028,13 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
}
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private, int ret,
- bool is_async)
+ ssize_t size, void *private)
{
- struct inode *inode = file_inode(iocb->ki_filp);
ext4_io_end_t *io_end = iocb->private;
/* if not async direct IO just return */
- if (!io_end) {
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
+ if (!io_end)
return;
- }
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3019,11 +3044,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
iocb->private = NULL;
io_end->offset = offset;
io_end->size = size;
- if (is_async) {
- io_end->iocb = iocb;
- io_end->result = ret;
- }
- ext4_put_io_end_defer(io_end);
+ ext4_put_io_end(io_end);
}
/*
@@ -3108,7 +3129,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
ret = -ENOMEM;
goto retake_lock;
}
- io_end->flag |= EXT4_IO_END_DIRECT;
/*
* Grab reference for DIO. Will be dropped in ext4_end_io_dio()
*/
@@ -3153,13 +3173,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
WARN_ON(iocb->private != io_end);
WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
- WARN_ON(io_end->iocb);
- /*
- * Generic code already did inode_dio_done() so we
- * have to clear EXT4_IO_END_DIRECT to not do it for
- * the second time.
- */
- io_end->flag = 0;
ext4_put_io_end(io_end);
iocb->private = NULL;
}
@@ -3539,6 +3552,18 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
offset;
}
+ if (offset & (sb->s_blocksize - 1) ||
+ (offset + length) & (sb->s_blocksize - 1)) {
+ /*
+ * Attach jinode to inode for jbd2 if we do any zeroing of
+ * partial block
+ */
+ ret = ext4_inode_attach_jinode(inode);
+ if (ret < 0)
+ goto out_mutex;
+
+ }
+
first_block_offset = round_up(offset, sb->s_blocksize);
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
@@ -3607,6 +3632,31 @@ out_mutex:
return ret;
}
+int ext4_inode_attach_jinode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct jbd2_inode *jinode;
+
+ if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
+ return 0;
+
+ jinode = jbd2_alloc_inode(GFP_KERNEL);
+ spin_lock(&inode->i_lock);
+ if (!ei->jinode) {
+ if (!jinode) {
+ spin_unlock(&inode->i_lock);
+ return -ENOMEM;
+ }
+ ei->jinode = jinode;
+ jbd2_journal_init_jbd_inode(ei->jinode, inode);
+ jinode = NULL;
+ }
+ spin_unlock(&inode->i_lock);
+ if (unlikely(jinode != NULL))
+ jbd2_free_inode(jinode);
+ return 0;
+}
+
/*
* ext4_truncate()
*
@@ -3667,6 +3717,12 @@ void ext4_truncate(struct inode *inode)
return;
}
+ /* If we zero-out tail of the page, we have to create jinode for jbd2 */
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
+ if (ext4_inode_attach_jinode(inode) < 0)
+ return;
+ }
+
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
else
@@ -4529,7 +4585,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_journal_stop(handle);
}
- if (attr->ia_valid & ATTR_SIZE) {
+ if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+ handle_t *handle;
+ loff_t oldsize = inode->i_size;
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4537,73 +4595,69 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_size > sbi->s_bitmap_maxbytes)
return -EFBIG;
}
- }
-
- if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE &&
- (attr->ia_size < inode->i_size)) {
- handle_t *handle;
-
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto err_out;
- }
- if (ext4_handle_valid(handle)) {
- error = ext4_orphan_add(handle, inode);
- orphan = 1;
- }
- EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
- ext4_journal_stop(handle);
-
- if (ext4_should_order_data(inode)) {
- error = ext4_begin_ordered_truncate(inode,
+ if (S_ISREG(inode->i_mode) &&
+ (attr->ia_size < inode->i_size)) {
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
attr->ia_size);
- if (error) {
- /* Do as much error cleanup as possible */
- handle = ext4_journal_start(inode,
- EXT4_HT_INODE, 3);
- if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
+ if (error)
goto err_out;
- }
- ext4_orphan_del(handle, inode);
- orphan = 0;
- ext4_journal_stop(handle);
+ }
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
goto err_out;
}
- }
- }
-
- if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != inode->i_size) {
- loff_t oldsize = inode->i_size;
-
- i_size_write(inode, attr->ia_size);
- /*
- * Blocks are going to be removed from the inode. Wait
- * for dio in flight. Temporarily disable
- * dioread_nolock to prevent livelock.
- */
- if (orphan) {
- if (!ext4_should_journal_data(inode)) {
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
- ext4_inode_resume_unlocked_dio(inode);
- } else
- ext4_wait_for_tail_page_commit(inode);
+ if (ext4_handle_valid(handle)) {
+ error = ext4_orphan_add(handle, inode);
+ orphan = 1;
}
+ down_write(&EXT4_I(inode)->i_data_sem);
+ EXT4_I(inode)->i_disksize = attr->ia_size;
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
/*
- * Truncate pagecache after we've waited for commit
- * in data=journal mode to make pages freeable.
+ * We have to update i_size under i_data_sem together
+ * with i_disksize to avoid races with writeback code
+ * running ext4_wb_update_i_disksize().
*/
- truncate_pagecache(inode, oldsize, inode->i_size);
+ if (!error)
+ i_size_write(inode, attr->ia_size);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
+ if (error) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ } else
+ i_size_write(inode, attr->ia_size);
+
+ /*
+ * Blocks are going to be removed from the inode. Wait
+ * for dio in flight. Temporarily disable
+ * dioread_nolock to prevent livelock.
+ */
+ if (orphan) {
+ if (!ext4_should_journal_data(inode)) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ } else
+ ext4_wait_for_tail_page_commit(inode);
}
- ext4_truncate(inode);
+ /*
+ * Truncate pagecache after we've waited for commit
+ * in data=journal mode to make pages freeable.
+ */
+ truncate_pagecache(inode, oldsize, inode->i_size);
}
+ /*
+ * We want to call ext4_truncate() even if attr->ia_size ==
+ * inode->i_size for cases like truncation of fallocated space
+ */
+ if (attr->ia_valid & ATTR_SIZE)
+ ext4_truncate(inode);
if (!rc) {
setattr_copy(inode, attr);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 9491ac0590f7..a569d335f804 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -17,7 +17,6 @@
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
-#include "ext4_extents.h"
#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
@@ -77,8 +76,10 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
- memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
- memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
+ ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
+ ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
+ ext4_es_lru_del(inode1);
+ ext4_es_lru_del(inode2);
isize = i_size_read(inode1);
i_size_write(inode1, i_size_read(inode2));
@@ -622,6 +623,8 @@ resizefs_out:
return 0;
}
+ case EXT4_IOC_PRECACHE_EXTENTS:
+ return ext4_ext_precache(inode);
default:
return -ENOTTY;
@@ -686,6 +689,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_MOVE_EXT:
case FITRIM:
case EXT4_IOC_RESIZE_FS:
+ case EXT4_IOC_PRECACHE_EXTENTS:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a9ff5e5137ca..a41e3ba8cfaa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -751,13 +751,15 @@ void ext4_mb_generate_buddy(struct super_block *sb,
if (free != grp->bb_free) {
ext4_grp_locked_error(sb, group, 0, 0,
- "%u clusters in bitmap, %u in gd",
+ "%u clusters in bitmap, %u in gd; "
+ "block bitmap corrupt.",
free, grp->bb_free);
/*
- * If we intent to continue, we consider group descritor
+ * If we intend to continue, we consider group descriptor
* corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
}
mb_set_largest_free_order(sb, grp);
@@ -1398,6 +1400,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
+ /* Don't bother if the block group is corrupt. */
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ return;
+
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
@@ -1423,7 +1429,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
inode ? inode->i_ino : 0,
blocknr,
"freeing already freed block "
- "(bit %u)", block);
+ "(bit %u); block bitmap corrupt.",
+ block);
+ /* Mark the block group as corrupt. */
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+ &e4b->bd_info->bb_state);
mb_regenerate_buddy(e4b);
goto done;
}
@@ -1790,6 +1800,11 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
if (err)
return err;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
+ ext4_mb_unload_buddy(e4b);
+ return 0;
+ }
+
ext4_lock_group(ac->ac_sb, group);
max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
@@ -1987,6 +2002,9 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
if (cr <= 2 && free < ac->ac_g_ex.fe_len)
return 0;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ return 0;
+
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
int ret = ext4_mb_init_group(ac->ac_sb, group);
@@ -4585,6 +4603,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *gd_bh;
ext4_group_t block_group;
struct ext4_sb_info *sbi;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_buddy e4b;
unsigned int count_clusters;
int err = 0;
@@ -4673,6 +4692,10 @@ do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
+ ext4_get_group_info(sb, block_group))))
+ return;
+
/*
* Check to see if we are freeing blocks across a group
* boundary.
@@ -4740,11 +4763,16 @@ do_more:
* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed
*/
+ retry:
new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
if (!new_entry) {
- ext4_mb_unload_buddy(&e4b);
- err = -ENOMEM;
- goto error_return;
+ /*
+ * We use a retry loop because
+ * ext4_free_blocks() is not allowed to fail.
+ */
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
}
new_entry->efd_start_cluster = bit;
new_entry->efd_group = block_group;
@@ -4779,7 +4807,6 @@ do_more:
ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
@@ -4787,10 +4814,23 @@ do_more:
&sbi->s_flex_groups[flex_group].free_clusters);
}
- ext4_mb_unload_buddy(&e4b);
-
- if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) {
+ percpu_counter_add(&sbi->s_dirtyclusters_counter,
+ count_clusters);
+ spin_lock(&ei->i_block_reservation_lock);
+ if (flags & EXT4_FREE_BLOCKS_METADATA)
+ ei->i_reserved_meta_blocks += count_clusters;
+ else
+ ei->i_reserved_data_blocks += count_clusters;
+ spin_unlock(&ei->i_block_reservation_lock);
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_reclaim_block(inode,
+ EXT4_C2B(sbi, count_clusters));
+ } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+ percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
+
+ ext4_mb_unload_buddy(&e4b);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 49e8bdff9163..2ae73a80c19b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -39,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
newext.ee_block = cpu_to_le32(lb->first_block);
newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
ext4_ext_store_pblock(&newext, lb->first_pblock);
- path = ext4_ext_find_extent(inode, lb->first_block, NULL);
+ path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
retval = PTR_ERR(path);
@@ -494,7 +494,7 @@ int ext4_ext_migrate(struct inode *inode)
* superblock modification.
*
* For the tmp_inode we already have committed the
- * trascation that created the inode. Later as and
+ * transaction that created the inode. Later as and
* when we add extents we extent the journal
*/
/*
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e86dddbd8296..7fa4d855dbd5 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -37,7 +37,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
int ret = 0;
struct ext4_ext_path *path;
- path = ext4_ext_find_extent(inode, lblock, *orig_path);
+ path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE);
if (IS_ERR(path))
ret = PTR_ERR(path);
else if (path[ext_depth(inode)].p_ext == NULL)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 234b834d5a97..1bec5a5c1e45 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2316,11 +2316,11 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
+ d_tmpfile(dentry, inode);
err = ext4_orphan_add(handle, inode);
if (err)
goto err_drop_inode;
mark_inode_dirty(inode);
- d_tmpfile(dentry, inode);
unlock_new_inode(inode);
}
if (handle)
@@ -3005,15 +3005,19 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
/*
* Anybody can rename anything with this: the permission checks are left to the
* higher-level routines.
+ *
+ * n.b. old_{dentry,inode) refers to the source dentry/inode
+ * while new_{dentry,inode) refers to the destination dentry/inode
+ * This comes from rename(const char *oldpath, const char *newpath)
*/
static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- handle_t *handle;
+ handle_t *handle = NULL;
struct inode *old_inode, *new_inode;
struct buffer_head *old_bh, *new_bh, *dir_bh;
struct ext4_dir_entry_2 *old_de, *new_de;
- int retval, force_da_alloc = 0;
+ int retval;
int inlined = 0, new_inlined = 0;
struct ext4_dir_entry_2 *parent_de;
@@ -3026,14 +3030,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
* in separate transaction */
if (new_dentry->d_inode)
dquot_initialize(new_dentry->d_inode);
- handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
- (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
- ext4_handle_sync(handle);
old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
/*
@@ -3056,6 +3052,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
new_bh = NULL;
}
}
+ if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
+ ext4_alloc_da_blocks(old_inode);
+
+ handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ ext4_handle_sync(handle);
+
if (S_ISDIR(old_inode->i_mode)) {
if (new_inode) {
retval = -ENOTEMPTY;
@@ -3186,8 +3194,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_mark_inode_dirty(handle, new_inode);
if (!new_inode->i_nlink)
ext4_orphan_add(handle, new_inode);
- if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
- force_da_alloc = 1;
}
retval = 0;
@@ -3195,9 +3201,8 @@ end_rename:
brelse(dir_bh);
brelse(old_bh);
brelse(new_bh);
- ext4_journal_stop(handle);
- if (retval == 0 && force_da_alloc)
- ext4_alloc_da_blocks(old_inode);
+ if (handle)
+ ext4_journal_stop(handle);
return retval;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 48786cdb5e6c..d7d0c7b46ed4 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -25,6 +25,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/ratelimit.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -55,7 +56,7 @@ void ext4_exit_pageio(void)
static void buffer_io_error(struct buffer_head *bh)
{
char b[BDEVNAME_SIZE];
- printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+ printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
bdevname(bh->b_bdev, b),
(unsigned long long)bh->b_blocknr);
}
@@ -122,10 +123,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
ext4_finish_bio(bio);
bio_put(bio);
}
- if (io_end->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(io_end->inode);
- if (io_end->iocb)
- aio_complete(io_end->iocb, io_end->result, 0);
kmem_cache_free(io_end_cachep, io_end);
}
@@ -203,19 +200,14 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
struct workqueue_struct *wq;
unsigned long flags;
- BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ /* Only reserved conversions from writeback should enter here */
+ WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ WARN_ON(!io_end->handle);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (io_end->handle) {
- wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
- if (list_empty(&ei->i_rsv_conversion_list))
- queue_work(wq, &ei->i_rsv_conversion_work);
- list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
- } else {
- wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
- if (list_empty(&ei->i_unrsv_conversion_list))
- queue_work(wq, &ei->i_unrsv_conversion_work);
- list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
- }
+ wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+ if (list_empty(&ei->i_rsv_conversion_list))
+ queue_work(wq, &ei->i_rsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
@@ -255,13 +247,6 @@ void ext4_end_io_rsv_work(struct work_struct *work)
ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}
-void ext4_end_io_unrsv_work(struct work_struct *work)
-{
- struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
- i_unrsv_conversion_work);
- ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
-}
-
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
@@ -308,6 +293,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
return io_end;
}
+/* BIO completion function for page writeback */
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
@@ -318,18 +304,6 @@ static void ext4_end_bio(struct bio *bio, int error)
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- /*
- * Link bio into list hanging from io_end. We have to do it
- * atomically as bio completions can be racing against each
- * other.
- */
- bio->bi_private = xchg(&io_end->bio, bio);
- } else {
- ext4_finish_bio(bio);
- bio_put(bio);
- }
-
if (error) {
struct inode *inode = io_end->inode;
@@ -341,7 +315,24 @@ static void ext4_end_bio(struct bio *bio, int error)
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
}
- ext4_put_io_end_defer(io_end);
+
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ /*
+ * Link bio into list hanging from io_end. We have to do it
+ * atomically as bio completions can be racing against each
+ * other.
+ */
+ bio->bi_private = xchg(&io_end->bio, bio);
+ ext4_put_io_end_defer(io_end);
+ } else {
+ /*
+ * Drop io_end reference early. Inode can get freed once
+ * we finish the bio.
+ */
+ ext4_put_io_end_defer(io_end);
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ }
}
void ext4_io_submit(struct ext4_io_submit *io)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 85b3dd60169b..2c2e6cbc6bed 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -162,7 +162,7 @@ void *ext4_kvmalloc(size_t size, gfp_t flags)
{
void *ret;
- ret = kmalloc(size, flags);
+ ret = kmalloc(size, flags | __GFP_NOWARN);
if (!ret)
ret = __vmalloc(size, flags, PAGE_KERNEL);
return ret;
@@ -172,7 +172,7 @@ void *ext4_kvzalloc(size_t size, gfp_t flags)
{
void *ret;
- ret = kzalloc(size, flags);
+ ret = kzalloc(size, flags | __GFP_NOWARN);
if (!ret)
ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
return ret;
@@ -762,9 +762,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- flush_workqueue(sbi->unrsv_conversion_wq);
flush_workqueue(sbi->rsv_conversion_wq);
- destroy_workqueue(sbi->unrsv_conversion_wq);
destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) {
@@ -875,14 +873,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
#endif
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
- INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
- INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
return &ei->vfs_inode;
}
@@ -1134,8 +1130,8 @@ enum {
Opt_nouid32, Opt_debug, Opt_removed,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
- Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
- Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
+ Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
+ Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -1179,6 +1175,7 @@ static const match_table_t tokens = {
{Opt_min_batch_time, "min_batch_time=%u"},
{Opt_max_batch_time, "max_batch_time=%u"},
{Opt_journal_dev, "journal_dev=%u"},
+ {Opt_journal_path, "journal_path=%s"},
{Opt_journal_checksum, "journal_checksum"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_abort, "abort"},
@@ -1338,6 +1335,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
#define MOPT_NO_EXT2 0x0100
#define MOPT_NO_EXT3 0x0200
#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
+#define MOPT_STRING 0x0400
static const struct mount_opts {
int token;
@@ -1359,7 +1357,7 @@ static const struct mount_opts {
{Opt_delalloc, EXT4_MOUNT_DELALLOC,
MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
- MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT},
+ MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_SET},
{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
@@ -1387,6 +1385,7 @@ static const struct mount_opts {
{Opt_resuid, 0, MOPT_GTE0},
{Opt_resgid, 0, MOPT_GTE0},
{Opt_journal_dev, 0, MOPT_GTE0},
+ {Opt_journal_path, 0, MOPT_STRING},
{Opt_journal_ioprio, 0, MOPT_GTE0},
{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
@@ -1480,7 +1479,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
}
- if (args->from && match_int(args, &arg))
+ if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
return -1;
if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
return -1;
@@ -1544,6 +1543,44 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
}
*journal_devnum = arg;
+ } else if (token == Opt_journal_path) {
+ char *journal_path;
+ struct inode *journal_inode;
+ struct path path;
+ int error;
+
+ if (is_remount) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot specify journal on remount");
+ return -1;
+ }
+ journal_path = match_strdup(&args[0]);
+ if (!journal_path) {
+ ext4_msg(sb, KERN_ERR, "error: could not dup "
+ "journal device string");
+ return -1;
+ }
+
+ error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ if (error) {
+ ext4_msg(sb, KERN_ERR, "error: could not find "
+ "journal device path: error %d", error);
+ kfree(journal_path);
+ return -1;
+ }
+
+ journal_inode = path.dentry->d_inode;
+ if (!S_ISBLK(journal_inode->i_mode)) {
+ ext4_msg(sb, KERN_ERR, "error: journal path %s "
+ "is not a block device", journal_path);
+ path_put(&path);
+ kfree(journal_path);
+ return -1;
+ }
+
+ *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ path_put(&path);
+ kfree(journal_path);
} else if (token == Opt_journal_ioprio) {
if (arg > 7) {
ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
@@ -1702,12 +1739,6 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
if (sbi->s_qf_names[GRPQUOTA])
seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
-
- if (test_opt(sb, USRQUOTA))
- seq_puts(seq, ",usrquota");
-
- if (test_opt(sb, GRPQUOTA))
- seq_puts(seq, ",grpquota");
#endif
}
@@ -3489,7 +3520,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (test_opt(sb, DIOREAD_NOLOCK)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and delalloc");
+ "both data=journal and dioread_nolock");
goto failed_mount;
}
if (test_opt(sb, DELALLOC))
@@ -3624,10 +3655,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
- /* Do we have standard group size of blocksize * 8 blocks ? */
- if (sbi->s_blocks_per_group == blocksize << 3)
- set_opt2(sb, STD_GROUP_SIZE);
-
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3697,6 +3724,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ /* Do we have standard group size of clustersize * 8 blocks ? */
+ if (sbi->s_blocks_per_group == clustersize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
+
/*
* Test whether we have more sectors than will fit in sector_t,
* and whether the max offset is addressable by the page cache.
@@ -3960,14 +3991,6 @@ no_journal:
goto failed_mount4;
}
- EXT4_SB(sb)->unrsv_conversion_wq =
- alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
- if (!EXT4_SB(sb)->unrsv_conversion_wq) {
- printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
- ret = -ENOMEM;
- goto failed_mount4;
- }
-
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -4121,8 +4144,6 @@ failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
- if (EXT4_SB(sb)->unrsv_conversion_wq)
- destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
failed_mount_wq:
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
@@ -4570,7 +4591,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
- flush_workqueue(sbi->unrsv_conversion_wq);
/*
* Writeback quota in non-journalled quota case - journalled quota has
* no dirty dquots
@@ -4606,7 +4626,6 @@ static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
trace_ext4_sync_fs(sb, wait);
flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
- flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
dquot_writeback_dquots(sb, -1);
if (wait && test_opt(sb, BARRIER))
ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
@@ -4733,6 +4752,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dioread_nolock");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ }
+
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ext4_abort(sb, "Abort forced by user");
@@ -5487,6 +5521,7 @@ static void __exit ext4_exit_fs(void)
kset_unregister(ext4_kset);
ext4_exit_system_zone();
ext4_exit_pageio();
+ ext4_exit_es();
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 66a6b85a51d8..bb312201ca95 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -182,7 +182,7 @@ const struct address_space_operations f2fs_meta_aops = {
.set_page_dirty = f2fs_set_meta_page_dirty,
};
-int check_orphan_space(struct f2fs_sb_info *sbi)
+int acquire_orphan_inode(struct f2fs_sb_info *sbi)
{
unsigned int max_orphans;
int err = 0;
@@ -197,10 +197,19 @@ int check_orphan_space(struct f2fs_sb_info *sbi)
mutex_lock(&sbi->orphan_inode_mutex);
if (sbi->n_orphans >= max_orphans)
err = -ENOSPC;
+ else
+ sbi->n_orphans++;
mutex_unlock(&sbi->orphan_inode_mutex);
return err;
}
+void release_orphan_inode(struct f2fs_sb_info *sbi)
+{
+ mutex_lock(&sbi->orphan_inode_mutex);
+ sbi->n_orphans--;
+ mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
struct list_head *head, *this;
@@ -229,21 +238,18 @@ retry:
list_add(&new->list, this->prev);
else
list_add_tail(&new->list, head);
-
- sbi->n_orphans++;
out:
mutex_unlock(&sbi->orphan_inode_mutex);
}
void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
- struct list_head *this, *next, *head;
+ struct list_head *head;
struct orphan_inode_entry *orphan;
mutex_lock(&sbi->orphan_inode_mutex);
head = &sbi->orphan_inode_list;
- list_for_each_safe(this, next, head) {
- orphan = list_entry(this, struct orphan_inode_entry, list);
+ list_for_each_entry(orphan, head, list) {
if (orphan->ino == ino) {
list_del(&orphan->list);
kmem_cache_free(orphan_entry_slab, orphan);
@@ -373,7 +379,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp1;
- pre_version = le64_to_cpu(cp_block->checkpoint_ver);
+ pre_version = cur_cp_version(cp_block);
/* Read the 2nd cp block in this CP pack */
cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
@@ -388,7 +394,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp2;
- cur_version = le64_to_cpu(cp_block->checkpoint_ver);
+ cur_version = cur_cp_version(cp_block);
if (cur_version == pre_version) {
*version = cur_version;
@@ -793,7 +799,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
* Increase the version number so that
* SIT entries and seg summaries are written at correct place
*/
- ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver);
+ ckpt_ver = cur_cp_version(ckpt);
ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
/* write cached NAT/SIT entries to NAT/SIT area */
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 035f9a345cdf..941f9b9ca3a5 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -37,9 +37,9 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
struct page *node_page = dn->node_page;
unsigned int ofs_in_node = dn->ofs_in_node;
- wait_on_page_writeback(node_page);
+ f2fs_wait_on_page_writeback(node_page, NODE, false);
- rn = (struct f2fs_node *)page_address(node_page);
+ rn = F2FS_NODE(node_page);
/* Get physical address of data block */
addr_array = blkaddr_in_node(rn);
@@ -117,7 +117,8 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
block_t start_blkaddr, end_blkaddr;
BUG_ON(blk_addr == NEW_ADDR);
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node;
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+ dn->ofs_in_node;
/* Update the page address in the parent node */
__set_data_blkaddr(dn, blk_addr);
@@ -176,7 +177,6 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
end_update:
write_unlock(&fi->ext.ext_lock);
sync_inode_page(dn);
- return;
}
struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
@@ -260,8 +260,17 @@ repeat:
if (PageUptodate(page))
return page;
- BUG_ON(dn.data_blkaddr == NEW_ADDR);
- BUG_ON(dn.data_blkaddr == NULL_ADDR);
+ /*
+ * A new dentry page is allocated but not able to be written, since its
+ * new inode page couldn't be allocated due to -ENOSPC.
+ * In such the case, its blkaddr can be remained as NEW_ADDR.
+ * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata.
+ */
+ if (dn.data_blkaddr == NEW_ADDR) {
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ return page;
+ }
err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
if (err)
@@ -365,7 +374,6 @@ static void read_end_io(struct bio *bio, int err)
}
unlock_page(page);
} while (bvec >= bio->bi_io_vec);
- kfree(bio->bi_private);
bio_put(bio);
}
@@ -391,7 +399,6 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
bio->bi_end_io = read_end_io;
if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
- kfree(bio->bi_private);
bio_put(bio);
up_read(&sbi->bio_sem);
f2fs_put_page(page, 1);
@@ -442,7 +449,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock,
unsigned int end_offset;
end_offset = IS_INODE(dn.node_page) ?
- ADDRS_PER_INODE :
+ ADDRS_PER_INODE(F2FS_I(inode)) :
ADDRS_PER_BLOCK;
clear_buffer_new(bh_result);
@@ -636,9 +643,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
int err = 0;
int ilock;
- /* for nobh_write_end */
- *fsdata = NULL;
-
f2fs_balance_fs(sbi);
repeat:
page = grab_cache_page_write_begin(mapping, index, flags);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 0d6c6aafb235..a84b0a8e6854 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -29,7 +29,7 @@ static DEFINE_MUTEX(f2fs_stat_mutex);
static void update_general_status(struct f2fs_sb_info *sbi)
{
- struct f2fs_stat_info *si = sbi->stat_info;
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
int i;
/* valid check of the segment numbers */
@@ -83,7 +83,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
*/
static void update_sit_info(struct f2fs_sb_info *sbi)
{
- struct f2fs_stat_info *si = sbi->stat_info;
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
struct sit_info *sit_i = SIT_I(sbi);
unsigned int segno, vblocks;
@@ -118,7 +118,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
*/
static void update_mem_info(struct f2fs_sb_info *sbi)
{
- struct f2fs_stat_info *si = sbi->stat_info;
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
unsigned npages;
if (si->base_mem)
@@ -253,21 +253,21 @@ static int stat_show(struct seq_file *s, void *v)
si->nats, NM_WOUT_THRESHOLD);
seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
si->sits, si->fnids);
- seq_printf(s, "\nDistribution of User Blocks:");
- seq_printf(s, " [ valid | invalid | free ]\n");
- seq_printf(s, " [");
+ seq_puts(s, "\nDistribution of User Blocks:");
+ seq_puts(s, " [ valid | invalid | free ]\n");
+ seq_puts(s, " [");
for (j = 0; j < si->util_valid; j++)
- seq_printf(s, "-");
- seq_printf(s, "|");
+ seq_putc(s, '-');
+ seq_putc(s, '|');
for (j = 0; j < si->util_invalid; j++)
- seq_printf(s, "-");
- seq_printf(s, "|");
+ seq_putc(s, '-');
+ seq_putc(s, '|');
for (j = 0; j < si->util_free; j++)
- seq_printf(s, "-");
- seq_printf(s, "]\n\n");
+ seq_putc(s, '-');
+ seq_puts(s, "]\n\n");
seq_printf(s, "SSR: %u blocks in %u segments\n",
si->block_count[SSR], si->segment_count[SSR]);
seq_printf(s, "LFS: %u blocks in %u segments\n",
@@ -305,11 +305,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
- sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL);
- if (!sbi->stat_info)
+ si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL);
+ if (!si)
return -ENOMEM;
- si = sbi->stat_info;
si->all_area_segs = le32_to_cpu(raw_super->segment_count);
si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit);
si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat);
@@ -319,6 +318,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
si->main_area_zones = si->main_area_sections /
le32_to_cpu(raw_super->secs_per_zone);
si->sbi = sbi;
+ sbi->stat_info = si;
mutex_lock(&f2fs_stat_mutex);
list_add_tail(&si->stat_list, &f2fs_stat_list);
@@ -329,13 +329,13 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
{
- struct f2fs_stat_info *si = sbi->stat_info;
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
mutex_lock(&f2fs_stat_mutex);
list_del(&si->stat_list);
mutex_unlock(&f2fs_stat_mutex);
- kfree(sbi->stat_info);
+ kfree(si);
}
void __init f2fs_create_root_stats(void)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 62f0d5977c64..384c6daf9a89 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -270,12 +270,27 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
struct f2fs_node *rn;
/* copy name info. to this inode page */
- rn = (struct f2fs_node *)page_address(ipage);
+ rn = F2FS_NODE(ipage);
rn->i.i_namelen = cpu_to_le32(name->len);
memcpy(rn->i.i_name, name->name, name->len);
set_page_dirty(ipage);
}
+int update_dent_inode(struct inode *inode, const struct qstr *name)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct page *page;
+
+ page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ init_dent_inode(name, page);
+ f2fs_put_page(page, 1);
+
+ return 0;
+}
+
static int make_empty_dir(struct inode *inode,
struct inode *parent, struct page *page)
{
@@ -557,6 +572,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (inode->i_nlink == 0)
add_orphan_inode(sbi, inode->i_ino);
+ else
+ release_orphan_inode(sbi);
}
if (bit_pos == NR_DENTRY_IN_BLOCK) {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 467d42d65c48..608f0df5b919 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/crc32.h>
#include <linux/magic.h>
+#include <linux/kobject.h>
/*
* For mount options
@@ -28,6 +29,7 @@
#define F2FS_MOUNT_XATTR_USER 0x00000010
#define F2FS_MOUNT_POSIX_ACL 0x00000020
#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
+#define F2FS_MOUNT_INLINE_XATTR 0x00000080
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -134,11 +136,13 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
/*
* For INODE and NODE manager
*/
-#define XATTR_NODE_OFFSET (-1) /*
- * store xattrs to one node block per
- * file keeping -1 as its node offset to
- * distinguish from index node blocks.
- */
+/*
+ * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1
+ * as its node offset to distinguish from index node blocks.
+ * But some bits are used to mark the node block.
+ */
+#define XATTR_NODE_OFFSET ((((unsigned int)-1) << OFFSET_BIT_SHIFT) \
+ >> OFFSET_BIT_SHIFT)
enum {
ALLOC_NODE, /* allocate a new node page if needed */
LOOKUP_NODE, /* look up a node without readahead */
@@ -178,6 +182,7 @@ struct f2fs_inode_info {
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
+ unsigned long long xattr_ver; /* cp version of xattr modification */
struct extent_info ext; /* in-memory extent cache entry */
};
@@ -296,15 +301,6 @@ struct f2fs_sm_info {
};
/*
- * For directory operation
- */
-#define NODE_DIR1_BLOCK (ADDRS_PER_INODE + 1)
-#define NODE_DIR2_BLOCK (ADDRS_PER_INODE + 2)
-#define NODE_IND1_BLOCK (ADDRS_PER_INODE + 3)
-#define NODE_IND2_BLOCK (ADDRS_PER_INODE + 4)
-#define NODE_DIND_BLOCK (ADDRS_PER_INODE + 5)
-
-/*
* For superblock
*/
/*
@@ -350,6 +346,7 @@ enum page_type {
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
+ struct proc_dir_entry *s_proc; /* proc entry */
struct buffer_head *raw_super_buf; /* buffer head of raw sb */
struct f2fs_super_block *raw_super; /* raw super block pointer */
int s_dirty; /* dirty flag for checkpoint */
@@ -429,6 +426,10 @@ struct f2fs_sb_info {
#endif
unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
+
+ /* For sysfs suppport */
+ struct kobject s_kobj;
+ struct completion s_kobj_unregister;
};
/*
@@ -454,6 +455,11 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
return (struct f2fs_checkpoint *)(sbi->ckpt);
}
+static inline struct f2fs_node *F2FS_NODE(struct page *page)
+{
+ return (struct f2fs_node *)page_address(page);
+}
+
static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
{
return (struct f2fs_nm_info *)(sbi->nm_info);
@@ -489,6 +495,11 @@ static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
sbi->s_dirty = 0;
}
+static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
+{
+ return le64_to_cpu(cp->checkpoint_ver);
+}
+
static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
{
unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
@@ -677,7 +688,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
{
block_t start_addr;
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
- unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver);
+ unsigned long long ckpt_version = cur_cp_version(ckpt);
start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
@@ -812,7 +823,7 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
static inline bool IS_INODE(struct page *page)
{
- struct f2fs_node *p = (struct f2fs_node *)page_address(page);
+ struct f2fs_node *p = F2FS_NODE(page);
return RAW_IS_INODE(p);
}
@@ -826,7 +837,7 @@ static inline block_t datablock_addr(struct page *node_page,
{
struct f2fs_node *raw_node;
__le32 *addr_array;
- raw_node = (struct f2fs_node *)page_address(node_page);
+ raw_node = F2FS_NODE(node_page);
addr_array = blkaddr_in_node(raw_node);
return le32_to_cpu(addr_array[offset]);
}
@@ -873,6 +884,7 @@ enum {
FI_NO_ALLOC, /* should not allocate any blocks */
FI_UPDATE_DIR, /* should update inode block for consistency */
FI_DELAY_IPUT, /* used for the recovery */
+ FI_INLINE_XATTR, /* used for inline xattr */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -905,6 +917,45 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
return 0;
}
+static inline void get_inline_info(struct f2fs_inode_info *fi,
+ struct f2fs_inode *ri)
+{
+ if (ri->i_inline & F2FS_INLINE_XATTR)
+ set_inode_flag(fi, FI_INLINE_XATTR);
+}
+
+static inline void set_raw_inline(struct f2fs_inode_info *fi,
+ struct f2fs_inode *ri)
+{
+ ri->i_inline = 0;
+
+ if (is_inode_flag_set(fi, FI_INLINE_XATTR))
+ ri->i_inline |= F2FS_INLINE_XATTR;
+}
+
+static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
+{
+ if (is_inode_flag_set(fi, FI_INLINE_XATTR))
+ return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
+ return DEF_ADDRS_PER_INODE;
+}
+
+static inline void *inline_xattr_addr(struct page *page)
+{
+ struct f2fs_inode *ri;
+ ri = (struct f2fs_inode *)page_address(page);
+ return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
+ F2FS_INLINE_XATTR_ADDRS]);
+}
+
+static inline int inline_xattr_size(struct inode *inode)
+{
+ if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR))
+ return F2FS_INLINE_XATTR_ADDRS << 2;
+ else
+ return 0;
+}
+
static inline int f2fs_readonly(struct super_block *sb)
{
return sb->s_flags & MS_RDONLY;
@@ -947,6 +998,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
struct page *, struct inode *);
+int update_dent_inode(struct inode *, const struct qstr *);
int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
int f2fs_make_empty(struct inode *, struct inode *);
@@ -980,6 +1032,7 @@ int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
+int truncate_xattr_node(struct inode *, struct page *);
int remove_inode_page(struct inode *);
struct page *new_inode_page(struct inode *, const struct qstr *);
struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
@@ -1012,7 +1065,8 @@ int npages_for_summary_flush(struct f2fs_sb_info *);
void allocate_new_segments(struct f2fs_sb_info *);
struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
struct bio *f2fs_bio_alloc(struct block_device *, int);
-void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync);
+void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool);
+void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
void write_meta_page(struct f2fs_sb_info *, struct page *);
void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
block_t, block_t *);
@@ -1037,7 +1091,8 @@ void destroy_segment_manager(struct f2fs_sb_info *);
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
-int check_orphan_space(struct f2fs_sb_info *);
+int acquire_orphan_inode(struct f2fs_sb_info *);
+void release_orphan_inode(struct f2fs_sb_info *);
void add_orphan_inode(struct f2fs_sb_info *, nid_t);
void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
int recover_orphan_inodes(struct f2fs_sb_info *);
@@ -1068,7 +1123,7 @@ int do_write_data_page(struct page *);
*/
int start_gc_thread(struct f2fs_sb_info *);
void stop_gc_thread(struct f2fs_sb_info *);
-block_t start_bidx_of_node(unsigned int);
+block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
int f2fs_gc(struct f2fs_sb_info *);
void build_gc_manager(struct f2fs_sb_info *);
int __init create_gc_caches(void);
@@ -1112,11 +1167,16 @@ struct f2fs_stat_info {
unsigned base_mem, cache_mem;
};
+static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
+{
+ return (struct f2fs_stat_info*)sbi->stat_info;
+}
+
#define stat_inc_call_count(si) ((si)->call_count++)
#define stat_inc_seg_count(sbi, type) \
do { \
- struct f2fs_stat_info *si = sbi->stat_info; \
+ struct f2fs_stat_info *si = F2FS_STAT(sbi); \
(si)->tot_segs++; \
if (type == SUM_TYPE_DATA) \
si->data_segs++; \
@@ -1129,14 +1189,14 @@ struct f2fs_stat_info {
#define stat_inc_data_blk_count(sbi, blks) \
do { \
- struct f2fs_stat_info *si = sbi->stat_info; \
+ struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->data_blks += (blks); \
} while (0)
#define stat_inc_node_blk_count(sbi, blks) \
do { \
- struct f2fs_stat_info *si = sbi->stat_info; \
+ struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->node_blks += (blks); \
} while (0)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index d2d2b7dbdcc1..02c906971cc6 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -112,11 +112,13 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
if (!dentry)
return 0;
- inode = igrab(dentry->d_parent->d_inode);
- dput(dentry);
+ if (update_dent_inode(inode, &dentry->d_name)) {
+ dput(dentry);
+ return 0;
+ }
- *pino = inode->i_ino;
- iput(inode);
+ *pino = parent_ino(dentry);
+ dput(dentry);
return 1;
}
@@ -147,9 +149,10 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
mutex_lock(&inode->i_mutex);
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto out;
-
+ /*
+ * Both of fdatasync() and fsync() are able to be recovered from
+ * sudden-power-off.
+ */
if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
need_cp = true;
else if (file_wrong_pino(inode))
@@ -158,10 +161,14 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
need_cp = true;
else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
need_cp = true;
+ else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
+ need_cp = true;
if (need_cp) {
nid_t pino;
+ F2FS_I(inode)->xattr_ver = 0;
+
/* all the dirty node pages should be flushed for POR */
ret = f2fs_sync_fs(inode->i_sb, 1);
if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
@@ -205,7 +212,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
struct f2fs_node *raw_node;
__le32 *addr;
- raw_node = page_address(dn->node_page);
+ raw_node = F2FS_NODE(dn->node_page);
addr = blkaddr_in_node(raw_node) + ofs;
for ( ; count > 0; count--, addr++, dn->ofs_in_node++) {
@@ -283,7 +290,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
}
if (IS_INODE(dn.node_page))
- count = ADDRS_PER_INODE;
+ count = ADDRS_PER_INODE(F2FS_I(inode));
else
count = ADDRS_PER_BLOCK;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 35f9b1a196aa..2f157e883687 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -29,10 +29,11 @@ static struct kmem_cache *winode_slab;
static int gc_thread_func(void *data)
{
struct f2fs_sb_info *sbi = data;
+ struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
long wait_ms;
- wait_ms = GC_THREAD_MIN_SLEEP_TIME;
+ wait_ms = gc_th->min_sleep_time;
do {
if (try_to_freeze())
@@ -45,7 +46,7 @@ static int gc_thread_func(void *data)
break;
if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
- wait_ms = GC_THREAD_MAX_SLEEP_TIME;
+ wait_ms = increase_sleep_time(gc_th, wait_ms);
continue;
}
@@ -66,15 +67,15 @@ static int gc_thread_func(void *data)
continue;
if (!is_idle(sbi)) {
- wait_ms = increase_sleep_time(wait_ms);
+ wait_ms = increase_sleep_time(gc_th, wait_ms);
mutex_unlock(&sbi->gc_mutex);
continue;
}
if (has_enough_invalid_blocks(sbi))
- wait_ms = decrease_sleep_time(wait_ms);
+ wait_ms = decrease_sleep_time(gc_th, wait_ms);
else
- wait_ms = increase_sleep_time(wait_ms);
+ wait_ms = increase_sleep_time(gc_th, wait_ms);
#ifdef CONFIG_F2FS_STAT_FS
sbi->bg_gc++;
@@ -82,7 +83,7 @@ static int gc_thread_func(void *data)
/* if return value is not zero, no victim was selected */
if (f2fs_gc(sbi))
- wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
+ wait_ms = gc_th->no_gc_sleep_time;
} while (!kthread_should_stop());
return 0;
}
@@ -101,6 +102,12 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
goto out;
}
+ gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
+ gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
+ gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
+
+ gc_th->gc_idle = 0;
+
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
@@ -125,9 +132,17 @@ void stop_gc_thread(struct f2fs_sb_info *sbi)
sbi->gc_thread = NULL;
}
-static int select_gc_type(int gc_type)
+static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type)
{
- return (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
+ int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
+
+ if (gc_th && gc_th->gc_idle) {
+ if (gc_th->gc_idle == 1)
+ gc_mode = GC_CB;
+ else if (gc_th->gc_idle == 2)
+ gc_mode = GC_GREEDY;
+ }
+ return gc_mode;
}
static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
@@ -138,12 +153,18 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
if (p->alloc_mode == SSR) {
p->gc_mode = GC_GREEDY;
p->dirty_segmap = dirty_i->dirty_segmap[type];
+ p->max_search = dirty_i->nr_dirty[type];
p->ofs_unit = 1;
} else {
- p->gc_mode = select_gc_type(gc_type);
+ p->gc_mode = select_gc_type(sbi->gc_thread, gc_type);
p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
+ p->max_search = dirty_i->nr_dirty[DIRTY];
p->ofs_unit = sbi->segs_per_sec;
}
+
+ if (p->max_search > MAX_VICTIM_SEARCH)
+ p->max_search = MAX_VICTIM_SEARCH;
+
p->offset = sbi->last_victim[p->gc_mode];
}
@@ -290,7 +311,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
if (cost == max_cost)
continue;
- if (nsearched++ >= MAX_VICTIM_SEARCH) {
+ if (nsearched++ >= p.max_search) {
sbi->last_victim[p.gc_mode] = segno;
break;
}
@@ -407,8 +428,7 @@ next_step:
/* set page dirty and write it */
if (gc_type == FG_GC) {
- f2fs_submit_bio(sbi, NODE, true);
- wait_on_page_writeback(node_page);
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
set_page_dirty(node_page);
} else {
if (!PageWriteback(node_page))
@@ -447,7 +467,7 @@ next_step:
* as indirect or double indirect node blocks, are given, it must be a caller's
* bug.
*/
-block_t start_bidx_of_node(unsigned int node_ofs)
+block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
{
unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
unsigned int bidx;
@@ -464,7 +484,7 @@ block_t start_bidx_of_node(unsigned int node_ofs)
int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
bidx = node_ofs - 5 - dec;
}
- return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE;
+ return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi);
}
static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
@@ -508,10 +528,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
} else {
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- if (PageWriteback(page)) {
- f2fs_submit_bio(sbi, DATA, true);
- wait_on_page_writeback(page);
- }
+ f2fs_wait_on_page_writeback(page, DATA, true);
if (clear_page_dirty_for_io(page) &&
S_ISDIR(inode->i_mode)) {
@@ -575,7 +592,6 @@ next_step:
continue;
}
- start_bidx = start_bidx_of_node(nofs);
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
if (phase == 2) {
@@ -583,6 +599,8 @@ next_step:
if (IS_ERR(inode))
continue;
+ start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
+
data_page = find_data_page(inode,
start_bidx + ofs_in_node, false);
if (IS_ERR(data_page))
@@ -593,6 +611,8 @@ next_step:
} else {
inode = find_gc_inode(dni.ino, ilist);
if (inode) {
+ start_bidx = start_bidx_of_node(nofs,
+ F2FS_I(inode));
data_page = get_lock_data_page(inode,
start_bidx + ofs_in_node);
if (IS_ERR(data_page))
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 2c6a6bd08322..507056d22205 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -13,18 +13,26 @@
* whether IO subsystem is idle
* or not
*/
-#define GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */
-#define GC_THREAD_MAX_SLEEP_TIME 60000
-#define GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */
+#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */
+#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000
+#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */
#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
/* Search max. number of dirty segments to select a victim segment */
-#define MAX_VICTIM_SEARCH 20
+#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */
struct f2fs_gc_kthread {
struct task_struct *f2fs_gc_task;
wait_queue_head_t gc_wait_queue_head;
+
+ /* for gc sleep time */
+ unsigned int min_sleep_time;
+ unsigned int max_sleep_time;
+ unsigned int no_gc_sleep_time;
+
+ /* for changing gc mode */
+ unsigned int gc_idle;
};
struct inode_entry {
@@ -56,25 +64,25 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
}
-static inline long increase_sleep_time(long wait)
+static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
{
- if (wait == GC_THREAD_NOGC_SLEEP_TIME)
+ if (wait == gc_th->no_gc_sleep_time)
return wait;
- wait += GC_THREAD_MIN_SLEEP_TIME;
- if (wait > GC_THREAD_MAX_SLEEP_TIME)
- wait = GC_THREAD_MAX_SLEEP_TIME;
+ wait += gc_th->min_sleep_time;
+ if (wait > gc_th->max_sleep_time)
+ wait = gc_th->max_sleep_time;
return wait;
}
-static inline long decrease_sleep_time(long wait)
+static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
{
- if (wait == GC_THREAD_NOGC_SLEEP_TIME)
- wait = GC_THREAD_MAX_SLEEP_TIME;
+ if (wait == gc_th->no_gc_sleep_time)
+ wait = gc_th->max_sleep_time;
- wait -= GC_THREAD_MIN_SLEEP_TIME;
- if (wait <= GC_THREAD_MIN_SLEEP_TIME)
- wait = GC_THREAD_MIN_SLEEP_TIME;
+ wait -= gc_th->min_sleep_time;
+ if (wait <= gc_th->min_sleep_time)
+ wait = gc_th->min_sleep_time;
return wait;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2b2d45d19e3e..9339cd292047 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -56,7 +56,7 @@ static int do_read_inode(struct inode *inode)
if (IS_ERR(node_page))
return PTR_ERR(node_page);
- rn = page_address(node_page);
+ rn = F2FS_NODE(node_page);
ri = &(rn->i);
inode->i_mode = le16_to_cpu(ri->i_mode);
@@ -85,6 +85,7 @@ static int do_read_inode(struct inode *inode)
fi->i_advise = ri->i_advise;
fi->i_pino = le32_to_cpu(ri->i_pino);
get_extent_info(&fi->ext, ri->i_ext);
+ get_inline_info(fi, ri);
f2fs_put_page(node_page, 1);
return 0;
}
@@ -151,9 +152,9 @@ void update_inode(struct inode *inode, struct page *node_page)
struct f2fs_node *rn;
struct f2fs_inode *ri;
- wait_on_page_writeback(node_page);
+ f2fs_wait_on_page_writeback(node_page, NODE, false);
- rn = page_address(node_page);
+ rn = F2FS_NODE(node_page);
ri = &(rn->i);
ri->i_mode = cpu_to_le16(inode->i_mode);
@@ -164,6 +165,7 @@ void update_inode(struct inode *inode, struct page *node_page)
ri->i_size = cpu_to_le64(i_size_read(inode));
ri->i_blocks = cpu_to_le64(inode->i_blocks);
set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
+ set_raw_inline(F2FS_I(inode), ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
@@ -221,9 +223,6 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
return 0;
- if (wbc)
- f2fs_balance_fs(sbi);
-
/*
* We need to lock here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
@@ -231,6 +230,10 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
ilock = mutex_lock_op(sbi);
ret = update_inode_page(inode);
mutex_unlock_op(sbi, ilock);
+
+ if (wbc)
+ f2fs_balance_fs(sbi);
+
return ret;
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 64c07169df05..2a5359c990fc 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -83,21 +83,11 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
{
size_t slen = strlen(s);
size_t sublen = strlen(sub);
- int ret;
if (sublen > slen)
return 0;
- ret = memcmp(s + slen - sublen, sub, sublen);
- if (ret) { /* compare upper case */
- int i;
- char upper_sub[8];
- for (i = 0; i < sublen && i < sizeof(upper_sub); i++)
- upper_sub[i] = toupper(sub[i]);
- return !memcmp(s + slen - sublen, upper_sub, sublen);
- }
-
- return !ret;
+ return !strncasecmp(s + slen - sublen, sub, sublen);
}
/*
@@ -239,7 +229,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
if (!de)
goto fail;
- err = check_orphan_space(sbi);
+ err = acquire_orphan_inode(sbi);
if (err) {
kunmap(page);
f2fs_put_page(page, 0);
@@ -393,7 +383,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct page *old_dir_page;
- struct page *old_page;
+ struct page *old_page, *new_page;
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
@@ -415,7 +405,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
ilock = mutex_lock_op(sbi);
if (new_inode) {
- struct page *new_page;
err = -ENOTEMPTY;
if (old_dir_entry && !f2fs_empty_dir(new_inode))
@@ -427,14 +416,28 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!new_entry)
goto out_dir;
+ err = acquire_orphan_inode(sbi);
+ if (err)
+ goto put_out_dir;
+
+ if (update_dent_inode(old_inode, &new_dentry->d_name)) {
+ release_orphan_inode(sbi);
+ goto put_out_dir;
+ }
+
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
new_inode->i_ctime = CURRENT_TIME;
if (old_dir_entry)
drop_nlink(new_inode);
drop_nlink(new_inode);
+
if (!new_inode->i_nlink)
add_orphan_inode(sbi, new_inode->i_ino);
+ else
+ release_orphan_inode(sbi);
+
+ update_inode_page(old_inode);
update_inode_page(new_inode);
} else {
err = f2fs_add_link(new_dentry, old_inode);
@@ -467,6 +470,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
mutex_unlock_op(sbi, ilock);
return 0;
+put_out_dir:
+ f2fs_put_page(new_page, 1);
out_dir:
if (old_dir_entry) {
kunmap(old_dir_page);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b418aee09573..51ef27894433 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -315,9 +315,10 @@ cache:
* The maximum depth is four.
* Offset[0] will have raw inode offset.
*/
-static int get_node_path(long block, int offset[4], unsigned int noffset[4])
+static int get_node_path(struct f2fs_inode_info *fi, long block,
+ int offset[4], unsigned int noffset[4])
{
- const long direct_index = ADDRS_PER_INODE;
+ const long direct_index = ADDRS_PER_INODE(fi);
const long direct_blks = ADDRS_PER_BLOCK;
const long dptrs_per_blk = NIDS_PER_BLOCK;
const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
@@ -405,7 +406,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
int level, i;
int err = 0;
- level = get_node_path(index, offset, noffset);
+ level = get_node_path(F2FS_I(dn->inode), index, offset, noffset);
nids[0] = dn->inode->i_ino;
npage[0] = dn->inode_page;
@@ -565,7 +566,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
return PTR_ERR(page);
}
- rn = (struct f2fs_node *)page_address(page);
+ rn = F2FS_NODE(page);
if (depth < 3) {
for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
child_nid = le32_to_cpu(rn->in.nid[i]);
@@ -687,7 +688,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
trace_f2fs_truncate_inode_blocks_enter(inode, from);
- level = get_node_path(from, offset, noffset);
+ level = get_node_path(F2FS_I(inode), from, offset, noffset);
restart:
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@@ -698,7 +699,7 @@ restart:
set_new_dnode(&dn, inode, page, NULL, 0);
unlock_page(page);
- rn = page_address(page);
+ rn = F2FS_NODE(page);
switch (level) {
case 0:
case 1:
@@ -771,6 +772,33 @@ fail:
return err > 0 ? 0 : err;
}
+int truncate_xattr_node(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ nid_t nid = F2FS_I(inode)->i_xattr_nid;
+ struct dnode_of_data dn;
+ struct page *npage;
+
+ if (!nid)
+ return 0;
+
+ npage = get_node_page(sbi, nid);
+ if (IS_ERR(npage))
+ return PTR_ERR(npage);
+
+ F2FS_I(inode)->i_xattr_nid = 0;
+
+ /* need to do checkpoint during fsync */
+ F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
+
+ set_new_dnode(&dn, inode, page, npage, nid);
+
+ if (page)
+ dn.inode_page_locked = 1;
+ truncate_node(&dn);
+ return 0;
+}
+
/*
* Caller should grab and release a mutex by calling mutex_lock_op() and
* mutex_unlock_op().
@@ -781,22 +809,16 @@ int remove_inode_page(struct inode *inode)
struct page *page;
nid_t ino = inode->i_ino;
struct dnode_of_data dn;
+ int err;
page = get_node_page(sbi, ino);
if (IS_ERR(page))
return PTR_ERR(page);
- if (F2FS_I(inode)->i_xattr_nid) {
- nid_t nid = F2FS_I(inode)->i_xattr_nid;
- struct page *npage = get_node_page(sbi, nid);
-
- if (IS_ERR(npage))
- return PTR_ERR(npage);
-
- F2FS_I(inode)->i_xattr_nid = 0;
- set_new_dnode(&dn, inode, page, npage, nid);
- dn.inode_page_locked = 1;
- truncate_node(&dn);
+ err = truncate_xattr_node(inode, page);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return err;
}
/* 0 is possible, after f2fs_new_inode() is failed */
@@ -833,29 +855,32 @@ struct page *new_node_page(struct dnode_of_data *dn,
if (!page)
return ERR_PTR(-ENOMEM);
- get_node_info(sbi, dn->nid, &old_ni);
+ if (!inc_valid_node_count(sbi, dn->inode, 1)) {
+ err = -ENOSPC;
+ goto fail;
+ }
- SetPageUptodate(page);
- fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
+ get_node_info(sbi, dn->nid, &old_ni);
/* Reinitialize old_ni with new node page */
BUG_ON(old_ni.blk_addr != NULL_ADDR);
new_ni = old_ni;
new_ni.ino = dn->inode->i_ino;
-
- if (!inc_valid_node_count(sbi, dn->inode, 1)) {
- err = -ENOSPC;
- goto fail;
- }
set_node_addr(sbi, &new_ni, NEW_ADDR);
+
+ fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
set_cold_node(dn->inode, page);
+ SetPageUptodate(page);
+ set_page_dirty(page);
+
+ if (ofs == XATTR_NODE_OFFSET)
+ F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
dn->node_page = page;
if (ipage)
update_inode(dn->inode, ipage);
else
sync_inode_page(dn);
- set_page_dirty(page);
if (ofs == 0)
inc_valid_inode_count(sbi);
@@ -916,7 +941,6 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
f2fs_put_page(apage, 0);
else if (err == LOCKED_PAGE)
f2fs_put_page(apage, 1);
- return;
}
struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
@@ -1167,9 +1191,9 @@ static int f2fs_write_node_page(struct page *page,
/*
* It is very important to gather dirty pages and write at once, so that we can
* submit a big bio without interfering other data writes.
- * Be default, 512 pages (2MB), a segment size, is quite reasonable.
+ * Be default, 512 pages (2MB) * 3 node types, is more reasonable.
*/
-#define COLLECT_DIRTY_NODES 512
+#define COLLECT_DIRTY_NODES 1536
static int f2fs_write_node_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -1187,9 +1211,10 @@ static int f2fs_write_node_pages(struct address_space *mapping,
return 0;
/* if mounting is failed, skip writing node pages */
- wbc->nr_to_write = max_hw_blocks(sbi);
+ wbc->nr_to_write = 3 * max_hw_blocks(sbi);
sync_node_pages(sbi, 0, wbc);
- wbc->nr_to_write = nr_to_write - (max_hw_blocks(sbi) - wbc->nr_to_write);
+ wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) -
+ wbc->nr_to_write);
return 0;
}
@@ -1444,6 +1469,9 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
+ if (!nid)
+ return;
+
spin_lock(&nm_i->free_nid_list_lock);
i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
BUG_ON(!i || i->state != NID_ALLOC);
@@ -1484,8 +1512,8 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
SetPageUptodate(ipage);
fill_node_footer(ipage, ino, ino, 0, true);
- src = (struct f2fs_node *)page_address(page);
- dst = (struct f2fs_node *)page_address(ipage);
+ src = F2FS_NODE(page);
+ dst = F2FS_NODE(ipage);
memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i);
dst->i.i_size = 0;
@@ -1515,8 +1543,8 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
/* alloc temporal page for read node */
page = alloc_page(GFP_NOFS | __GFP_ZERO);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ if (!page)
+ return -ENOMEM;
lock_page(page);
/* scan the node segment */
@@ -1535,7 +1563,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
goto out;
lock_page(page);
- rn = (struct f2fs_node *)page_address(page);
+ rn = F2FS_NODE(page);
sum_entry->nid = rn->footer.nid;
sum_entry->version = 0;
sum_entry->ofs_in_node = 0;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index c65fb4f4230f..3496bb3e15dc 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -155,8 +155,7 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
static inline void fill_node_footer(struct page *page, nid_t nid,
nid_t ino, unsigned int ofs, bool reset)
{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(page);
if (reset)
memset(rn, 0, sizeof(*rn));
rn->footer.nid = cpu_to_le32(nid);
@@ -166,10 +165,8 @@ static inline void fill_node_footer(struct page *page, nid_t nid,
static inline void copy_node_footer(struct page *dst, struct page *src)
{
- void *src_addr = page_address(src);
- void *dst_addr = page_address(dst);
- struct f2fs_node *src_rn = (struct f2fs_node *)src_addr;
- struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr;
+ struct f2fs_node *src_rn = F2FS_NODE(src);
+ struct f2fs_node *dst_rn = F2FS_NODE(dst);
memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer));
}
@@ -177,45 +174,40 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
{
struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(page);
+
rn->footer.cp_ver = ckpt->checkpoint_ver;
rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
}
static inline nid_t ino_of_node(struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(node_page);
return le32_to_cpu(rn->footer.ino);
}
static inline nid_t nid_of_node(struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(node_page);
return le32_to_cpu(rn->footer.nid);
}
static inline unsigned int ofs_of_node(struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(node_page);
unsigned flag = le32_to_cpu(rn->footer.flag);
return flag >> OFFSET_BIT_SHIFT;
}
static inline unsigned long long cpver_of_node(struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(node_page);
return le64_to_cpu(rn->footer.cp_ver);
}
static inline block_t next_blkaddr_of_node(struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(node_page);
return le32_to_cpu(rn->footer.next_blkaddr);
}
@@ -237,6 +229,10 @@ static inline block_t next_blkaddr_of_node(struct page *node_page)
static inline bool IS_DNODE(struct page *node_page)
{
unsigned int ofs = ofs_of_node(node_page);
+
+ if (ofs == XATTR_NODE_OFFSET)
+ return false;
+
if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
ofs == 5 + 2 * NIDS_PER_BLOCK)
return false;
@@ -250,7 +246,7 @@ static inline bool IS_DNODE(struct page *node_page)
static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
{
- struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
+ struct f2fs_node *rn = F2FS_NODE(p);
wait_on_page_writeback(p);
@@ -263,7 +259,8 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
static inline nid_t get_nid(struct page *p, int off, bool i)
{
- struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
+ struct f2fs_node *rn = F2FS_NODE(p);
+
if (i)
return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]);
return le32_to_cpu(rn->in.nid[off]);
@@ -314,8 +311,7 @@ static inline void clear_cold_data(struct page *page)
static inline int is_node(struct page *page, int type)
{
- void *kaddr = page_address(page);
- struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ struct f2fs_node *rn = F2FS_NODE(page);
return le32_to_cpu(rn->footer.flag) & (1 << type);
}
@@ -325,7 +321,7 @@ static inline int is_node(struct page *page, int type)
static inline void set_cold_node(struct inode *inode, struct page *page)
{
- struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
+ struct f2fs_node *rn = F2FS_NODE(page);
unsigned int flag = le32_to_cpu(rn->footer.flag);
if (S_ISDIR(inode->i_mode))
@@ -337,7 +333,7 @@ static inline void set_cold_node(struct inode *inode, struct page *page)
static inline void set_mark(struct page *page, int mark, int type)
{
- struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
+ struct f2fs_node *rn = F2FS_NODE(page);
unsigned int flag = le32_to_cpu(rn->footer.flag);
if (mark)
flag |= (0x1 << type);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index d56d951c2253..51ef5eec33d7 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,8 +40,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
static int recover_dentry(struct page *ipage, struct inode *inode)
{
- void *kaddr = page_address(ipage);
- struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
+ struct f2fs_node *raw_node = F2FS_NODE(ipage);
struct f2fs_inode *raw_inode = &(raw_node->i);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
@@ -93,8 +92,7 @@ out:
static int recover_inode(struct inode *inode, struct page *node_page)
{
- void *kaddr = page_address(node_page);
- struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
+ struct f2fs_node *raw_node = F2FS_NODE(node_page);
struct f2fs_inode *raw_inode = &(raw_node->i);
if (!IS_INODE(node_page))
@@ -119,7 +117,7 @@ static int recover_inode(struct inode *inode, struct page *node_page)
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
- unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
+ unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
struct page *page;
block_t blkaddr;
@@ -131,8 +129,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
/* read node page */
page = alloc_page(GFP_F2FS_ZERO);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ if (!page)
+ return -ENOMEM;
lock_page(page);
while (1) {
@@ -215,6 +213,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
void *kaddr;
struct inode *inode;
struct page *node_page;
+ unsigned int offset;
block_t bidx;
int i;
@@ -259,8 +258,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
node_page = get_node_page(sbi, nid);
if (IS_ERR(node_page))
return PTR_ERR(node_page);
- bidx = start_bidx_of_node(ofs_of_node(node_page)) +
- le16_to_cpu(sum.ofs_in_node);
+
+ offset = ofs_of_node(node_page);
ino = ino_of_node(node_page);
f2fs_put_page(node_page, 1);
@@ -269,6 +268,9 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
if (IS_ERR(inode))
return PTR_ERR(inode);
+ bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
+ le16_to_cpu(sum.ofs_in_node);
+
truncate_hole(inode, bidx, bidx + 1);
iput(inode);
return 0;
@@ -277,6 +279,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct page *page, block_t blkaddr)
{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
unsigned int start, end;
struct dnode_of_data dn;
struct f2fs_summary sum;
@@ -284,9 +287,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
int err = 0, recovered = 0;
int ilock;
- start = start_bidx_of_node(ofs_of_node(page));
+ start = start_bidx_of_node(ofs_of_node(page), fi);
if (IS_INODE(page))
- end = start + ADDRS_PER_INODE;
+ end = start + ADDRS_PER_INODE(fi);
else
end = start + ADDRS_PER_BLOCK;
@@ -357,7 +360,7 @@ err:
static int recover_data(struct f2fs_sb_info *sbi,
struct list_head *head, int type)
{
- unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
+ unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
struct page *page;
int err = 0;
@@ -369,7 +372,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
/* read node page */
page = alloc_page(GFP_NOFS | __GFP_ZERO);
- if (IS_ERR(page))
+ if (!page)
return -ENOMEM;
lock_page(page);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index a86d125a9885..09af9c7b0f52 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -117,7 +117,6 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
}
mutex_unlock(&dirty_i->seglist_lock);
- return;
}
/*
@@ -261,7 +260,6 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
void *addr = curseg->sum_blk;
addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
memcpy(addr, sum, sizeof(struct f2fs_summary));
- return;
}
/*
@@ -542,12 +540,9 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- if (force) {
+ if (force)
new_curseg(sbi, type, true);
- goto out;
- }
-
- if (type == CURSEG_WARM_NODE)
+ else if (type == CURSEG_WARM_NODE)
new_curseg(sbi, type, false);
else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
new_curseg(sbi, type, false);
@@ -555,11 +550,9 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
change_curseg(sbi, type, true);
else
new_curseg(sbi, type, false);
-out:
#ifdef CONFIG_F2FS_STAT_FS
sbi->segment_count[curseg->alloc_type]++;
#endif
- return;
}
void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -611,18 +604,12 @@ static void f2fs_end_io_write(struct bio *bio, int err)
struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
{
struct bio *bio;
- struct bio_private *priv;
-retry:
- priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
- if (!priv) {
- cond_resched();
- goto retry;
- }
/* No failure on bio allocation */
bio = bio_alloc(GFP_NOIO, npages);
bio->bi_bdev = bdev;
- bio->bi_private = priv;
+ bio->bi_private = NULL;
+
return bio;
}
@@ -681,8 +668,17 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
do_submit_bio(sbi, type, false);
alloc_new:
if (sbi->bio[type] == NULL) {
+ struct bio_private *priv;
+retry:
+ priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
+ if (!priv) {
+ cond_resched();
+ goto retry;
+ }
+
sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi));
sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+ sbi->bio[type]->bi_private = priv;
/*
* The end_io will be assigned at the sumbission phase.
* Until then, let bio_add_page() merge consecutive IOs as much
@@ -702,6 +698,16 @@ alloc_new:
trace_f2fs_submit_write_page(page, blk_addr, type);
}
+void f2fs_wait_on_page_writeback(struct page *page,
+ enum page_type type, bool sync)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+ if (PageWriteback(page)) {
+ f2fs_submit_bio(sbi, type, sync);
+ wait_on_page_writeback(page);
+ }
+}
+
static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -1179,7 +1185,6 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
{
if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
- return;
}
int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 062424a0e4c3..bdd10eab8c40 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -142,6 +142,7 @@ struct victim_sel_policy {
int alloc_mode; /* LFS or SSR */
int gc_mode; /* GC_CB or GC_GREEDY */
unsigned long *dirty_segmap; /* dirty segment bitmap */
+ unsigned int max_search; /* maximum # of segments to search */
unsigned int offset; /* last scanned bitmap offset */
unsigned int ofs_unit; /* bitmap search unit */
unsigned int min_cost; /* minimum cost */
@@ -453,7 +454,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
static inline bool need_SSR(struct f2fs_sb_info *sbi)
{
- return (free_sections(sbi) < overprovision_sections(sbi));
+ return ((prefree_segments(sbi) / sbi->segs_per_sec)
+ + free_sections(sbi) < overprovision_sections(sbi));
}
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -470,7 +472,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
static inline int utilization(struct f2fs_sb_info *sbi)
{
- return div_u64(valid_user_blocks(sbi) * 100, sbi->user_block_count);
+ return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count);
}
/*
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 75c7dc363e92..13d0a0fe49dd 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -18,20 +18,25 @@
#include <linux/parser.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/exportfs.h>
#include <linux/blkdev.h>
#include <linux/f2fs_fs.h>
+#include <linux/sysfs.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "xattr.h"
+#include "gc.h"
#define CREATE_TRACE_POINTS
#include <trace/events/f2fs.h>
+static struct proc_dir_entry *f2fs_proc_root;
static struct kmem_cache *f2fs_inode_cachep;
+static struct kset *f2fs_kset;
enum {
Opt_gc_background,
@@ -42,6 +47,7 @@ enum {
Opt_noacl,
Opt_active_logs,
Opt_disable_ext_identify,
+ Opt_inline_xattr,
Opt_err,
};
@@ -54,9 +60,117 @@ static match_table_t f2fs_tokens = {
{Opt_noacl, "noacl"},
{Opt_active_logs, "active_logs=%u"},
{Opt_disable_ext_identify, "disable_ext_identify"},
+ {Opt_inline_xattr, "inline_xattr"},
{Opt_err, NULL},
};
+/* Sysfs support for f2fs */
+struct f2fs_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
+ ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *,
+ const char *, size_t);
+ int offset;
+};
+
+static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ struct f2fs_gc_kthread *gc_kth = sbi->gc_thread;
+ unsigned int *ui;
+
+ if (!gc_kth)
+ return -EINVAL;
+
+ ui = (unsigned int *)(((char *)gc_kth) + a->offset);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ struct f2fs_gc_kthread *gc_kth = sbi->gc_thread;
+ unsigned long t;
+ unsigned int *ui;
+ ssize_t ret;
+
+ if (!gc_kth)
+ return -EINVAL;
+
+ ui = (unsigned int *)(((char *)gc_kth) + a->offset);
+
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret < 0)
+ return ret;
+ *ui = t;
+ return count;
+}
+
+static ssize_t f2fs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_kobj);
+ struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+ return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_kobj);
+ struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+ return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void f2fs_sb_release(struct kobject *kobj)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+#define F2FS_ATTR_OFFSET(_name, _mode, _show, _store, _elname) \
+static struct f2fs_attr f2fs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+ .offset = offsetof(struct f2fs_gc_kthread, _elname), \
+}
+
+#define F2FS_RW_ATTR(name, elname) \
+ F2FS_ATTR_OFFSET(name, 0644, f2fs_sbi_show, f2fs_sbi_store, elname)
+
+F2FS_RW_ATTR(gc_min_sleep_time, min_sleep_time);
+F2FS_RW_ATTR(gc_max_sleep_time, max_sleep_time);
+F2FS_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time);
+F2FS_RW_ATTR(gc_idle, gc_idle);
+
+#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
+static struct attribute *f2fs_attrs[] = {
+ ATTR_LIST(gc_min_sleep_time),
+ ATTR_LIST(gc_max_sleep_time),
+ ATTR_LIST(gc_no_gc_sleep_time),
+ ATTR_LIST(gc_idle),
+ NULL,
+};
+
+static const struct sysfs_ops f2fs_attr_ops = {
+ .show = f2fs_attr_show,
+ .store = f2fs_attr_store,
+};
+
+static struct kobj_type f2fs_ktype = {
+ .default_attrs = f2fs_attrs,
+ .sysfs_ops = &f2fs_attr_ops,
+ .release = f2fs_sb_release,
+};
+
void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
{
struct va_format vaf;
@@ -126,11 +240,18 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_nouser_xattr:
clear_opt(sbi, XATTR_USER);
break;
+ case Opt_inline_xattr:
+ set_opt(sbi, INLINE_XATTR);
+ break;
#else
case Opt_nouser_xattr:
f2fs_msg(sb, KERN_INFO,
"nouser_xattr options not supported");
break;
+ case Opt_inline_xattr:
+ f2fs_msg(sb, KERN_INFO,
+ "inline_xattr options not supported");
+ break;
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
case Opt_noacl:
@@ -180,6 +301,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
set_inode_flag(fi, FI_NEW_INODE);
+ if (test_opt(F2FS_SB(sb), INLINE_XATTR))
+ set_inode_flag(fi, FI_INLINE_XATTR);
+
return &fi->vfs_inode;
}
@@ -205,7 +329,6 @@ static int f2fs_drop_inode(struct inode *inode)
static void f2fs_dirty_inode(struct inode *inode, int flags)
{
set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
- return;
}
static void f2fs_i_callback(struct rcu_head *head)
@@ -223,6 +346,12 @@ static void f2fs_put_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ if (sbi->s_proc) {
+ remove_proc_entry("segment_info", sbi->s_proc);
+ remove_proc_entry(sb->s_id, f2fs_proc_root);
+ }
+ kobject_del(&sbi->s_kobj);
+
f2fs_destroy_stats(sbi);
stop_gc_thread(sbi);
@@ -236,6 +365,8 @@ static void f2fs_put_super(struct super_block *sb)
destroy_segment_manager(sbi);
kfree(sbi->ckpt);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
sb->s_fs_info = NULL;
brelse(sbi->raw_super_buf);
@@ -325,6 +456,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",user_xattr");
else
seq_puts(seq, ",nouser_xattr");
+ if (test_opt(sbi, INLINE_XATTR))
+ seq_puts(seq, ",inline_xattr");
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
if (test_opt(sbi, POSIX_ACL))
@@ -340,6 +473,36 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+static int segment_info_seq_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main);
+ int i;
+
+ for (i = 0; i < total_segs; i++) {
+ seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1));
+ if (i != 0 && (i % 10) == 0)
+ seq_puts(seq, "\n");
+ else
+ seq_puts(seq, " ");
+ }
+ return 0;
+}
+
+static int segment_info_open_fs(struct inode *inode, struct file *file)
+{
+ return single_open(file, segment_info_seq_show, PDE_DATA(inode));
+}
+
+static const struct file_operations f2fs_seq_segment_info_fops = {
+ .owner = THIS_MODULE,
+ .open = segment_info_open_fs,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static int f2fs_remount(struct super_block *sb, int *flags, char *data)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -455,7 +618,7 @@ static const struct export_operations f2fs_export_ops = {
static loff_t max_file_size(unsigned bits)
{
- loff_t result = ADDRS_PER_INODE;
+ loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS);
loff_t leaf_count = ADDRS_PER_BLOCK;
/* two direct node blocks */
@@ -766,6 +929,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto fail;
+ if (f2fs_proc_root)
+ sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
+
+ if (sbi->s_proc)
+ proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
+ &f2fs_seq_segment_info_fops, sb);
+
if (test_opt(sbi, DISCARD)) {
struct request_queue *q = bdev_get_queue(sb->s_bdev);
if (!blk_queue_discard(q))
@@ -774,6 +944,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
"the device does not support discard");
}
+ sbi->s_kobj.kset = f2fs_kset;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
+ "%s", sb->s_id);
+ if (err)
+ goto fail;
+
return 0;
fail:
stop_gc_thread(sbi);
@@ -841,29 +1018,49 @@ static int __init init_f2fs_fs(void)
goto fail;
err = create_node_manager_caches();
if (err)
- goto fail;
+ goto free_inodecache;
err = create_gc_caches();
if (err)
- goto fail;
+ goto free_node_manager_caches;
err = create_checkpoint_caches();
if (err)
- goto fail;
+ goto free_gc_caches;
+ f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
+ if (!f2fs_kset) {
+ err = -ENOMEM;
+ goto free_checkpoint_caches;
+ }
err = register_filesystem(&f2fs_fs_type);
if (err)
- goto fail;
+ goto free_kset;
f2fs_create_root_stats();
+ f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
+ return 0;
+
+free_kset:
+ kset_unregister(f2fs_kset);
+free_checkpoint_caches:
+ destroy_checkpoint_caches();
+free_gc_caches:
+ destroy_gc_caches();
+free_node_manager_caches:
+ destroy_node_manager_caches();
+free_inodecache:
+ destroy_inodecache();
fail:
return err;
}
static void __exit exit_f2fs_fs(void)
{
+ remove_proc_entry("fs/f2fs", NULL);
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
destroy_checkpoint_caches();
destroy_gc_caches();
destroy_node_manager_caches();
destroy_inodecache();
+ kset_unregister(f2fs_kset);
}
module_init(init_f2fs_fs)
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 3ab07ecd86ca..1ac8a5f6e380 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -246,40 +246,170 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int name_index)
return handler;
}
+static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int name_index,
+ size_t name_len, const char *name)
+{
+ struct f2fs_xattr_entry *entry;
+
+ list_for_each_xattr(entry, base_addr) {
+ if (entry->e_name_index != name_index)
+ continue;
+ if (entry->e_name_len != name_len)
+ continue;
+ if (!memcmp(entry->e_name, name, name_len))
+ break;
+ }
+ return entry;
+}
+
+static void *read_all_xattrs(struct inode *inode, struct page *ipage)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_xattr_header *header;
+ size_t size = PAGE_SIZE, inline_size = 0;
+ void *txattr_addr;
+
+ inline_size = inline_xattr_size(inode);
+
+ txattr_addr = kzalloc(inline_size + size, GFP_KERNEL);
+ if (!txattr_addr)
+ return NULL;
+
+ /* read from inline xattr */
+ if (inline_size) {
+ struct page *page = NULL;
+ void *inline_addr;
+
+ if (ipage) {
+ inline_addr = inline_xattr_addr(ipage);
+ } else {
+ page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(page))
+ goto fail;
+ inline_addr = inline_xattr_addr(page);
+ }
+ memcpy(txattr_addr, inline_addr, inline_size);
+ f2fs_put_page(page, 1);
+ }
+
+ /* read from xattr node block */
+ if (F2FS_I(inode)->i_xattr_nid) {
+ struct page *xpage;
+ void *xattr_addr;
+
+ /* The inode already has an extended attribute block. */
+ xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
+ if (IS_ERR(xpage))
+ goto fail;
+
+ xattr_addr = page_address(xpage);
+ memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE);
+ f2fs_put_page(xpage, 1);
+ }
+
+ header = XATTR_HDR(txattr_addr);
+
+ /* never been allocated xattrs */
+ if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) {
+ header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
+ header->h_refcount = cpu_to_le32(1);
+ }
+ return txattr_addr;
+fail:
+ kzfree(txattr_addr);
+ return NULL;
+}
+
+static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
+ void *txattr_addr, struct page *ipage)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ size_t inline_size = 0;
+ void *xattr_addr;
+ struct page *xpage;
+ nid_t new_nid = 0;
+ int err;
+
+ inline_size = inline_xattr_size(inode);
+
+ if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid)
+ if (!alloc_nid(sbi, &new_nid))
+ return -ENOSPC;
+
+ /* write to inline xattr */
+ if (inline_size) {
+ struct page *page = NULL;
+ void *inline_addr;
+
+ if (ipage) {
+ inline_addr = inline_xattr_addr(ipage);
+ } else {
+ page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(page)) {
+ alloc_nid_failed(sbi, new_nid);
+ return PTR_ERR(page);
+ }
+ inline_addr = inline_xattr_addr(page);
+ }
+ memcpy(inline_addr, txattr_addr, inline_size);
+ f2fs_put_page(page, 1);
+
+ /* no need to use xattr node block */
+ if (hsize <= inline_size) {
+ err = truncate_xattr_node(inode, ipage);
+ alloc_nid_failed(sbi, new_nid);
+ return err;
+ }
+ }
+
+ /* write to xattr node block */
+ if (F2FS_I(inode)->i_xattr_nid) {
+ xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
+ if (IS_ERR(xpage)) {
+ alloc_nid_failed(sbi, new_nid);
+ return PTR_ERR(xpage);
+ }
+ BUG_ON(new_nid);
+ } else {
+ struct dnode_of_data dn;
+ set_new_dnode(&dn, inode, NULL, NULL, new_nid);
+ xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
+ if (IS_ERR(xpage)) {
+ alloc_nid_failed(sbi, new_nid);
+ return PTR_ERR(xpage);
+ }
+ alloc_nid_done(sbi, new_nid);
+ }
+
+ xattr_addr = page_address(xpage);
+ memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE -
+ sizeof(struct node_footer));
+ set_page_dirty(xpage);
+ f2fs_put_page(xpage, 1);
+
+ /* need to checkpoint during fsync */
+ F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
+ return 0;
+}
+
int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
void *buffer, size_t buffer_size)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_xattr_entry *entry;
- struct page *page;
void *base_addr;
- int error = 0, found = 0;
+ int error = 0;
size_t value_len, name_len;
if (name == NULL)
return -EINVAL;
name_len = strlen(name);
- if (!fi->i_xattr_nid)
- return -ENODATA;
+ base_addr = read_all_xattrs(inode, NULL);
+ if (!base_addr)
+ return -ENOMEM;
- page = get_node_page(sbi, fi->i_xattr_nid);
- if (IS_ERR(page))
- return PTR_ERR(page);
- base_addr = page_address(page);
-
- list_for_each_xattr(entry, base_addr) {
- if (entry->e_name_index != name_index)
- continue;
- if (entry->e_name_len != name_len)
- continue;
- if (!memcmp(entry->e_name, name, name_len)) {
- found = 1;
- break;
- }
- }
- if (!found) {
+ entry = __find_xattr(base_addr, name_index, name_len, name);
+ if (IS_XATTR_LAST_ENTRY(entry)) {
error = -ENODATA;
goto cleanup;
}
@@ -298,28 +428,21 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
error = value_len;
cleanup:
- f2fs_put_page(page, 1);
+ kzfree(base_addr);
return error;
}
ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct inode *inode = dentry->d_inode;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_xattr_entry *entry;
- struct page *page;
void *base_addr;
int error = 0;
size_t rest = buffer_size;
- if (!fi->i_xattr_nid)
- return 0;
-
- page = get_node_page(sbi, fi->i_xattr_nid);
- if (IS_ERR(page))
- return PTR_ERR(page);
- base_addr = page_address(page);
+ base_addr = read_all_xattrs(inode, NULL);
+ if (!base_addr)
+ return -ENOMEM;
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
@@ -342,7 +465,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
}
error = buffer_size - rest;
cleanup:
- f2fs_put_page(page, 1);
+ kzfree(base_addr);
return error;
}
@@ -351,14 +474,13 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_xattr_header *header = NULL;
struct f2fs_xattr_entry *here, *last;
- struct page *page;
void *base_addr;
- int error, found, free, newsize;
+ int found, newsize;
size_t name_len;
- char *pval;
int ilock;
+ __u32 new_hsize;
+ int error = -ENOMEM;
if (name == NULL)
return -EINVAL;
@@ -368,67 +490,21 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
name_len = strlen(name);
- if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN)
+ if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN(inode))
return -ERANGE;
f2fs_balance_fs(sbi);
ilock = mutex_lock_op(sbi);
- if (!fi->i_xattr_nid) {
- /* Allocate new attribute block */
- struct dnode_of_data dn;
-
- if (!alloc_nid(sbi, &fi->i_xattr_nid)) {
- error = -ENOSPC;
- goto exit;
- }
- set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
- mark_inode_dirty(inode);
-
- page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
- if (IS_ERR(page)) {
- alloc_nid_failed(sbi, fi->i_xattr_nid);
- fi->i_xattr_nid = 0;
- error = PTR_ERR(page);
- goto exit;
- }
-
- alloc_nid_done(sbi, fi->i_xattr_nid);
- base_addr = page_address(page);
- header = XATTR_HDR(base_addr);
- header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
- header->h_refcount = cpu_to_le32(1);
- } else {
- /* The inode already has an extended attribute block. */
- page = get_node_page(sbi, fi->i_xattr_nid);
- if (IS_ERR(page)) {
- error = PTR_ERR(page);
- goto exit;
- }
-
- base_addr = page_address(page);
- header = XATTR_HDR(base_addr);
- }
-
- if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) {
- error = -EIO;
- goto cleanup;
- }
+ base_addr = read_all_xattrs(inode, ipage);
+ if (!base_addr)
+ goto exit;
/* find entry with wanted name. */
- found = 0;
- list_for_each_xattr(here, base_addr) {
- if (here->e_name_index != name_index)
- continue;
- if (here->e_name_len != name_len)
- continue;
- if (!memcmp(here->e_name, name, name_len)) {
- found = 1;
- break;
- }
- }
+ here = __find_xattr(base_addr, name_index, name_len, name);
+ found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1;
last = here;
while (!IS_XATTR_LAST_ENTRY(last))
@@ -439,22 +515,25 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
/* 1. Check space */
if (value) {
- /* If value is NULL, it is remove operation.
+ int free;
+ /*
+ * If value is NULL, it is remove operation.
* In case of update operation, we caculate free.
*/
- free = MIN_OFFSET - ((char *)last - (char *)header);
+ free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
if (found)
free = free - ENTRY_SIZE(here);
if (free < newsize) {
error = -ENOSPC;
- goto cleanup;
+ goto exit;
}
}
/* 2. Remove old entry */
if (found) {
- /* If entry is found, remove old entry.
+ /*
+ * If entry is found, remove old entry.
* If not found, remove operation is not needed.
*/
struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here);
@@ -465,10 +544,15 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
memset(last, 0, oldsize);
}
+ new_hsize = (char *)last - (char *)base_addr;
+
/* 3. Write new entry */
if (value) {
- /* Before we come here, old entry is removed.
- * We just write new entry. */
+ char *pval;
+ /*
+ * Before we come here, old entry is removed.
+ * We just write new entry.
+ */
memset(last, 0, newsize);
last->e_name_index = name_index;
last->e_name_len = name_len;
@@ -476,26 +560,25 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
pval = last->e_name + name_len;
memcpy(pval, value, value_len);
last->e_value_size = cpu_to_le16(value_len);
+ new_hsize += newsize;
}
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ error = write_all_xattrs(inode, new_hsize, base_addr, ipage);
+ if (error)
+ goto exit;
if (is_inode_flag_set(fi, FI_ACL_MODE)) {
inode->i_mode = fi->i_acl_mode;
inode->i_ctime = CURRENT_TIME;
clear_inode_flag(fi, FI_ACL_MODE);
}
+
if (ipage)
update_inode(inode, ipage);
else
update_inode_page(inode);
- mutex_unlock_op(sbi, ilock);
-
- return 0;
-cleanup:
- f2fs_put_page(page, 1);
exit:
mutex_unlock_op(sbi, ilock);
+ kzfree(base_addr);
return error;
}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 3c0817bef25d..02a08fb88a15 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -51,7 +51,7 @@ struct f2fs_xattr_entry {
#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr))
#define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr))
-#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr)+1))
+#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1))
#define XATTR_ROUND (3)
#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND)
@@ -69,17 +69,16 @@ struct f2fs_xattr_entry {
!IS_XATTR_LAST_ENTRY(entry);\
entry = XATTR_NEXT_ENTRY(entry))
+#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \
+ sizeof(struct node_footer) - sizeof(__u32))
-#define MIN_OFFSET XATTR_ALIGN(PAGE_SIZE - \
- sizeof(struct node_footer) - \
- sizeof(__u32))
-
-#define MAX_VALUE_LEN (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \
- sizeof(struct f2fs_xattr_entry))
+#define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \
+ sizeof(struct f2fs_xattr_header) - \
+ sizeof(struct f2fs_xattr_entry))
/*
* On-disk structure of f2fs_xattr
- * We use only 1 block for xattr.
+ * We use inline xattrs space + 1 block for xattr.
*
* +--------------------+
* | f2fs_xattr_header |
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 6599222536eb..65343c3741ff 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -730,14 +730,14 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+ BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
O_RDONLY | O_WRONLY | O_RDWR |
O_CREAT | O_EXCL | O_NOCTTY |
O_TRUNC | O_APPEND | /* O_NONBLOCK | */
__O_SYNC | O_DSYNC | FASYNC |
O_DIRECT | O_LARGEFILE | O_DIRECTORY |
O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
- __FMODE_EXEC | O_PATH
+ __FMODE_EXEC | O_PATH | __O_TMPFILE
));
fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/file_table.c b/fs/file_table.c
index b44e4c559786..322cd37626cb 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -385,6 +385,10 @@ static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
*/
void file_sb_list_add(struct file *file, struct super_block *sb)
{
+ if (likely(!(file->f_mode & FMODE_WRITE)))
+ return;
+ if (!S_ISREG(file_inode(file)->i_mode))
+ return;
lg_local_lock(&files_lglock);
__file_sb_list_add(file, sb);
lg_local_unlock(&files_lglock);
@@ -450,8 +454,6 @@ void mark_files_ro(struct super_block *sb)
lg_global_lock(&files_lglock);
do_file_list_for_each_entry(sb, f) {
- if (!S_ISREG(file_inode(f)->i_mode))
- continue;
if (!file_count(f))
continue;
if (!(f->f_mode & FMODE_WRITE))
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 0e91a3c9fdb2..318e8433527c 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -558,3 +558,74 @@ void __fscache_cookie_put(struct fscache_cookie *cookie)
_leave("");
}
+
+/*
+ * check the consistency between the netfs inode and the backing cache
+ *
+ * NOTE: it only serves no-index type
+ */
+int __fscache_check_consistency(struct fscache_cookie *cookie)
+{
+ struct fscache_operation *op;
+ struct fscache_object *object;
+ int ret;
+
+ _enter("%p,", cookie);
+
+ ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
+
+ if (fscache_wait_for_deferred_lookup(cookie) < 0)
+ return -ERESTARTSYS;
+
+ if (hlist_empty(&cookie->backing_objects))
+ return 0;
+
+ op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);
+ if (!op)
+ return -ENOMEM;
+
+ fscache_operation_init(op, NULL, NULL);
+ op->flags = FSCACHE_OP_MYTHREAD |
+ (1 << FSCACHE_OP_WAITING);
+
+ spin_lock(&cookie->lock);
+
+ if (hlist_empty(&cookie->backing_objects))
+ goto inconsistent;
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object, cookie_link);
+ if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
+ goto inconsistent;
+
+ op->debug_id = atomic_inc_return(&fscache_op_debug_id);
+
+ atomic_inc(&cookie->n_active);
+ if (fscache_submit_op(object, op) < 0)
+ goto submit_failed;
+
+ /* the work queue now carries its own ref on the object */
+ spin_unlock(&cookie->lock);
+
+ ret = fscache_wait_for_operation_activation(object, op,
+ NULL, NULL, NULL);
+ if (ret == 0) {
+ /* ask the cache to honour the operation */
+ ret = object->cache->ops->check_consistency(op);
+ fscache_op_complete(op, false);
+ } else if (ret == -ENOBUFS) {
+ ret = 0;
+ }
+
+ fscache_put_operation(op);
+ _leave(" = %d", ret);
+ return ret;
+
+submit_failed:
+ atomic_dec(&cookie->n_active);
+inconsistent:
+ spin_unlock(&cookie->lock);
+ kfree(op);
+ _leave(" = -ESTALE");
+ return -ESTALE;
+}
+EXPORT_SYMBOL(__fscache_check_consistency);
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 12d505bedb5c..4226f6680b06 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -130,6 +130,12 @@ extern void fscache_operation_gc(struct work_struct *);
/*
* page.c
*/
+extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *);
+extern int fscache_wait_for_operation_activation(struct fscache_object *,
+ struct fscache_operation *,
+ atomic_t *,
+ atomic_t *,
+ void (*)(struct fscache_operation *));
extern void fscache_invalidate_writes(struct fscache_cookie *);
/*
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index d479ab3c63e4..8702b732109a 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -278,7 +278,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
/*
* wait for a deferred lookup to complete
*/
-static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
+int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
{
unsigned long jif;
@@ -322,42 +322,46 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
/*
* wait for an object to become active (or dead)
*/
-static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
- struct fscache_retrieval *op,
- atomic_t *stat_op_waits,
- atomic_t *stat_object_dead)
+int fscache_wait_for_operation_activation(struct fscache_object *object,
+ struct fscache_operation *op,
+ atomic_t *stat_op_waits,
+ atomic_t *stat_object_dead,
+ void (*do_cancel)(struct fscache_operation *))
{
int ret;
- if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags))
+ if (!test_bit(FSCACHE_OP_WAITING, &op->flags))
goto check_if_dead;
_debug(">>> WT");
- fscache_stat(stat_op_waits);
- if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+ if (stat_op_waits)
+ fscache_stat(stat_op_waits);
+ if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
fscache_wait_bit_interruptible,
TASK_INTERRUPTIBLE) != 0) {
- ret = fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
+ ret = fscache_cancel_op(op, do_cancel);
if (ret == 0)
return -ERESTARTSYS;
/* it's been removed from the pending queue by another party,
* so we should get to run shortly */
- wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+ wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
fscache_wait_bit, TASK_UNINTERRUPTIBLE);
}
_debug("<<< GO");
check_if_dead:
- if (op->op.state == FSCACHE_OP_ST_CANCELLED) {
- fscache_stat(stat_object_dead);
+ if (op->state == FSCACHE_OP_ST_CANCELLED) {
+ if (stat_object_dead)
+ fscache_stat(stat_object_dead);
_leave(" = -ENOBUFS [cancelled]");
return -ENOBUFS;
}
if (unlikely(fscache_object_is_dead(object))) {
- pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->op.state);
- fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
- fscache_stat(stat_object_dead);
+ pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state);
+ fscache_cancel_op(op, do_cancel);
+ if (stat_object_dead)
+ fscache_stat(stat_object_dead);
return -ENOBUFS;
}
return 0;
@@ -432,10 +436,11 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
/* we wait for the operation to become active, and then process it
* *here*, in this thread, and not in the thread pool */
- ret = fscache_wait_for_retrieval_activation(
- object, op,
+ ret = fscache_wait_for_operation_activation(
+ object, &op->op,
__fscache_stat(&fscache_n_retrieval_op_waits),
- __fscache_stat(&fscache_n_retrievals_object_dead));
+ __fscache_stat(&fscache_n_retrievals_object_dead),
+ fscache_do_cancel_retrieval);
if (ret < 0)
goto error;
@@ -557,10 +562,11 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
/* we wait for the operation to become active, and then process it
* *here*, in this thread, and not in the thread pool */
- ret = fscache_wait_for_retrieval_activation(
- object, op,
+ ret = fscache_wait_for_operation_activation(
+ object, &op->op,
__fscache_stat(&fscache_n_retrieval_op_waits),
- __fscache_stat(&fscache_n_retrievals_object_dead));
+ __fscache_stat(&fscache_n_retrievals_object_dead),
+ fscache_do_cancel_retrieval);
if (ret < 0)
goto error;
@@ -658,10 +664,11 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
fscache_stat(&fscache_n_alloc_ops);
- ret = fscache_wait_for_retrieval_activation(
- object, op,
+ ret = fscache_wait_for_operation_activation(
+ object, &op->op,
__fscache_stat(&fscache_n_alloc_op_waits),
- __fscache_stat(&fscache_n_allocs_object_dead));
+ __fscache_stat(&fscache_n_allocs_object_dead),
+ fscache_do_cancel_retrieval);
if (ret < 0)
goto error;
@@ -694,6 +701,22 @@ nobufs:
EXPORT_SYMBOL(__fscache_alloc_page);
/*
+ * Unmark pages allocate in the readahead code path (via:
+ * fscache_readpages_or_alloc) after delegating to the base filesystem
+ */
+void __fscache_readpages_cancel(struct fscache_cookie *cookie,
+ struct list_head *pages)
+{
+ struct page *page;
+
+ list_for_each_entry(page, pages, lru) {
+ if (PageFsCache(page))
+ __fscache_uncache_page(cookie, page);
+ }
+}
+EXPORT_SYMBOL(__fscache_readpages_cancel);
+
+/*
* release a write op reference
*/
static void fscache_release_write_op(struct fscache_operation *_op)
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index aef34b1e635e..adbfd66b380f 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -568,6 +568,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev,
return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
}
+static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL);
static ssize_t cuse_class_abort_store(struct device *dev,
struct device_attribute *attr,
@@ -578,12 +579,14 @@ static ssize_t cuse_class_abort_store(struct device *dev,
fuse_abort_conn(&cc->fc);
return count;
}
+static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store);
-static struct device_attribute cuse_class_dev_attrs[] = {
- __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL),
- __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store),
- { }
+static struct attribute *cuse_class_dev_attrs[] = {
+ &dev_attr_waiting.attr,
+ &dev_attr_abort.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(cuse_class_dev);
static struct miscdevice cuse_miscdev = {
.minor = MISC_DYNAMIC_MINOR,
@@ -609,7 +612,7 @@ static int __init cuse_init(void)
if (IS_ERR(cuse_class))
return PTR_ERR(cuse_class);
- cuse_class->dev_attrs = cuse_class_dev_attrs;
+ cuse_class->dev_groups = cuse_class_dev_groups;
rc = misc_register(&cuse_miscdev);
if (rc) {
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1d55f9465400..ef74ad5fd362 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1765,11 +1765,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
/* Look up request on processing list by unique ID */
static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
{
- struct list_head *entry;
+ struct fuse_req *req;
- list_for_each(entry, &fc->processing) {
- struct fuse_req *req;
- req = list_entry(entry, struct fuse_req, list);
+ list_for_each_entry(req, &fc->processing, list) {
if (req->in.h.unique == unique || req->intr_unique == unique)
return req;
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0eda52738ec4..3ac91086f41f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -182,10 +182,11 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
struct inode *inode;
struct dentry *parent;
struct fuse_conn *fc;
+ int ret;
inode = ACCESS_ONCE(entry->d_inode);
if (inode && is_bad_inode(inode))
- return 0;
+ goto invalid;
else if (fuse_dentry_time(entry) < get_jiffies_64()) {
int err;
struct fuse_entry_out outarg;
@@ -195,20 +196,23 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
/* For negative dentries, always do a fresh lookup */
if (!inode)
- return 0;
+ goto invalid;
+ ret = -ECHILD;
if (flags & LOOKUP_RCU)
- return -ECHILD;
+ goto out;
fc = get_fuse_conn(inode);
req = fuse_get_req_nopages(fc);
+ ret = PTR_ERR(req);
if (IS_ERR(req))
- return 0;
+ goto out;
forget = fuse_alloc_forget();
if (!forget) {
fuse_put_request(fc, req);
- return 0;
+ ret = -ENOMEM;
+ goto out;
}
attr_version = fuse_get_attr_version(fc);
@@ -227,7 +231,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
struct fuse_inode *fi = get_fuse_inode(inode);
if (outarg.nodeid != get_node_id(inode)) {
fuse_queue_forget(fc, forget, outarg.nodeid, 1);
- return 0;
+ goto invalid;
}
spin_lock(&fc->lock);
fi->nlookup++;
@@ -235,7 +239,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
}
kfree(forget);
if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
- return 0;
+ goto invalid;
fuse_change_attributes(inode, &outarg.attr,
entry_attr_timeout(&outarg),
@@ -249,7 +253,15 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
dput(parent);
}
}
- return 1;
+ ret = 1;
+out:
+ return ret;
+
+invalid:
+ ret = 0;
+ if (check_submounts_and_drop(entry) != 0)
+ ret = 1;
+ goto out;
}
static int invalid_nodeid(u64 nodeid)
@@ -267,26 +279,6 @@ int fuse_valid_type(int m)
S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
}
-/*
- * Add a directory inode to a dentry, ensuring that no other dentry
- * refers to this inode. Called with fc->inst_mutex.
- */
-static struct dentry *fuse_d_add_directory(struct dentry *entry,
- struct inode *inode)
-{
- struct dentry *alias = d_find_alias(inode);
- if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
- /* This tries to shrink the subtree below alias */
- fuse_invalidate_entry(alias);
- dput(alias);
- if (!hlist_empty(&inode->i_dentry))
- return ERR_PTR(-EBUSY);
- } else {
- dput(alias);
- }
- return d_splice_alias(inode, entry);
-}
-
int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
@@ -345,6 +337,24 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
return err;
}
+static struct dentry *fuse_materialise_dentry(struct dentry *dentry,
+ struct inode *inode)
+{
+ struct dentry *newent;
+
+ if (inode && S_ISDIR(inode->i_mode)) {
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ mutex_lock(&fc->inst_mutex);
+ newent = d_materialise_unique(dentry, inode);
+ mutex_unlock(&fc->inst_mutex);
+ } else {
+ newent = d_materialise_unique(dentry, inode);
+ }
+
+ return newent;
+}
+
static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
unsigned int flags)
{
@@ -352,7 +362,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
struct fuse_entry_out outarg;
struct inode *inode;
struct dentry *newent;
- struct fuse_conn *fc = get_fuse_conn(dir);
bool outarg_valid = true;
err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
@@ -368,16 +377,10 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
if (inode && get_node_id(inode) == FUSE_ROOT_ID)
goto out_iput;
- if (inode && S_ISDIR(inode->i_mode)) {
- mutex_lock(&fc->inst_mutex);
- newent = fuse_d_add_directory(entry, inode);
- mutex_unlock(&fc->inst_mutex);
- err = PTR_ERR(newent);
- if (IS_ERR(newent))
- goto out_iput;
- } else {
- newent = d_splice_alias(inode, entry);
- }
+ newent = fuse_materialise_dentry(entry, inode);
+ err = PTR_ERR(newent);
+ if (IS_ERR(newent))
+ goto out_err;
entry = newent ? newent : entry;
if (outarg_valid)
@@ -1174,6 +1177,8 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
return -EIO;
if (reclen > nbytes)
break;
+ if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+ return -EIO;
if (!dir_emit(ctx, dirent->name, dirent->namelen,
dirent->ino, dirent->type))
@@ -1223,30 +1228,46 @@ static int fuse_direntplus_link(struct file *file,
if (name.name[1] == '.' && name.len == 2)
return 0;
}
+
+ if (invalid_nodeid(o->nodeid))
+ return -EIO;
+ if (!fuse_valid_type(o->attr.mode))
+ return -EIO;
+
fc = get_fuse_conn(dir);
name.hash = full_name_hash(name.name, name.len);
dentry = d_lookup(parent, &name);
- if (dentry && dentry->d_inode) {
+ if (dentry) {
inode = dentry->d_inode;
- if (get_node_id(inode) == o->nodeid) {
+ if (!inode) {
+ d_drop(dentry);
+ } else if (get_node_id(inode) != o->nodeid ||
+ ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+ err = d_invalidate(dentry);
+ if (err)
+ goto out;
+ } else if (is_bad_inode(inode)) {
+ err = -EIO;
+ goto out;
+ } else {
struct fuse_inode *fi;
fi = get_fuse_inode(inode);
spin_lock(&fc->lock);
fi->nlookup++;
spin_unlock(&fc->lock);
+ fuse_change_attributes(inode, &o->attr,
+ entry_attr_timeout(o),
+ attr_version);
+
/*
* The other branch to 'found' comes via fuse_iget()
* which bumps nlookup inside
*/
goto found;
}
- err = d_invalidate(dentry);
- if (err)
- goto out;
dput(dentry);
- dentry = NULL;
}
dentry = d_alloc(parent, &name);
@@ -1259,25 +1280,22 @@ static int fuse_direntplus_link(struct file *file,
if (!inode)
goto out;
- alias = d_materialise_unique(dentry, inode);
+ alias = fuse_materialise_dentry(dentry, inode);
err = PTR_ERR(alias);
if (IS_ERR(alias))
goto out;
+
if (alias) {
dput(dentry);
dentry = alias;
}
found:
- fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o),
- attr_version);
-
fuse_change_entry_timeout(dentry, o);
err = 0;
out:
- if (dentry)
- dput(dentry);
+ dput(dentry);
return err;
}
@@ -1299,6 +1317,8 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
return -EIO;
if (reclen > nbytes)
break;
+ if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+ return -EIO;
if (!over) {
/* We fill entries into dstbuf only as much as
@@ -1569,6 +1589,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
struct file *file)
{
struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_req *req;
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
@@ -1596,8 +1617,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
if (IS_ERR(req))
return PTR_ERR(req);
- if (is_truncate)
+ if (is_truncate) {
fuse_set_nowrite(inode);
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ }
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
@@ -1659,12 +1682,14 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
invalidate_inode_pages2(inode->i_mapping);
}
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
return 0;
error:
if (is_truncate)
fuse_release_nowrite(inode);
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
return err;
}
@@ -1728,6 +1753,8 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
fc->no_setxattr = 1;
err = -EOPNOTSUPP;
}
+ if (!err)
+ fuse_invalidate_attr(inode);
return err;
}
@@ -1857,6 +1884,8 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
fc->no_removexattr = 1;
err = -EOPNOTSUPP;
}
+ if (!err)
+ fuse_invalidate_attr(inode);
return err;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5c121fe19c5f..d409deafc67b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -629,7 +629,8 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fc->lock);
- if (attr_ver == fi->attr_version && size < inode->i_size) {
+ if (attr_ver == fi->attr_version && size < inode->i_size &&
+ !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
fi->attr_version = ++fc->attr_version;
i_size_write(inode, size);
}
@@ -1032,12 +1033,16 @@ static ssize_t fuse_perform_write(struct file *file,
{
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
int err = 0;
ssize_t res = 0;
if (is_bad_inode(inode))
return -EIO;
+ if (inode->i_size < pos + iov_iter_count(ii))
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
do {
struct fuse_req *req;
ssize_t count;
@@ -1073,6 +1078,7 @@ static ssize_t fuse_perform_write(struct file *file,
if (res > 0)
fuse_write_update_size(inode, pos);
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
fuse_invalidate_attr(inode);
return res > 0 ? res : err;
@@ -1529,7 +1535,6 @@ static int fuse_writepage_locked(struct page *page)
inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
- end_page_writeback(page);
spin_lock(&fc->lock);
list_add(&req->writepages_entry, &fi->writepages);
@@ -1537,6 +1542,8 @@ static int fuse_writepage_locked(struct page *page)
fuse_flush_writepages(inode);
spin_unlock(&fc->lock);
+ end_page_writeback(page);
+
return 0;
err_free:
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fde7249a3a96..5ced199b50bb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -115,6 +115,8 @@ struct fuse_inode {
enum {
/** Advise readdirplus */
FUSE_I_ADVISE_RDPLUS,
+ /** An operation changing file size is in progress */
+ FUSE_I_SIZE_UNSTABLE,
};
struct fuse_conn;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 0b578598c6ac..e0fe703ee3d6 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -201,7 +201,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
struct timespec old_mtime;
spin_lock(&fc->lock);
- if (attr_version != 0 && fi->attr_version > attr_version) {
+ if ((attr_version != 0 && fi->attr_version > attr_version) ||
+ test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
spin_unlock(&fc->lock);
return;
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index ee48ad37d9c0..1f7d8057ea68 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -122,14 +122,13 @@ out:
}
/**
- * gfs2_writeback_writepage - Write page for writeback mappings
+ * gfs2_writepage - Write page for writeback mappings
* @page: The page
* @wbc: The writeback control
*
*/
-static int gfs2_writeback_writepage(struct page *page,
- struct writeback_control *wbc)
+static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
{
int ret;
@@ -141,32 +140,6 @@ static int gfs2_writeback_writepage(struct page *page,
}
/**
- * gfs2_ordered_writepage - Write page for ordered data files
- * @page: The page to write
- * @wbc: The writeback control
- *
- */
-
-static int gfs2_ordered_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- struct inode *inode = page->mapping->host;
- struct gfs2_inode *ip = GFS2_I(inode);
- int ret;
-
- ret = gfs2_writepage_common(page, wbc);
- if (ret <= 0)
- return ret;
-
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
- }
- gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1);
- return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
-}
-
-/**
* __gfs2_jdata_writepage - The core of jdata writepage
* @page: The page to write
* @wbc: The writeback control
@@ -842,6 +815,8 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
unsigned int to = from + len;
int ret;
+ struct gfs2_trans *tr = current->journal_info;
+ BUG_ON(!tr);
BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
@@ -852,8 +827,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
goto failed;
}
- gfs2_trans_add_meta(ip->i_gl, dibh);
-
if (gfs2_is_stuffed(ip))
return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
@@ -861,6 +834,11 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
gfs2_page_add_databufs(ip, page, from, to);
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (tr->tr_num_buf_new)
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ else
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+
if (inode == sdp->sd_rindex) {
adjust_fs_space(inode);
@@ -1107,7 +1085,7 @@ cannot_release:
}
static const struct address_space_operations gfs2_writeback_aops = {
- .writepage = gfs2_writeback_writepage,
+ .writepage = gfs2_writepage,
.writepages = gfs2_writepages,
.readpage = gfs2_readpage,
.readpages = gfs2_readpages,
@@ -1123,7 +1101,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
};
static const struct address_space_operations gfs2_ordered_aops = {
- .writepage = gfs2_ordered_writepage,
+ .writepage = gfs2_writepage,
.writepages = gfs2_writepages,
.readpage = gfs2_readpage,
.readpages = gfs2_readpages,
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index f2448ab2aac5..d3a5d4e29ba5 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -93,12 +93,9 @@ invalid_gunlock:
if (!had_lock)
gfs2_glock_dq_uninit(&d_gh);
invalid:
- if (inode && S_ISDIR(inode->i_mode)) {
- if (have_submounts(dentry))
- goto valid;
- shrink_dcache_parent(dentry);
- }
- d_drop(dentry);
+ if (check_submounts_and_drop(dentry) != 0)
+ goto valid;
+
dput(parent);
return 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 0cb4c1557f20..2e5fc268d324 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1859,7 +1859,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
- ht = kzalloc(size, GFP_NOFS);
+ ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
if (ht == NULL)
ht = vzalloc(size);
if (!ht)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 72c3866a7320..0621b46d474d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -650,7 +650,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
+ int sync_state = inode->i_state & I_DIRTY;
struct gfs2_inode *ip = GFS2_I(inode);
int ret = 0, ret1 = 0;
@@ -660,6 +660,8 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
return ret1;
}
+ if (!gfs2_is_jdata(ip))
+ sync_state &= ~I_DIRTY_PAGES;
if (datasync)
sync_state &= ~I_DIRTY_SYNC;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9435384562a2..722329cac98f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1411,7 +1411,6 @@ __acquires(&lru_lock)
if (demote_ok(gl))
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
- smp_mb__after_clear_bit();
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put_nolock(gl);
spin_unlock(&gl->gl_spin);
@@ -1488,7 +1487,7 @@ static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
rcu_read_lock();
hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
- if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref))
+ if ((gl->gl_sbd == sdp) && atomic_inc_not_zero(&gl->gl_ref))
examiner(gl);
}
rcu_read_unlock();
@@ -1508,18 +1507,17 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
* thaw_glock - thaw out a glock which has an unprocessed reply waiting
* @gl: The glock to thaw
*
- * N.B. When we freeze a glock, we leave a ref to the glock outstanding,
- * so this has to result in the ref count being dropped by one.
*/
static void thaw_glock(struct gfs2_glock *gl)
{
if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
- return;
+ goto out;
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
- gfs2_glock_hold(gl);
- if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) {
+out:
gfs2_glock_put(gl);
+ }
}
/**
@@ -1536,7 +1534,6 @@ static void clear_glock(struct gfs2_glock *gl)
if (gl->gl_state != LM_ST_UNLOCKED)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
spin_unlock(&gl->gl_spin);
- gfs2_glock_hold(gl);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
}
@@ -1838,14 +1835,14 @@ int __init gfs2_glock_init(void)
glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
WQ_HIGHPRI | WQ_FREEZABLE, 0);
- if (IS_ERR(glock_workqueue))
- return PTR_ERR(glock_workqueue);
+ if (!glock_workqueue)
+ return -ENOMEM;
gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
WQ_MEM_RECLAIM | WQ_FREEZABLE,
0);
- if (IS_ERR(gfs2_delete_workqueue)) {
+ if (!gfs2_delete_workqueue) {
destroy_workqueue(glock_workqueue);
- return PTR_ERR(gfs2_delete_workqueue);
+ return -ENOMEM;
}
register_shrinker(&glock_shrinker);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 5f2e5224c51c..e2e0a90396e7 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -47,7 +47,8 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
* None of the buffers should be dirty, locked, or pinned.
*/
-static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
+static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
+ unsigned int nr_revokes)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct list_head *head = &gl->gl_ail_list;
@@ -57,7 +58,9 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
gfs2_log_lock(sdp);
spin_lock(&sdp->sd_ail_lock);
- list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) {
+ list_for_each_entry_safe_reverse(bd, tmp, head, bd_ail_gl_list) {
+ if (nr_revokes == 0)
+ break;
bh = bd->bd_bh;
if (bh->b_state & b_state) {
if (fsync)
@@ -65,6 +68,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
gfs2_ail_error(gl, bh);
}
gfs2_trans_add_revoke(sdp, bd);
+ nr_revokes--;
}
GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
spin_unlock(&sdp->sd_ail_lock);
@@ -91,7 +95,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
WARN_ON_ONCE(current->journal_info);
current->journal_info = &tr;
- __gfs2_ail_flush(gl, 0);
+ __gfs2_ail_flush(gl, 0, tr.tr_revokes);
gfs2_trans_end(sdp);
gfs2_log_flush(sdp, NULL);
@@ -101,15 +105,19 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
unsigned int revokes = atomic_read(&gl->gl_ail_count);
+ unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
int ret;
if (!revokes)
return;
- ret = gfs2_trans_begin(sdp, 0, revokes);
+ while (revokes > max_revokes)
+ max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
+
+ ret = gfs2_trans_begin(sdp, 0, max_revokes);
if (ret)
return;
- __gfs2_ail_flush(gl, fsync);
+ __gfs2_ail_flush(gl, fsync, max_revokes);
gfs2_trans_end(sdp);
gfs2_log_flush(sdp, NULL);
}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index bbb2715171cd..64915eeae5a7 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -594,7 +594,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
}
gfs2_glock_dq_uninit(ghs);
if (IS_ERR(d))
- return PTR_RET(d);
+ return PTR_ERR(d);
return error;
} else if (error != -ENOENT) {
goto fail_gunlock;
@@ -1750,6 +1750,10 @@ static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
struct gfs2_holder gh;
int ret;
+ /* For selinux during lookup */
+ if (gfs2_glock_is_locked_by_me(ip->i_gl))
+ return generic_getxattr(dentry, name, data, size);
+
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
ret = gfs2_glock_nq(&gh);
if (ret == 0) {
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 17c5b5d7dc88..010b9fb9fec6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -579,6 +579,24 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
return error;
}
+/**
+ * gfs2_meta_sync - Sync all buffers associated with a glock
+ * @gl: The glock
+ *
+ */
+
+static void gfs2_meta_sync(struct gfs2_glock *gl)
+{
+ struct address_space *mapping = gfs2_glock2aspace(gl);
+ int error;
+
+ filemap_fdatawrite(mapping);
+ error = filemap_fdatawait(mapping);
+
+ if (error)
+ gfs2_io_error(gl->gl_sbd);
+}
+
static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
{
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index e04d0e09ee7b..7b0f5043cf24 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -155,7 +155,7 @@ static int __init init_gfs2_fs(void)
goto fail_wq;
gfs2_control_wq = alloc_workqueue("gfs2_control",
- WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0);
+ WQ_UNBOUND | WQ_FREEZABLE, 0);
if (!gfs2_control_wq)
goto fail_recovery;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0da390686c08..932415050540 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -98,24 +98,6 @@ const struct address_space_operations gfs2_meta_aops = {
};
/**
- * gfs2_meta_sync - Sync all buffers associated with a glock
- * @gl: The glock
- *
- */
-
-void gfs2_meta_sync(struct gfs2_glock *gl)
-{
- struct address_space *mapping = gfs2_glock2aspace(gl);
- int error;
-
- filemap_fdatawrite(mapping);
- error = filemap_fdatawait(mapping);
-
- if (error)
- gfs2_io_error(gl->gl_sbd);
-}
-
-/**
* gfs2_getbuf - Get a buffer with a given address space
* @gl: the glock
* @blkno: the block number (filesystem scope)
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 0d4c843b6f8e..4823b934208a 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -48,21 +48,17 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
return inode->i_sb->s_fs_info;
}
-void gfs2_meta_sync(struct gfs2_glock *gl);
-
-struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
-int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
- int flags, struct buffer_head **bhp);
-int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
-struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
-
-void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
- int meta);
-
-void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
-
-int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
- struct buffer_head **bhp);
+extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
+extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
+ struct buffer_head **bhp);
+extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
+ int create);
+extern void gfs2_remove_from_journal(struct buffer_head *bh,
+ struct gfs2_trans *tr, int meta);
+extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
+ struct buffer_head **bhp);
static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
struct buffer_head **bhp)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 0262c190b6f9..19ff5e8c285c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -646,6 +646,48 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
return error;
}
+/**
+ * check_journal_clean - Make sure a journal is clean for a spectator mount
+ * @sdp: The GFS2 superblock
+ * @jd: The journal descriptor
+ *
+ * Returns: 0 if the journal is clean or locked, else an error
+ */
+static int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
+{
+ int error;
+ struct gfs2_holder j_gh;
+ struct gfs2_log_header_host head;
+ struct gfs2_inode *ip;
+
+ ip = GFS2_I(jd->jd_inode);
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP |
+ GL_EXACT | GL_NOCACHE, &j_gh);
+ if (error) {
+ fs_err(sdp, "Error locking journal for spectator mount.\n");
+ return -EPERM;
+ }
+ error = gfs2_jdesc_check(jd);
+ if (error) {
+ fs_err(sdp, "Error checking journal for spectator mount.\n");
+ goto out_unlock;
+ }
+ error = gfs2_find_jhead(jd, &head);
+ if (error) {
+ fs_err(sdp, "Error parsing journal for spectator mount.\n");
+ goto out_unlock;
+ }
+ if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+ error = -EPERM;
+ fs_err(sdp, "jid=%u: Journal is dirty, so the first mounter "
+ "must not be a spectator.\n", jd->jd_jid);
+ }
+
+out_unlock:
+ gfs2_glock_dq_uninit(&j_gh);
+ return error;
+}
+
static int init_journal(struct gfs2_sbd *sdp, int undo)
{
struct inode *master = sdp->sd_master_dir->d_inode;
@@ -732,8 +774,15 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
if (sdp->sd_lockstruct.ls_first) {
unsigned int x;
for (x = 0; x < sdp->sd_journals; x++) {
- error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x),
- true);
+ struct gfs2_jdesc *jd = gfs2_jdesc_find(sdp, x);
+
+ if (sdp->sd_args.ar_spectator) {
+ error = check_journal_clean(sdp, jd);
+ if (error)
+ goto fail_jinode_gh;
+ continue;
+ }
+ error = gfs2_recover_journal(jd, true);
if (error) {
fs_err(sdp, "error recovering journal %u: %d\n",
x, error);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index cddb05217512..25437280a207 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -361,6 +361,13 @@ retry:
return 0;
}
+static int hostfs_file_release(struct inode *inode, struct file *file)
+{
+ filemap_write_and_wait(inode->i_mapping);
+
+ return 0;
+}
+
int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
@@ -386,7 +393,7 @@ static const struct file_operations hostfs_file_fops = {
.write = do_sync_write,
.mmap = generic_file_mmap,
.open = hostfs_file_open,
- .release = NULL,
+ .release = hostfs_file_release,
.fsync = hostfs_fsync,
};
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a3f868ae3fd4..d19b30ababf1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -463,6 +463,14 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
return inode;
}
+/*
+ * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never
+ * be taken from reclaim -- unlike regular filesystems. This needs an
+ * annotation because huge_pmd_share() does an allocation under
+ * i_mmap_mutex.
+ */
+struct lock_class_key hugetlbfs_i_mmap_mutex_key;
+
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
struct inode *dir,
umode_t mode, dev_t dev)
@@ -474,6 +482,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
struct hugetlbfs_inode_info *info;
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
+ lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
+ &hugetlbfs_i_mmap_mutex_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -916,14 +926,8 @@ static int get_hstate_idx(int page_size_log)
return h - hstates;
}
-static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen)
-{
- return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)",
- dentry->d_name.name);
-}
-
static struct dentry_operations anon_ops = {
- .d_dname = hugetlb_dname
+ .d_dname = simple_dname
};
/*
diff --git a/fs/inode.c b/fs/inode.c
index d6dfb09c8280..93a0625b46e4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1525,7 +1525,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
* This function automatically handles read only file systems and media,
* as well as the "noatime" flag and inode specific "noatime" markers.
*/
-void touch_atime(struct path *path)
+void touch_atime(const struct path *path)
{
struct vfsmount *mnt = path->mnt;
struct inode *inode = path->dentry->d_inode;
diff --git a/fs/internal.h b/fs/internal.h
index 7c5f01cf619d..d20893795526 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -126,6 +126,7 @@ extern int invalidate_inodes(struct super_block *, bool);
* dcache.c
*/
extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
+extern int d_set_mounted(struct dentry *dentry);
/*
* read_write.c
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index c348d6d88624..e5d408a7ea4a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -117,8 +117,8 @@ static void destroy_inodecache(void)
static int isofs_remount(struct super_block *sb, int *flags, char *data)
{
- /* we probably want a lot more here */
- *flags |= MS_RDONLY;
+ if (!(*flags & MS_RDONLY))
+ return -EROFS;
return 0;
}
@@ -763,15 +763,6 @@ root_found:
*/
s->s_maxbytes = 0x80000000000LL;
- /*
- * The CDROM is read-only, has no nodes (devices) on it, and since
- * all of the files appear to be owned by root, we really do not want
- * to allow suid. (suid or devices will not show up unless we have
- * Rock Ridge extensions)
- */
-
- s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */;
-
/* Set this for reference. Its not currently used except on write
which we don't have .. */
@@ -1530,6 +1521,9 @@ struct inode *isofs_iget(struct super_block *sb,
static struct dentry *isofs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
+ /* We don't support read-write mounts */
+ if (!(flags & MS_RDONLY))
+ return ERR_PTR(-EACCES);
return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
}
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 11bb11f48b3a..bb217dcb41af 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -340,13 +340,13 @@ void journal_commit_transaction(journal_t *journal)
J_ASSERT(journal->j_committing_transaction == NULL);
commit_transaction = journal->j_running_transaction;
- J_ASSERT(commit_transaction->t_state == T_RUNNING);
trace_jbd_start_commit(journal, commit_transaction);
jbd_debug(1, "JBD: starting commit of transaction %d\n",
commit_transaction->t_tid);
spin_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_RUNNING);
commit_transaction->t_state = T_LOCKED;
trace_jbd_commit_locking(journal, commit_transaction);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 6510d6355729..2d04f9afafd7 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -90,6 +90,24 @@ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
static const char *journal_dev_name(journal_t *journal, char *buffer);
+#ifdef CONFIG_JBD_DEBUG
+void __jbd_debug(int level, const char *file, const char *func,
+ unsigned int line, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (level > journal_enable_debug)
+ return;
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+ va_end(args);
+}
+EXPORT_SYMBOL(__jbd_debug);
+#endif
+
/*
* Helper function used to manage commit timeouts
*/
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 559bec1a37b4..cf2fc0594063 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -343,14 +343,14 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
struct page *page = bh->b_page;
__u8 *addr;
__u32 csum32;
+ __be32 seq;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
- sequence = cpu_to_be32(sequence);
+ seq = cpu_to_be32(sequence);
addr = kmap_atomic(page);
- csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
- sizeof(sequence));
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
bh->b_size);
kunmap_atomic(addr);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 02c7ad9d7a41..52032647dd4a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -130,9 +130,10 @@ int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
}
-static __u32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
+static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
{
- __u32 csum, old_csum;
+ __u32 csum;
+ __be32 old_csum;
old_csum = sb->s_checksum;
sb->s_checksum = 0;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index d4851464b57e..3929c50428b1 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -178,7 +178,8 @@ static int jbd2_descr_block_csum_verify(journal_t *j,
void *buf)
{
struct jbd2_journal_block_tail *tail;
- __u32 provided, calculated;
+ __be32 provided;
+ __u32 calculated;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return 1;
@@ -190,8 +191,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j,
calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
tail->t_checksum = provided;
- provided = be32_to_cpu(provided);
- return provided == calculated;
+ return provided == cpu_to_be32(calculated);
}
/*
@@ -381,7 +381,8 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
{
struct commit_header *h;
- __u32 provided, calculated;
+ __be32 provided;
+ __u32 calculated;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return 1;
@@ -392,21 +393,20 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
h->h_chksum[0] = provided;
- provided = be32_to_cpu(provided);
- return provided == calculated;
+ return provided == cpu_to_be32(calculated);
}
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
void *buf, __u32 sequence)
{
__u32 csum32;
+ __be32 seq;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return 1;
- sequence = cpu_to_be32(sequence);
- csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
- sizeof(sequence));
+ seq = cpu_to_be32(sequence);
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
return tag->t_checksum == cpu_to_be16(csum32);
@@ -808,7 +808,8 @@ static int jbd2_revoke_block_csum_verify(journal_t *j,
void *buf)
{
struct jbd2_journal_revoke_tail *tail;
- __u32 provided, calculated;
+ __be32 provided;
+ __u32 calculated;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return 1;
@@ -820,8 +821,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j,
calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
tail->r_checksum = provided;
- provided = be32_to_cpu(provided);
- return provided == calculated;
+ return provided == cpu_to_be32(calculated);
}
/* Scan a revoke record, marking all blocks mentioned as revoked. */
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 8743ba9c6742..984c2bbf4f61 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -3047,6 +3047,14 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
dir_index = (u32) ctx->pos;
+ /*
+ * NFSv4 reserves cookies 1 and 2 for . and .. so the value
+ * we return to the vfs is one greater than the one we use
+ * internally.
+ */
+ if (dir_index)
+ dir_index--;
+
if (dir_index > 1) {
struct dir_table_slot dirtab_slot;
@@ -3086,7 +3094,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
if (p->header.flag & BT_INTERNAL) {
jfs_err("jfs_readdir: bad index table");
DT_PUTPAGE(mp);
- ctx->pos = -1;
+ ctx->pos = DIREND;
return 0;
}
} else {
@@ -3094,14 +3102,14 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
/*
* self "."
*/
- ctx->pos = 0;
+ ctx->pos = 1;
if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
return 0;
}
/*
* parent ".."
*/
- ctx->pos = 1;
+ ctx->pos = 2;
if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
return 0;
@@ -3122,22 +3130,23 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
/*
* Legacy filesystem - OS/2 & Linux JFS < 0.3.6
*
- * pn = index = 0: First entry "."
- * pn = 0; index = 1: Second entry ".."
+ * pn = 0; index = 1: First entry "."
+ * pn = 0; index = 2: Second entry ".."
* pn > 0: Real entries, pn=1 -> leftmost page
* pn = index = -1: No more entries
*/
dtpos = ctx->pos;
- if (dtpos == 0) {
+ if (dtpos < 2) {
/* build "." entry */
+ ctx->pos = 1;
if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
return 0;
- dtoffset->index = 1;
+ dtoffset->index = 2;
ctx->pos = dtpos;
}
if (dtoffset->pn == 0) {
- if (dtoffset->index == 1) {
+ if (dtoffset->index == 2) {
/* build ".." entry */
if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
return 0;
@@ -3228,6 +3237,12 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
}
jfs_dirent->position = unique_pos++;
}
+ /*
+ * We add 1 to the index because we may
+ * use a value of 2 internally, and NFSv4
+ * doesn't like that.
+ */
+ jfs_dirent->position++;
} else {
jfs_dirent->position = dtpos;
len = min(d_namleft, DTLHDRDATALEN_LEGACY);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 01bfe7662751..41e491b8e5d7 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -64,12 +64,17 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
nlm_init->protocol, nlm_version,
nlm_init->hostname, nlm_init->noresvport,
nlm_init->net);
- if (host == NULL) {
- lockd_down(nlm_init->net);
- return ERR_PTR(-ENOLCK);
- }
+ if (host == NULL)
+ goto out_nohost;
+ if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL)
+ goto out_nobind;
return host;
+out_nobind:
+ nlmclnt_release_host(host);
+out_nohost:
+ lockd_down(nlm_init->net);
+ return ERR_PTR(-ENOLCK);
}
EXPORT_SYMBOL_GPL(nlmclnt_init);
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 9760ecb9b60f..acd394716349 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -125,14 +125,15 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
{
struct nlm_args *argp = &req->a_args;
struct nlm_lock *lock = &argp->lock;
+ char *nodename = req->a_host->h_rpcclnt->cl_nodename;
nlmclnt_next_cookie(&argp->cookie);
memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));
- lock->caller = utsname()->nodename;
+ lock->caller = nodename;
lock->oh.data = req->a_owner;
lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
(unsigned int)fl->fl_u.nfs_fl.owner->pid,
- utsname()->nodename);
+ nodename);
lock->svid = fl->fl_u.nfs_fl.owner->pid;
lock->fl.fl_start = fl->fl_start;
lock->fl.fl_end = fl->fl_end;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 067778b0ccc9..e066a3902973 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -951,6 +951,7 @@ nlmsvc_retry_blocked(void)
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
struct nlm_block *block;
+ spin_lock(&nlm_blocked_lock);
while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
@@ -960,6 +961,7 @@ nlmsvc_retry_blocked(void)
timeout = block->b_when - jiffies;
break;
}
+ spin_unlock(&nlm_blocked_lock);
dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
block, block->b_when);
@@ -969,7 +971,9 @@ nlmsvc_retry_blocked(void)
retry_deferred_block(block);
} else
nlmsvc_grant_blocked(block);
+ spin_lock(&nlm_blocked_lock);
}
+ spin_unlock(&nlm_blocked_lock);
return timeout;
}
diff --git a/fs/namei.c b/fs/namei.c
index 8b61d103a8a7..56e4f4d537d0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -508,56 +508,75 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
{
struct fs_struct *fs = current->fs;
struct dentry *parent = nd->path.dentry;
- int want_root = 0;
BUG_ON(!(nd->flags & LOOKUP_RCU));
- if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
- want_root = 1;
- spin_lock(&fs->lock);
- if (nd->root.mnt != fs->root.mnt ||
- nd->root.dentry != fs->root.dentry)
- goto err_root;
- }
- spin_lock(&parent->d_lock);
+
+ /*
+ * Get a reference to the parent first: we're
+ * going to make "path_put(nd->path)" valid in
+ * non-RCU context for "terminate_walk()".
+ *
+ * If this doesn't work, return immediately with
+ * RCU walking still active (and then we will do
+ * the RCU walk cleanup in terminate_walk()).
+ */
+ if (!lockref_get_not_dead(&parent->d_lockref))
+ return -ECHILD;
+
+ /*
+ * After the mntget(), we terminate_walk() will do
+ * the right thing for non-RCU mode, and all our
+ * subsequent exit cases should unlock_rcu_walk()
+ * before returning.
+ */
+ mntget(nd->path.mnt);
+ nd->flags &= ~LOOKUP_RCU;
+
+ /*
+ * For a negative lookup, the lookup sequence point is the parents
+ * sequence point, and it only needs to revalidate the parent dentry.
+ *
+ * For a positive lookup, we need to move both the parent and the
+ * dentry from the RCU domain to be properly refcounted. And the
+ * sequence number in the dentry validates *both* dentry counters,
+ * since we checked the sequence number of the parent after we got
+ * the child sequence number. So we know the parent must still
+ * be valid if the child sequence number is still valid.
+ */
if (!dentry) {
- if (!__d_rcu_to_refcount(parent, nd->seq))
- goto err_parent;
+ if (read_seqcount_retry(&parent->d_seq, nd->seq))
+ goto out;
BUG_ON(nd->inode != parent->d_inode);
} else {
- if (dentry->d_parent != parent)
- goto err_parent;
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!__d_rcu_to_refcount(dentry, nd->seq))
- goto err_child;
- /*
- * If the sequence check on the child dentry passed, then
- * the child has not been removed from its parent. This
- * means the parent dentry must be valid and able to take
- * a reference at this point.
- */
- BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
- BUG_ON(!parent->d_count);
- parent->d_count++;
- spin_unlock(&dentry->d_lock);
+ if (!lockref_get_not_dead(&dentry->d_lockref))
+ goto out;
+ if (read_seqcount_retry(&dentry->d_seq, nd->seq))
+ goto drop_dentry;
}
- spin_unlock(&parent->d_lock);
- if (want_root) {
+
+ /*
+ * Sequence counts matched. Now make sure that the root is
+ * still valid and get it if required.
+ */
+ if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+ spin_lock(&fs->lock);
+ if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
+ goto unlock_and_drop_dentry;
path_get(&nd->root);
spin_unlock(&fs->lock);
}
- mntget(nd->path.mnt);
unlock_rcu_walk();
- nd->flags &= ~LOOKUP_RCU;
return 0;
-err_child:
- spin_unlock(&dentry->d_lock);
-err_parent:
- spin_unlock(&parent->d_lock);
-err_root:
- if (want_root)
- spin_unlock(&fs->lock);
+unlock_and_drop_dentry:
+ spin_unlock(&fs->lock);
+drop_dentry:
+ unlock_rcu_walk();
+ dput(dentry);
+ return -ECHILD;
+out:
+ unlock_rcu_walk();
return -ECHILD;
}
@@ -585,14 +604,16 @@ static int complete_walk(struct nameidata *nd)
nd->flags &= ~LOOKUP_RCU;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
- spin_lock(&dentry->d_lock);
- if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
- spin_unlock(&dentry->d_lock);
+
+ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
unlock_rcu_walk();
return -ECHILD;
}
- BUG_ON(nd->inode != dentry->d_inode);
- spin_unlock(&dentry->d_lock);
+ if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
+ unlock_rcu_walk();
+ dput(dentry);
+ return -ECHILD;
+ }
mntget(nd->path.mnt);
unlock_rcu_walk();
}
@@ -2184,6 +2205,188 @@ user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
return s;
}
+/**
+ * umount_lookup_last - look up last component for umount
+ * @nd: pathwalk nameidata - currently pointing at parent directory of "last"
+ * @path: pointer to container for result
+ *
+ * This is a special lookup_last function just for umount. In this case, we
+ * need to resolve the path without doing any revalidation.
+ *
+ * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
+ * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
+ * in almost all cases, this lookup will be served out of the dcache. The only
+ * cases where it won't are if nd->last refers to a symlink or the path is
+ * bogus and it doesn't exist.
+ *
+ * Returns:
+ * -error: if there was an error during lookup. This includes -ENOENT if the
+ * lookup found a negative dentry. The nd->path reference will also be
+ * put in this case.
+ *
+ * 0: if we successfully resolved nd->path and found it to not to be a
+ * symlink that needs to be followed. "path" will also be populated.
+ * The nd->path reference will also be put.
+ *
+ * 1: if we successfully resolved nd->last and found it to be a symlink
+ * that needs to be followed. "path" will be populated with the path
+ * to the link, and nd->path will *not* be put.
+ */
+static int
+umount_lookup_last(struct nameidata *nd, struct path *path)
+{
+ int error = 0;
+ struct dentry *dentry;
+ struct dentry *dir = nd->path.dentry;
+
+ if (unlikely(nd->flags & LOOKUP_RCU)) {
+ WARN_ON_ONCE(1);
+ error = -ECHILD;
+ goto error_check;
+ }
+
+ nd->flags &= ~LOOKUP_PARENT;
+
+ if (unlikely(nd->last_type != LAST_NORM)) {
+ error = handle_dots(nd, nd->last_type);
+ if (!error)
+ dentry = dget(nd->path.dentry);
+ goto error_check;
+ }
+
+ mutex_lock(&dir->d_inode->i_mutex);
+ dentry = d_lookup(dir, &nd->last);
+ if (!dentry) {
+ /*
+ * No cached dentry. Mounted dentries are pinned in the cache,
+ * so that means that this dentry is probably a symlink or the
+ * path doesn't actually point to a mounted dentry.
+ */
+ dentry = d_alloc(dir, &nd->last);
+ if (!dentry) {
+ error = -ENOMEM;
+ } else {
+ dentry = lookup_real(dir->d_inode, dentry, nd->flags);
+ if (IS_ERR(dentry))
+ error = PTR_ERR(dentry);
+ }
+ }
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+error_check:
+ if (!error) {
+ if (!dentry->d_inode) {
+ error = -ENOENT;
+ dput(dentry);
+ } else {
+ path->dentry = dentry;
+ path->mnt = mntget(nd->path.mnt);
+ if (should_follow_link(dentry->d_inode,
+ nd->flags & LOOKUP_FOLLOW))
+ return 1;
+ follow_mount(path);
+ }
+ }
+ terminate_walk(nd);
+ return error;
+}
+
+/**
+ * path_umountat - look up a path to be umounted
+ * @dfd: directory file descriptor to start walk from
+ * @name: full pathname to walk
+ * @flags: lookup flags
+ * @nd: pathwalk nameidata
+ *
+ * Look up the given name, but don't attempt to revalidate the last component.
+ * Returns 0 and "path" will be valid on success; Retuns error otherwise.
+ */
+static int
+path_umountat(int dfd, const char *name, struct path *path, unsigned int flags)
+{
+ struct file *base = NULL;
+ struct nameidata nd;
+ int err;
+
+ err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base);
+ if (unlikely(err))
+ return err;
+
+ current->total_link_count = 0;
+ err = link_path_walk(name, &nd);
+ if (err)
+ goto out;
+
+ /* If we're in rcuwalk, drop out of it to handle last component */
+ if (nd.flags & LOOKUP_RCU) {
+ err = unlazy_walk(&nd, NULL);
+ if (err) {
+ terminate_walk(&nd);
+ goto out;
+ }
+ }
+
+ err = umount_lookup_last(&nd, path);
+ while (err > 0) {
+ void *cookie;
+ struct path link = *path;
+ err = may_follow_link(&link, &nd);
+ if (unlikely(err))
+ break;
+ nd.flags |= LOOKUP_PARENT;
+ err = follow_link(&link, &nd, &cookie);
+ if (err)
+ break;
+ err = umount_lookup_last(&nd, path);
+ put_link(&nd, &link, cookie);
+ }
+out:
+ if (base)
+ fput(base);
+
+ if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
+ path_put(&nd.root);
+
+ return err;
+}
+
+/**
+ * user_path_umountat - lookup a path from userland in order to umount it
+ * @dfd: directory file descriptor
+ * @name: pathname from userland
+ * @flags: lookup flags
+ * @path: pointer to container to hold result
+ *
+ * A umount is a special case for path walking. We're not actually interested
+ * in the inode in this situation, and ESTALE errors can be a problem. We
+ * simply want track down the dentry and vfsmount attached at the mountpoint
+ * and avoid revalidating the last component.
+ *
+ * Returns 0 and populates "path" on success.
+ */
+int
+user_path_umountat(int dfd, const char __user *name, unsigned int flags,
+ struct path *path)
+{
+ struct filename *s = getname(name);
+ int error;
+
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ error = path_umountat(dfd, s->name, path, flags | LOOKUP_RCU);
+ if (unlikely(error == -ECHILD))
+ error = path_umountat(dfd, s->name, path, flags);
+ if (unlikely(error == -ESTALE))
+ error = path_umountat(dfd, s->name, path, flags | LOOKUP_REVAL);
+
+ if (likely(!error))
+ audit_inode(s, path->dentry, 0);
+
+ putname(s);
+ return error;
+}
+
/*
* It's inline, so penalty for filesystems that don't use sticky bit is
* minimal.
@@ -3327,7 +3530,7 @@ void dentry_unhash(struct dentry *dentry)
{
shrink_dcache_parent(dentry);
spin_lock(&dentry->d_lock);
- if (dentry->d_count == 1)
+ if (dentry->d_lockref.count == 1)
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 7b1ca9ba0b0a..fc2b5226278d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -611,6 +611,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
{
struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
struct mountpoint *mp;
+ int ret;
list_for_each_entry(mp, chain, m_hash) {
if (mp->m_dentry == dentry) {
@@ -626,14 +627,12 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
if (!mp)
return ERR_PTR(-ENOMEM);
- spin_lock(&dentry->d_lock);
- if (d_unlinked(dentry)) {
- spin_unlock(&dentry->d_lock);
+ ret = d_set_mounted(dentry);
+ if (ret) {
kfree(mp);
- return ERR_PTR(-ENOENT);
+ return ERR_PTR(ret);
}
- dentry->d_flags |= DCACHE_MOUNTED;
- spin_unlock(&dentry->d_lock);
+
mp->m_dentry = dentry;
mp->m_count = 1;
list_add(&mp->m_hash, chain);
@@ -831,6 +830,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
+ /* Don't allow unprivileged users to reveal what is under a mount */
+ if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
+ mnt->mnt.mnt_flags |= MNT_LOCKED;
+
atomic_inc(&sb->s_active);
mnt->mnt.mnt_sb = sb;
mnt->mnt.mnt_root = dget(root);
@@ -1318,7 +1321,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
if (!(flags & UMOUNT_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
- retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
+ retval = user_path_umountat(AT_FDCWD, name, lookup_flags, &path);
if (retval)
goto out;
mnt = real_mount(path.mnt);
@@ -1327,6 +1330,8 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
goto dput_and_out;
if (!check_mnt(mnt))
goto dput_and_out;
+ if (mnt->mnt.mnt_flags & MNT_LOCKED)
+ goto dput_and_out;
retval = do_umount(mnt, flags);
dput_and_out:
@@ -1349,14 +1354,11 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
#endif
-static bool mnt_ns_loop(struct path *path)
+static bool is_mnt_ns_file(struct dentry *dentry)
{
- /* Could bind mounting the mount namespace inode cause a
- * mount namespace loop?
- */
- struct inode *inode = path->dentry->d_inode;
+ /* Is this a proxy for a mount namespace? */
+ struct inode *inode = dentry->d_inode;
struct proc_ns *ei;
- struct mnt_namespace *mnt_ns;
if (!proc_ns_inode(inode))
return false;
@@ -1365,7 +1367,19 @@ static bool mnt_ns_loop(struct path *path)
if (ei->ns_ops != &mntns_operations)
return false;
- mnt_ns = ei->ns;
+ return true;
+}
+
+static bool mnt_ns_loop(struct dentry *dentry)
+{
+ /* Could bind mounting the mount namespace inode cause a
+ * mount namespace loop?
+ */
+ struct mnt_namespace *mnt_ns;
+ if (!is_mnt_ns_file(dentry))
+ return false;
+
+ mnt_ns = get_proc_ns(dentry->d_inode)->ns;
return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}
@@ -1374,13 +1388,17 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
{
struct mount *res, *p, *q, *r, *parent;
- if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
+ if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
+ return ERR_PTR(-EINVAL);
+
+ if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
return ERR_PTR(-EINVAL);
res = q = clone_mnt(mnt, dentry, flag);
if (IS_ERR(q))
return q;
+ q->mnt.mnt_flags &= ~MNT_LOCKED;
q->mnt_mountpoint = mnt->mnt_mountpoint;
p = mnt;
@@ -1390,7 +1408,13 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
continue;
for (s = r; s; s = next_mnt(s, r)) {
- if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) {
+ if (!(flag & CL_COPY_UNBINDABLE) &&
+ IS_MNT_UNBINDABLE(s)) {
+ s = skip_mnt_tree(s);
+ continue;
+ }
+ if (!(flag & CL_COPY_MNT_NS_FILE) &&
+ is_mnt_ns_file(s->mnt.mnt_root)) {
s = skip_mnt_tree(s);
continue;
}
@@ -1429,7 +1453,7 @@ struct vfsmount *collect_mounts(struct path *path)
CL_COPY_ALL | CL_PRIVATE);
namespace_unlock();
if (IS_ERR(tree))
- return NULL;
+ return ERR_CAST(tree);
return &tree->mnt;
}
@@ -1696,6 +1720,19 @@ static int do_change_type(struct path *path, int flag)
return err;
}
+static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+ struct mount *child;
+ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+ if (!is_subdir(child->mnt_mountpoint, dentry))
+ continue;
+
+ if (child->mnt.mnt_flags & MNT_LOCKED)
+ return true;
+ }
+ return false;
+}
+
/*
* do loopback mount.
*/
@@ -1713,7 +1750,7 @@ static int do_loopback(struct path *path, const char *old_name,
return err;
err = -EINVAL;
- if (mnt_ns_loop(&old_path))
+ if (mnt_ns_loop(old_path.dentry))
goto out;
mp = lock_mount(path);
@@ -1731,8 +1768,11 @@ static int do_loopback(struct path *path, const char *old_name,
if (!check_mnt(parent) || !check_mnt(old))
goto out2;
+ if (!recurse && has_locked_children(old, old_path.dentry))
+ goto out2;
+
if (recurse)
- mnt = copy_tree(old, old_path.dentry, 0);
+ mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
else
mnt = clone_mnt(old, old_path.dentry, 0);
@@ -1741,6 +1781,8 @@ static int do_loopback(struct path *path, const char *old_name,
goto out2;
}
+ mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+
err = graft_tree(mnt, parent, mp);
if (err) {
br_write_lock(&vfsmount_lock);
@@ -1853,6 +1895,9 @@ static int do_move_mount(struct path *path, const char *old_name)
if (!check_mnt(p) || !check_mnt(old))
goto out1;
+ if (old->mnt.mnt_flags & MNT_LOCKED)
+ goto out1;
+
err = -EINVAL;
if (old_path.dentry != old_path.mnt->mnt_root)
goto out1;
@@ -2389,7 +2434,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
namespace_lock();
/* First pass: copy the tree topology */
- copy_flags = CL_COPY_ALL | CL_EXPIRE;
+ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != mnt_ns->user_ns)
copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
@@ -2424,6 +2469,10 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
}
p = next_mnt(p, old);
q = next_mnt(q, new);
+ if (!q)
+ break;
+ while (p->mnt.mnt_root != q->mnt.mnt_root)
+ p = next_mnt(p, old);
}
namespace_unlock();
@@ -2630,6 +2679,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
goto out4;
if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
goto out4;
+ if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
+ goto out4;
error = -ENOENT;
if (d_unlinked(new.dentry))
goto out4;
@@ -2653,6 +2704,10 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
br_write_lock(&vfsmount_lock);
detach_mnt(new_mnt, &parent_path);
detach_mnt(root_mnt, &root_parent);
+ if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
+ new_mnt->mnt.mnt_flags |= MNT_LOCKED;
+ root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+ }
/* mount old root on put_old */
attach_mnt(root_mnt, old_mnt, old_mp);
/* mount new_root on / */
@@ -2811,25 +2866,38 @@ bool current_chrooted(void)
return chrooted;
}
-void update_mnt_policy(struct user_namespace *userns)
+bool fs_fully_visible(struct file_system_type *type)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct mount *mnt;
+ bool visible = false;
- down_read(&namespace_sem);
+ if (unlikely(!ns))
+ return false;
+
+ namespace_lock();
list_for_each_entry(mnt, &ns->list, mnt_list) {
- switch (mnt->mnt.mnt_sb->s_magic) {
- case SYSFS_MAGIC:
- userns->may_mount_sysfs = true;
- break;
- case PROC_SUPER_MAGIC:
- userns->may_mount_proc = true;
- break;
+ struct mount *child;
+ if (mnt->mnt.mnt_sb->s_type != type)
+ continue;
+
+ /* This mount is not fully visible if there are any child mounts
+ * that cover anything except for empty directories.
+ */
+ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+ struct inode *inode = child->mnt_mountpoint->d_inode;
+ if (!S_ISDIR(inode->i_mode))
+ goto next;
+ if (inode->i_nlink != 2)
+ goto next;
}
- if (userns->may_mount_sysfs && userns->may_mount_proc)
- break;
+ visible = true;
+ goto found;
+ next: ;
}
- up_read(&namespace_sem);
+found:
+ namespace_unlock();
+ return visible;
}
static void *mntns_get(struct task_struct *task)
@@ -2860,8 +2928,8 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns)
struct path root;
if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
- !nsown_capable(CAP_SYS_CHROOT) ||
- !nsown_capable(CAP_SYS_ADMIN))
+ !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
if (fs->users != 1)
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index e0bb048e9576..03192a66c143 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,9 +4,10 @@
obj-$(CONFIG_NFS_FS) += nfs.o
+CFLAGS_nfstrace.o += -I$(src)
nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
direct.o pagelist.o read.o symlink.o unlink.o \
- write.o namespace.o mount_clnt.o
+ write.o namespace.o mount_clnt.o nfstrace.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
@@ -19,12 +20,14 @@ nfsv3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o
nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
obj-$(CONFIG_NFS_V4) += nfsv4.o
+CFLAGS_nfs4trace.o += -I$(src)
nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
- nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o
+ nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \
+ dns_resolve.o nfs4trace.o
nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
-nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o
+nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e6ebc4c38c81..ae2e87b95453 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -15,6 +15,7 @@
#include "internal.h"
#include "pnfs.h"
#include "nfs4session.h"
+#include "nfs4trace.h"
#ifdef NFS_DEBUG
#define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -93,6 +94,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
default:
res = htonl(NFS4ERR_RESOURCE);
}
+ trace_nfs4_recall_delegation(inode, -ntohl(res));
iput(inode);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
@@ -301,14 +303,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
{
struct nfs4_slot *slot;
- dprintk("%s enter. slotid %d seqid %d\n",
+ dprintk("%s enter. slotid %u seqid %u\n",
__func__, args->csa_slotid, args->csa_sequenceid);
if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)
return htonl(NFS4ERR_BADSLOT);
slot = tbl->slots + args->csa_slotid;
- dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr);
+ dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
/* Normal */
if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
@@ -318,7 +320,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
/* Replay */
if (args->csa_sequenceid == slot->seq_nr) {
- dprintk("%s seqid %d is a replay\n",
+ dprintk("%s seqid %u is a replay\n",
__func__, args->csa_sequenceid);
/* Signal process_op to set this error on next op */
if (args->csa_cachethis == 0)
@@ -462,6 +464,7 @@ out:
} else
res->csr_status = status;
+ trace_nfs4_cb_sequence(args, res, status);
dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
ntohl(status), ntohl(res->csr_status));
return status;
@@ -518,7 +521,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
if (!cps->clp) /* set in cb_sequence */
goto out;
- dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %d\n",
+ dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %u\n",
rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
args->crsa_target_highest_slotid);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 340b1eff0267..2dceee4db076 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -501,8 +501,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
&nn->nfs_client_list);
spin_unlock(&nn->nfs_client_lock);
new->cl_flags = cl_init->init_flags;
- return rpc_ops->init_client(new, timeparms, ip_addr,
- authflavour);
+ return rpc_ops->init_client(new, timeparms, ip_addr);
}
spin_unlock(&nn->nfs_client_lock);
@@ -694,13 +693,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
* @clp: nfs_client to initialise
* @timeparms: timeout parameters for underlying RPC transport
* @ip_addr: IP presentation address (not used)
- * @authflavor: authentication flavor for underlying RPC transport
*
* Returns pointer to an NFS client, or an ERR_PTR value.
*/
struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
- const char *ip_addr, rpc_authflavor_t authflavour)
+ const char *ip_addr)
{
int error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7ec4814e298d..ef792f29f831 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -20,6 +20,7 @@
#include "nfs4_fs.h"
#include "delegation.h"
#include "internal.h"
+#include "nfs4trace.h"
static void nfs_free_delegation(struct nfs_delegation *delegation)
{
@@ -160,6 +161,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
spin_unlock(&delegation->lock);
put_rpccred(oldcred);
rcu_read_unlock();
+ trace_nfs4_reclaim_delegation(inode, res->delegation_type);
} else {
/* We appear to have raced with a delegation return. */
spin_unlock(&delegation->lock);
@@ -344,6 +346,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
spin_lock(&inode->i_lock);
nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
spin_unlock(&inode->i_lock);
+ trace_nfs4_set_delegation(inode, res->delegation_type);
out:
spin_unlock(&clp->cl_lock);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e474ca2b2bfe..e79bc6ce828e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -43,6 +43,8 @@
#include "internal.h"
#include "fscache.h"
+#include "nfstrace.h"
+
/* #define NFS_DEBUG_VERBOSE 1 */
static int nfs_opendir(struct inode *, struct file *);
@@ -1100,7 +1102,9 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (IS_ERR(label))
goto out_error;
+ trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+ trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
if (error)
goto out_bad;
if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1135,14 +1139,13 @@ out_zap_parent:
if (inode && S_ISDIR(inode->i_mode)) {
/* Purge readdir caches. */
nfs_zap_caches(inode);
- /* If we have submounts, don't unhash ! */
- if (have_submounts(dentry))
- goto out_valid;
if (dentry->d_flags & DCACHE_DISCONNECTED)
goto out_valid;
- shrink_dcache_parent(dentry);
}
- d_drop(dentry);
+ /* If we have submounts, don't unhash ! */
+ if (check_submounts_and_drop(dentry) != 0)
+ goto out_valid;
+
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
__func__, dentry->d_parent->d_name.name,
@@ -1313,6 +1316,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
parent = dentry->d_parent;
/* Protect against concurrent sillydeletes */
+ trace_nfs_lookup_enter(dir, dentry, flags);
nfs_block_sillyrename(parent);
error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
if (error == -ENOENT)
@@ -1339,6 +1343,7 @@ no_entry:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_unblock_sillyrename:
nfs_unblock_sillyrename(parent);
+ trace_nfs_lookup_exit(dir, dentry, flags, error);
nfs4_label_free(label);
out:
nfs_free_fattr(fattr);
@@ -1393,7 +1398,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
nfs_file_set_open_context(file, ctx);
out:
- put_nfs_open_context(ctx);
return err;
}
@@ -1405,6 +1409,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
struct dentry *res;
struct iattr attr = { .ia_valid = ATTR_OPEN };
struct inode *inode;
+ unsigned int lookup_flags = 0;
int err;
/* Expect a negative dentry */
@@ -1413,6 +1418,10 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n",
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+ err = nfs_check_flags(open_flags);
+ if (err)
+ return err;
+
/* NFS only supports OPEN on regular files */
if ((open_flags & O_DIRECTORY)) {
if (!d_unhashed(dentry)) {
@@ -1423,6 +1432,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
*/
return -ENOENT;
}
+ lookup_flags = LOOKUP_OPEN|LOOKUP_DIRECTORY;
goto no_open;
}
@@ -1443,12 +1453,14 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
if (IS_ERR(ctx))
goto out;
+ trace_nfs_atomic_open_enter(dir, ctx, open_flags);
nfs_block_sillyrename(dentry->d_parent);
inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
nfs_unblock_sillyrename(dentry->d_parent);
if (IS_ERR(inode)) {
- put_nfs_open_context(ctx);
err = PTR_ERR(inode);
+ trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
+ put_nfs_open_context(ctx);
switch (err) {
case -ENOENT:
d_drop(dentry);
@@ -1469,11 +1481,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
}
err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
+ trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
+ put_nfs_open_context(ctx);
out:
return err;
no_open:
- res = nfs_lookup(dir, dentry, 0);
+ res = nfs_lookup(dir, dentry, lookup_flags);
err = PTR_ERR(res);
if (IS_ERR(res))
goto out;
@@ -1597,7 +1611,9 @@ int nfs_create(struct inode *dir, struct dentry *dentry,
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
+ trace_nfs_create_enter(dir, dentry, open_flags);
error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
+ trace_nfs_create_exit(dir, dentry, open_flags, error);
if (error != 0)
goto out_err;
return 0;
@@ -1625,7 +1641,9 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
+ trace_nfs_mknod_enter(dir, dentry);
status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
+ trace_nfs_mknod_exit(dir, dentry, status);
if (status != 0)
goto out_err;
return 0;
@@ -1649,7 +1667,9 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
attr.ia_valid = ATTR_MODE;
attr.ia_mode = mode | S_IFDIR;
+ trace_nfs_mkdir_enter(dir, dentry);
error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
+ trace_nfs_mkdir_exit(dir, dentry, error);
if (error != 0)
goto out_err;
return 0;
@@ -1672,12 +1692,21 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
- error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
- /* Ensure the VFS deletes this inode */
- if (error == 0 && dentry->d_inode != NULL)
- clear_nlink(dentry->d_inode);
- else if (error == -ENOENT)
- nfs_dentry_handle_enoent(dentry);
+ trace_nfs_rmdir_enter(dir, dentry);
+ if (dentry->d_inode) {
+ nfs_wait_on_sillyrename(dentry);
+ error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
+ /* Ensure the VFS deletes this inode */
+ switch (error) {
+ case 0:
+ clear_nlink(dentry->d_inode);
+ break;
+ case -ENOENT:
+ nfs_dentry_handle_enoent(dentry);
+ }
+ } else
+ error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
+ trace_nfs_rmdir_exit(dir, dentry, error);
return error;
}
@@ -1705,6 +1734,7 @@ static int nfs_safe_remove(struct dentry *dentry)
goto out;
}
+ trace_nfs_remove_enter(dir, dentry);
if (inode != NULL) {
NFS_PROTO(inode)->return_delegation(inode);
error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
@@ -1714,6 +1744,7 @@ static int nfs_safe_remove(struct dentry *dentry)
error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
if (error == -ENOENT)
nfs_dentry_handle_enoent(dentry);
+ trace_nfs_remove_exit(dir, dentry, error);
out:
return error;
}
@@ -1731,13 +1762,14 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
dir->i_ino, dentry->d_name.name);
+ trace_nfs_unlink_enter(dir, dentry);
spin_lock(&dentry->d_lock);
if (d_count(dentry) > 1) {
spin_unlock(&dentry->d_lock);
/* Start asynchronous writeout of the inode */
write_inode_now(dentry->d_inode, 0);
error = nfs_sillyrename(dir, dentry);
- return error;
+ goto out;
}
if (!d_unhashed(dentry)) {
__d_drop(dentry);
@@ -1749,6 +1781,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
} else if (need_rehash)
d_rehash(dentry);
+out:
+ trace_nfs_unlink_exit(dir, dentry, error);
return error;
}
EXPORT_SYMBOL_GPL(nfs_unlink);
@@ -1795,7 +1829,9 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
kunmap_atomic(kaddr);
+ trace_nfs_symlink_enter(dir, dentry);
error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
+ trace_nfs_symlink_exit(dir, dentry, error);
if (error != 0) {
dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
dir->i_sb->s_id, dir->i_ino,
@@ -1830,6 +1866,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
dentry->d_parent->d_name.name, dentry->d_name.name);
+ trace_nfs_link_enter(inode, dir, dentry);
NFS_PROTO(inode)->return_delegation(inode);
d_drop(dentry);
@@ -1838,6 +1875,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
ihold(inode);
d_add(dentry, inode);
}
+ trace_nfs_link_exit(inode, dir, dentry, error);
return error;
}
EXPORT_SYMBOL_GPL(nfs_link);
@@ -1879,6 +1917,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
d_count(new_dentry));
+ trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry);
/*
* For non-directories, check whether the target is busy and if so,
* make a copy of the dentry and then do a silly-rename. If the
@@ -1925,6 +1964,8 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
out:
if (rehash)
d_rehash(rehash);
+ trace_nfs_rename_exit(old_dir, old_dentry,
+ new_dir, new_dentry, error);
if (!error) {
if (new_inode != NULL)
nfs_drop_nlink(new_inode);
@@ -2174,9 +2215,11 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
struct nfs_access_entry cache;
int status;
+ trace_nfs_access_enter(inode);
+
status = nfs_access_get_cached(inode, cred, &cache);
if (status == 0)
- goto out;
+ goto out_cached;
/* Be clever: ask server to check for all possible rights */
cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
@@ -2189,13 +2232,15 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
if (!S_ISDIR(inode->i_mode))
set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
}
- return status;
+ goto out;
}
nfs_access_add_cache(inode, &cache);
+out_cached:
+ if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0)
+ status = -EACCES;
out:
- if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
- return 0;
- return -EACCES;
+ trace_nfs_access_exit(inode, status);
+ return status;
}
static int nfs_open_permission_mask(int openflags)
@@ -2241,11 +2286,6 @@ int nfs_permission(struct inode *inode, int mask)
case S_IFLNK:
goto out;
case S_IFREG:
- /* NFSv4 has atomic_open... */
- if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)
- && (mask & MAY_OPEN)
- && !(mask & MAY_EXEC))
- goto out;
break;
case S_IFDIR:
/*
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 94e94bd11aae..1e6bfdbc1aff 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -37,6 +37,8 @@
#include "iostat.h"
#include "fscache.h"
+#include "nfstrace.h"
+
#define NFSDBG_FACILITY NFSDBG_FILE
static const struct vm_operations_struct nfs_file_vm_ops;
@@ -294,6 +296,8 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
int ret;
struct inode *inode = file_inode(file);
+ trace_nfs_fsync_enter(inode);
+
do {
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret != 0)
@@ -310,6 +314,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
end = LLONG_MAX;
} while (ret == -EAGAIN);
+ trace_nfs_fsync_exit(inode, ret);
return ret;
}
@@ -406,6 +411,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ struct nfs_open_context *ctx = nfs_file_open_context(file);
int status;
dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
@@ -441,6 +447,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
if (status < 0)
return status;
NFS_I(mapping->host)->write_io += copied;
+
+ if (nfs_ctx_key_to_expire(ctx)) {
+ status = nfs_wb_all(mapping->host);
+ if (status < 0)
+ return status;
+ }
+
return copied;
}
@@ -637,7 +650,8 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
return 1;
ctx = nfs_file_open_context(filp);
- if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
+ if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
+ nfs_ctx_key_to_expire(ctx))
return 1;
return 0;
}
@@ -651,6 +665,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
ssize_t result;
size_t count = iov_length(iov, nr_segs);
+ result = nfs_key_timeout_notify(iocb->ki_filp, inode);
+ if (result)
+ return result;
+
if (iocb->ki_filp->f_flags & O_DIRECT)
return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index c2c4163d5683..567983d2c0eb 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -49,6 +49,7 @@
#include "internal.h"
#include "netns.h"
+#include "nfs4trace.h"
#define NFS_UINT_MAXLEN 11
@@ -63,6 +64,7 @@ struct idmap_legacy_upcalldata {
};
struct idmap {
+ struct rpc_pipe_dir_object idmap_pdo;
struct rpc_pipe *idmap_pipe;
struct idmap_legacy_upcalldata *idmap_upcall_data;
struct mutex idmap_mutex;
@@ -310,7 +312,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
if (ret < 0)
goto out_up;
- payload = rcu_dereference(rkey->payload.data);
+ payload = rcu_dereference(rkey->payload.rcudata);
if (IS_ERR_OR_NULL(payload)) {
ret = PTR_ERR(payload);
goto out_up;
@@ -401,16 +403,23 @@ static struct key_type key_type_id_resolver_legacy = {
.request_key = nfs_idmap_legacy_upcall,
};
-static void __nfs_idmap_unregister(struct rpc_pipe *pipe)
+static void nfs_idmap_pipe_destroy(struct dentry *dir,
+ struct rpc_pipe_dir_object *pdo)
{
- if (pipe->dentry)
+ struct idmap *idmap = pdo->pdo_data;
+ struct rpc_pipe *pipe = idmap->idmap_pipe;
+
+ if (pipe->dentry) {
rpc_unlink(pipe->dentry);
+ pipe->dentry = NULL;
+ }
}
-static int __nfs_idmap_register(struct dentry *dir,
- struct idmap *idmap,
- struct rpc_pipe *pipe)
+static int nfs_idmap_pipe_create(struct dentry *dir,
+ struct rpc_pipe_dir_object *pdo)
{
+ struct idmap *idmap = pdo->pdo_data;
+ struct rpc_pipe *pipe = idmap->idmap_pipe;
struct dentry *dentry;
dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe);
@@ -420,36 +429,10 @@ static int __nfs_idmap_register(struct dentry *dir,
return 0;
}
-static void nfs_idmap_unregister(struct nfs_client *clp,
- struct rpc_pipe *pipe)
-{
- struct net *net = clp->cl_net;
- struct super_block *pipefs_sb;
-
- pipefs_sb = rpc_get_sb_net(net);
- if (pipefs_sb) {
- __nfs_idmap_unregister(pipe);
- rpc_put_sb_net(net);
- }
-}
-
-static int nfs_idmap_register(struct nfs_client *clp,
- struct idmap *idmap,
- struct rpc_pipe *pipe)
-{
- struct net *net = clp->cl_net;
- struct super_block *pipefs_sb;
- int err = 0;
-
- pipefs_sb = rpc_get_sb_net(net);
- if (pipefs_sb) {
- if (clp->cl_rpcclient->cl_dentry)
- err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
- idmap, pipe);
- rpc_put_sb_net(net);
- }
- return err;
-}
+static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = {
+ .create = nfs_idmap_pipe_create,
+ .destroy = nfs_idmap_pipe_destroy,
+};
int
nfs_idmap_new(struct nfs_client *clp)
@@ -462,23 +445,31 @@ nfs_idmap_new(struct nfs_client *clp)
if (idmap == NULL)
return -ENOMEM;
+ rpc_init_pipe_dir_object(&idmap->idmap_pdo,
+ &nfs_idmap_pipe_dir_object_ops,
+ idmap);
+
pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0);
if (IS_ERR(pipe)) {
error = PTR_ERR(pipe);
- kfree(idmap);
- return error;
- }
- error = nfs_idmap_register(clp, idmap, pipe);
- if (error) {
- rpc_destroy_pipe_data(pipe);
- kfree(idmap);
- return error;
+ goto err;
}
idmap->idmap_pipe = pipe;
mutex_init(&idmap->idmap_mutex);
+ error = rpc_add_pipe_dir_object(clp->cl_net,
+ &clp->cl_rpcclient->cl_pipedir_objects,
+ &idmap->idmap_pdo);
+ if (error)
+ goto err_destroy_pipe;
+
clp->cl_idmap = idmap;
return 0;
+err_destroy_pipe:
+ rpc_destroy_pipe_data(idmap->idmap_pipe);
+err:
+ kfree(idmap);
+ return error;
}
void
@@ -488,130 +479,26 @@ nfs_idmap_delete(struct nfs_client *clp)
if (!idmap)
return;
- nfs_idmap_unregister(clp, idmap->idmap_pipe);
- rpc_destroy_pipe_data(idmap->idmap_pipe);
clp->cl_idmap = NULL;
+ rpc_remove_pipe_dir_object(clp->cl_net,
+ &clp->cl_rpcclient->cl_pipedir_objects,
+ &idmap->idmap_pdo);
+ rpc_destroy_pipe_data(idmap->idmap_pipe);
kfree(idmap);
}
-static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event,
- struct super_block *sb)
-{
- int err = 0;
-
- switch (event) {
- case RPC_PIPEFS_MOUNT:
- err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
- clp->cl_idmap,
- clp->cl_idmap->idmap_pipe);
- break;
- case RPC_PIPEFS_UMOUNT:
- if (clp->cl_idmap->idmap_pipe) {
- struct dentry *parent;
-
- parent = clp->cl_idmap->idmap_pipe->dentry->d_parent;
- __nfs_idmap_unregister(clp->cl_idmap->idmap_pipe);
- /*
- * Note: This is a dirty hack. SUNRPC hook has been
- * called already but simple_rmdir() call for the
- * directory returned with error because of idmap pipe
- * inside. Thus now we have to remove this directory
- * here.
- */
- if (rpc_rmdir(parent))
- printk(KERN_ERR "NFS: %s: failed to remove "
- "clnt dir!\n", __func__);
- }
- break;
- default:
- printk(KERN_ERR "NFS: %s: unknown event: %ld\n", __func__,
- event);
- return -ENOTSUPP;
- }
- return err;
-}
-
-static struct nfs_client *nfs_get_client_for_event(struct net *net, int event)
-{
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *cl_dentry;
- struct nfs_client *clp;
- int err;
-
-restart:
- spin_lock(&nn->nfs_client_lock);
- list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
- /* Wait for initialisation to finish */
- if (clp->cl_cons_state == NFS_CS_INITING) {
- atomic_inc(&clp->cl_count);
- spin_unlock(&nn->nfs_client_lock);
- err = nfs_wait_client_init_complete(clp);
- nfs_put_client(clp);
- if (err)
- return NULL;
- goto restart;
- }
- /* Skip nfs_clients that failed to initialise */
- if (clp->cl_cons_state < 0)
- continue;
- smp_rmb();
- if (clp->rpc_ops != &nfs_v4_clientops)
- continue;
- cl_dentry = clp->cl_idmap->idmap_pipe->dentry;
- if (((event == RPC_PIPEFS_MOUNT) && cl_dentry) ||
- ((event == RPC_PIPEFS_UMOUNT) && !cl_dentry))
- continue;
- atomic_inc(&clp->cl_count);
- spin_unlock(&nn->nfs_client_lock);
- return clp;
- }
- spin_unlock(&nn->nfs_client_lock);
- return NULL;
-}
-
-static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
- void *ptr)
-{
- struct super_block *sb = ptr;
- struct nfs_client *clp;
- int error = 0;
-
- if (!try_module_get(THIS_MODULE))
- return 0;
-
- while ((clp = nfs_get_client_for_event(sb->s_fs_info, event))) {
- error = __rpc_pipefs_event(clp, event, sb);
- nfs_put_client(clp);
- if (error)
- break;
- }
- module_put(THIS_MODULE);
- return error;
-}
-
-#define PIPEFS_NFS_PRIO 1
-
-static struct notifier_block nfs_idmap_block = {
- .notifier_call = rpc_pipefs_event,
- .priority = SUNRPC_PIPEFS_NFS_PRIO,
-};
-
int nfs_idmap_init(void)
{
int ret;
ret = nfs_idmap_init_keyring();
if (ret != 0)
goto out;
- ret = rpc_pipefs_notifier_register(&nfs_idmap_block);
- if (ret != 0)
- nfs_idmap_quit_keyring();
out:
return ret;
}
void nfs_idmap_quit(void)
{
- rpc_pipefs_notifier_unregister(&nfs_idmap_block);
nfs_idmap_quit_keyring();
}
@@ -849,6 +736,7 @@ int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_
if (!uid_valid(*uid))
ret = -ERANGE;
}
+ trace_nfs4_map_name_to_uid(name, namelen, id, ret);
return ret;
}
@@ -865,6 +753,7 @@ int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size
if (!gid_valid(*gid))
ret = -ERANGE;
}
+ trace_nfs4_map_group_to_gid(name, namelen, id, ret);
return ret;
}
@@ -879,6 +768,7 @@ int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf,
ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap);
if (ret < 0)
ret = nfs_map_numeric_to_string(id, buf, buflen);
+ trace_nfs4_map_uid_to_name(buf, ret, id, ret);
return ret;
}
int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen)
@@ -892,5 +782,6 @@ int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf,
ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap);
if (ret < 0)
ret = nfs_map_numeric_to_string(id, buf, buflen);
+ trace_nfs4_map_gid_to_group(buf, ret, id, ret);
return ret;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index af6e806044d7..87e797640828 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -38,7 +38,6 @@
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/freezer.h>
-#include <linux/crc32.h>
#include <asm/uaccess.h>
@@ -52,6 +51,8 @@
#include "nfs.h"
#include "netns.h"
+#include "nfstrace.h"
+
#define NFSDBG_FACILITY NFSDBG_VFS
#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1
@@ -463,7 +464,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
unlock_new_inode(inode);
} else
nfs_refresh_inode(inode, fattr);
- nfs_setsecurity(inode, fattr, label);
dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode),
@@ -504,6 +504,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
return 0;
+ trace_nfs_setattr_enter(inode);
+
/* Write all dirty data */
if (S_ISREG(inode->i_mode)) {
nfs_inode_dio_wait(inode);
@@ -523,6 +525,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
error = nfs_refresh_inode(inode, fattr);
nfs_free_fattr(fattr);
out:
+ trace_nfs_setattr_exit(inode, error);
return error;
}
EXPORT_SYMBOL_GPL(nfs_setattr);
@@ -592,6 +595,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
int err;
+ trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
if (S_ISREG(inode->i_mode)) {
nfs_inode_dio_wait(inode);
@@ -622,6 +626,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
}
out:
+ trace_nfs_getattr_exit(inode, err);
return err;
}
EXPORT_SYMBOL_GPL(nfs_getattr);
@@ -876,6 +881,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
inode->i_sb->s_id, (long long)NFS_FILEID(inode));
+ trace_nfs_revalidate_inode_enter(inode);
+
if (is_bad_inode(inode))
goto out;
if (NFS_STALE(inode))
@@ -926,6 +933,7 @@ err_out:
nfs4_label_free(label);
out:
nfs_free_fattr(fattr);
+ trace_nfs_revalidate_inode_exit(inode, status);
return status;
}
@@ -963,9 +971,15 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
-
+ int ret;
+
if (mapping->nrpages != 0) {
- int ret = invalidate_inode_pages2(mapping);
+ if (S_ISREG(inode->i_mode)) {
+ ret = nfs_sync_mapping(mapping);
+ if (ret < 0)
+ return ret;
+ }
+ ret = invalidate_inode_pages2(mapping);
if (ret < 0)
return ret;
}
@@ -976,6 +990,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
spin_unlock(&inode->i_lock);
nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
nfs_fscache_wait_on_invalidate(inode);
+
dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
inode->i_sb->s_id, (long long)NFS_FILEID(inode));
return 0;
@@ -1009,8 +1024,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
if (ret < 0)
goto out;
}
- if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
+ trace_nfs_invalidate_mapping_enter(inode);
ret = nfs_invalidate_mapping(inode, mapping);
+ trace_nfs_invalidate_mapping_exit(inode, ret);
+ }
+
out:
return ret;
}
@@ -1190,7 +1209,7 @@ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
{
/* wireshark uses 32-bit AUTODIN crc and does a bitwise
* not on the result */
- return ~crc32(0xFFFFFFFF, &fh->data[0], fh->size);
+ return nfs_fhandle_hash(fh);
}
/*
@@ -1269,9 +1288,17 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{
+ int ret;
+
+ trace_nfs_refresh_inode_enter(inode);
+
if (nfs_inode_attrs_need_update(inode, fattr))
- return nfs_update_inode(inode, fattr);
- return nfs_check_inode_attributes(inode, fattr);
+ ret = nfs_update_inode(inode, fattr);
+ else
+ ret = nfs_check_inode_attributes(inode, fattr);
+
+ trace_nfs_refresh_inode_exit(inode, ret);
+ return ret;
}
/**
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3c8373f90ab3..d388302c005f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,6 +5,7 @@
#include "nfs4_fs.h"
#include <linux/mount.h>
#include <linux/security.h>
+#include <linux/crc32.h>
#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
@@ -185,6 +186,8 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
int ds_addrlen, int ds_proto,
unsigned int ds_timeo,
unsigned int ds_retrans);
+extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
+ struct inode *);
#ifdef CONFIG_PROC_FS
extern int __init nfs_fs_proc_init(void);
extern void nfs_fs_proc_exit(void);
@@ -267,7 +270,7 @@ extern struct rpc_procinfo nfs4_procedures[];
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
- const char *ip_addr, rpc_authflavor_t authflavour);
+ const char *ip_addr);
/* dir.c */
extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -355,7 +358,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
const char *);
-extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
+extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool);
#endif
struct nfs_pgio_completion_ops;
@@ -430,6 +433,8 @@ void nfs_request_remove_commit_list(struct nfs_page *req,
void nfs_init_cinfo(struct nfs_commit_info *cinfo,
struct inode *inode,
struct nfs_direct_req *dreq);
+int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
@@ -451,8 +456,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
extern void __nfs4_read_done_cb(struct nfs_read_data *);
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
- const char *ip_addr,
- rpc_authflavor_t authflavour);
+ const char *ip_addr);
extern int nfs40_walk_client_list(struct nfs_client *clp,
struct nfs_client **result,
struct rpc_cred *cred);
@@ -575,3 +579,22 @@ u64 nfs_timespec_to_change_attr(const struct timespec *ts)
{
return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
}
+
+#ifdef CONFIG_CRC32
+/**
+ * nfs_fhandle_hash - calculate the crc32 hash for the filehandle
+ * @fh - pointer to filehandle
+ *
+ * returns a crc32 hash for the filehandle that is compatible with
+ * the one displayed by "wireshark".
+ */
+static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
+{
+ return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
+}
+#else
+static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
+{
+ return 0;
+}
+#endif
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f5c84c3efbca..90cb10d7b693 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -336,8 +336,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
if (flags & O_EXCL) {
data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE;
- data->arg.create.verifier[0] = jiffies;
- data->arg.create.verifier[1] = current->pid;
+ data->arg.create.verifier[0] = cpu_to_be32(jiffies);
+ data->arg.create.verifier[1] = cpu_to_be32(current->pid);
}
sattr->ia_mode &= ~current_umask();
@@ -826,9 +826,10 @@ static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message
msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
}
-static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
{
rpc_call_start(task);
+ return 0;
}
static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -847,9 +848,10 @@ static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
}
-static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
{
rpc_call_start(task);
+ return 0;
}
static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ee81e354bce7..f520a1113b38 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -38,17 +38,15 @@ struct nfs4_minor_version_ops {
u32 minor_version;
unsigned init_caps;
- int (*call_sync)(struct rpc_clnt *clnt,
- struct nfs_server *server,
- struct rpc_message *msg,
- struct nfs4_sequence_args *args,
- struct nfs4_sequence_res *res);
+ int (*init_client)(struct nfs_client *);
+ void (*shutdown_client)(struct nfs_client *);
bool (*match_stateid)(const nfs4_stateid *,
const nfs4_stateid *);
int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
struct nfs_fsinfo *);
int (*free_lock_state)(struct nfs_server *,
struct nfs4_lock_state *);
+ const struct rpc_call_ops *call_sync_ops;
const struct nfs4_state_recovery_ops *reboot_recovery_ops;
const struct nfs4_state_recovery_ops *nograce_recovery_ops;
const struct nfs4_state_maintenance_ops *state_renewal_ops;
@@ -135,6 +133,7 @@ struct nfs4_lock_state {
struct list_head ls_locks; /* Other lock stateids */
struct nfs4_state * ls_state; /* Pointer to open state */
#define NFS_LOCK_INITIALIZED 0
+#define NFS_LOCK_LOST 1
unsigned long ls_flags;
struct nfs_seqid_counter ls_seqid;
nfs4_stateid ls_stateid;
@@ -193,7 +192,6 @@ struct nfs4_state_recovery_ops {
int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
int (*recover_lock)(struct nfs4_state *, struct file_lock *);
int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
- struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
struct rpc_cred *);
@@ -223,7 +221,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
/* nfs4proc.c */
extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
-extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
+extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred);
extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
extern int nfs4_destroy_clientid(struct nfs_client *clp);
@@ -248,9 +246,6 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
return server->nfs_client->cl_session;
}
-extern int nfs4_setup_sequence(const struct nfs_server *server,
- struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
- struct rpc_task *task);
extern int nfs41_setup_sequence(struct nfs4_session *session,
struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
struct rpc_task *task);
@@ -273,18 +268,63 @@ is_ds_client(struct nfs_client *clp)
{
return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
}
-#else /* CONFIG_NFS_v4_1 */
-static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
+
+static inline bool
+_nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
+ struct rpc_clnt **clntp, struct rpc_message *msg)
{
- return NULL;
+ struct rpc_cred *newcred = NULL;
+ rpc_authflavor_t flavor;
+
+ if (test_bit(sp4_mode, &clp->cl_sp4_flags)) {
+ spin_lock(&clp->cl_lock);
+ if (clp->cl_machine_cred != NULL)
+ newcred = get_rpccred(clp->cl_machine_cred);
+ spin_unlock(&clp->cl_lock);
+ if (msg->rpc_cred)
+ put_rpccred(msg->rpc_cred);
+ msg->rpc_cred = newcred;
+
+ flavor = clp->cl_rpcclient->cl_auth->au_flavor;
+ WARN_ON(flavor != RPC_AUTH_GSS_KRB5I &&
+ flavor != RPC_AUTH_GSS_KRB5P);
+ *clntp = clp->cl_rpcclient;
+
+ return true;
+ }
+ return false;
}
-static inline int nfs4_setup_sequence(const struct nfs_server *server,
- struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
- struct rpc_task *task)
+/*
+ * Function responsible for determining if an rpc_message should use the
+ * machine cred under SP4_MACH_CRED and if so switching the credential and
+ * authflavor (using the nfs_client's rpc_clnt which will be krb5i/p).
+ * Should be called before rpc_call_sync/rpc_call_async.
+ */
+static inline void
+nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
+ struct rpc_clnt **clntp, struct rpc_message *msg)
{
- rpc_call_start(task);
- return 0;
+ _nfs4_state_protect(clp, sp4_mode, clntp, msg);
+}
+
+/*
+ * Special wrapper to nfs4_state_protect for write.
+ * If WRITE can use machine cred but COMMIT cannot, make sure all writes
+ * that use machine cred use NFS_FILE_SYNC.
+ */
+static inline void
+nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
+ struct rpc_message *msg, struct nfs_write_data *wdata)
+{
+ if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
+ !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
+ wdata->args.stable = NFS_FILE_SYNC;
+}
+#else /* CONFIG_NFS_v4_1 */
+static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
+{
+ return NULL;
}
static inline bool
@@ -298,6 +338,18 @@ is_ds_client(struct nfs_client *clp)
{
return false;
}
+
+static inline void
+nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
+ struct rpc_clnt **clntp, struct rpc_message *msg)
+{
+}
+
+static inline void
+nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
+ struct rpc_message *msg, struct nfs_write_data *wdata)
+{
+}
#endif /* CONFIG_NFS_V4_1 */
extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
@@ -308,6 +360,10 @@ extern const u32 nfs4_pathconf_bitmap[3];
extern const u32 nfs4_fsinfo_bitmap[3];
extern const u32 nfs4_fs_locations_bitmap[3];
+void nfs40_shutdown_client(struct nfs_client *);
+void nfs41_shutdown_client(struct nfs_client *);
+int nfs40_init_client(struct nfs_client *);
+int nfs41_init_client(struct nfs_client *);
void nfs4_free_client(struct nfs_client *);
struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *);
@@ -319,7 +375,7 @@ extern void nfs4_kill_renewd(struct nfs_client *);
extern void nfs4_renew_state(struct work_struct *);
/* nfs4state.c */
-struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
+struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp);
struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
int nfs4_discover_server_trunking(struct nfs_client *clp,
@@ -327,7 +383,6 @@ int nfs4_discover_server_trunking(struct nfs_client *clp,
int nfs40_discover_server_trunking(struct nfs_client *clp,
struct nfs_client **, struct rpc_cred *);
#if defined(CONFIG_NFS_V4_1)
-struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
int nfs41_discover_server_trunking(struct nfs_client *clp,
struct nfs_client **, struct rpc_cred *);
extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
@@ -382,6 +437,7 @@ struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct
extern bool nfs4_disable_idmapping;
extern unsigned short max_session_slots;
extern unsigned short send_implementation_id;
+extern bool recover_lost_locks;
#define NFS4_CLIENT_ID_UNIQ_LEN (64)
extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN];
@@ -429,6 +485,8 @@ static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
#define nfs4_close_state(a, b) do { } while (0)
#define nfs4_close_sync(a, b) do { } while (0)
+#define nfs4_state_protect(a, b, c, d) do { } while (0)
+#define nfs4_state_protect_write(a, b, c, d) do { } while (0)
#endif /* CONFIG_NFS_V4 */
#endif /* __LINUX_FS_NFS_NFS4_FS.H */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 90dce91dd5b5..a860ab566d6e 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -41,19 +41,138 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
}
#ifdef CONFIG_NFS_V4_1
-static void nfs4_shutdown_session(struct nfs_client *clp)
+/**
+ * Per auth flavor data server rpc clients
+ */
+struct nfs4_ds_server {
+ struct list_head list; /* ds_clp->cl_ds_clients */
+ struct rpc_clnt *rpc_clnt;
+};
+
+/**
+ * Common lookup case for DS I/O
+ */
+static struct nfs4_ds_server *
+nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
+{
+ struct nfs4_ds_server *dss;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(dss, &ds_clp->cl_ds_clients, list) {
+ if (dss->rpc_clnt->cl_auth->au_flavor != flavor)
+ continue;
+ goto out;
+ }
+ dss = NULL;
+out:
+ rcu_read_unlock();
+ return dss;
+}
+
+static struct nfs4_ds_server *
+nfs4_add_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor,
+ struct nfs4_ds_server *new)
+{
+ struct nfs4_ds_server *dss;
+
+ spin_lock(&ds_clp->cl_lock);
+ list_for_each_entry(dss, &ds_clp->cl_ds_clients, list) {
+ if (dss->rpc_clnt->cl_auth->au_flavor != flavor)
+ continue;
+ goto out;
+ }
+ if (new)
+ list_add_rcu(&new->list, &ds_clp->cl_ds_clients);
+ dss = new;
+out:
+ spin_unlock(&ds_clp->cl_lock); /* need some lock to protect list */
+ return dss;
+}
+
+static struct nfs4_ds_server *
+nfs4_alloc_ds_server(struct nfs_client *ds_clp, rpc_authflavor_t flavor)
+{
+ struct nfs4_ds_server *dss;
+
+ dss = kmalloc(sizeof(*dss), GFP_NOFS);
+ if (dss == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ dss->rpc_clnt = rpc_clone_client_set_auth(ds_clp->cl_rpcclient, flavor);
+ if (IS_ERR(dss->rpc_clnt)) {
+ int err = PTR_ERR(dss->rpc_clnt);
+ kfree (dss);
+ return ERR_PTR(err);
+ }
+ INIT_LIST_HEAD(&dss->list);
+
+ return dss;
+}
+
+static void
+nfs4_free_ds_server(struct nfs4_ds_server *dss)
+{
+ rpc_release_client(dss->rpc_clnt);
+ kfree(dss);
+}
+
+/**
+* Find or create a DS rpc client with th MDS server rpc client auth flavor
+* in the nfs_client cl_ds_clients list.
+*/
+struct rpc_clnt *
+nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode)
+{
+ struct nfs4_ds_server *dss, *new;
+ rpc_authflavor_t flavor = NFS_SERVER(inode)->client->cl_auth->au_flavor;
+
+ dss = nfs4_find_ds_client(ds_clp, flavor);
+ if (dss != NULL)
+ goto out;
+ new = nfs4_alloc_ds_server(ds_clp, flavor);
+ if (IS_ERR(new))
+ return ERR_CAST(new);
+ dss = nfs4_add_ds_client(ds_clp, flavor, new);
+ if (dss != new)
+ nfs4_free_ds_server(new);
+out:
+ return dss->rpc_clnt;
+}
+EXPORT_SYMBOL_GPL(nfs4_find_or_create_ds_client);
+
+static void
+nfs4_shutdown_ds_clients(struct nfs_client *clp)
+{
+ struct nfs4_ds_server *dss;
+ LIST_HEAD(shutdown_list);
+
+ while (!list_empty(&clp->cl_ds_clients)) {
+ dss = list_entry(clp->cl_ds_clients.next,
+ struct nfs4_ds_server, list);
+ list_del(&dss->list);
+ rpc_shutdown_client(dss->rpc_clnt);
+ kfree (dss);
+ }
+}
+
+void nfs41_shutdown_client(struct nfs_client *clp)
{
if (nfs4_has_session(clp)) {
+ nfs4_shutdown_ds_clients(clp);
nfs4_destroy_session(clp->cl_session);
nfs4_destroy_clientid(clp);
}
}
-#else /* CONFIG_NFS_V4_1 */
-static void nfs4_shutdown_session(struct nfs_client *clp)
+#endif /* CONFIG_NFS_V4_1 */
+
+void nfs40_shutdown_client(struct nfs_client *clp)
{
+ if (clp->cl_slot_tbl) {
+ nfs4_release_slot_table(clp->cl_slot_tbl);
+ kfree(clp->cl_slot_tbl);
+ }
}
-#endif /* CONFIG_NFS_V4_1 */
struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
{
@@ -73,6 +192,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
spin_lock_init(&clp->cl_lock);
INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
+ INIT_LIST_HEAD(&clp->cl_ds_clients);
rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
clp->cl_minorversion = cl_init->minorversion;
@@ -97,7 +217,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
{
if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
nfs4_kill_renewd(clp);
- nfs4_shutdown_session(clp);
+ clp->cl_mvops->shutdown_client(clp);
nfs4_destroy_callback(clp);
if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
nfs_idmap_delete(clp);
@@ -144,34 +264,77 @@ static int nfs4_init_callback(struct nfs_client *clp)
return 0;
}
+/**
+ * nfs40_init_client - nfs_client initialization tasks for NFSv4.0
+ * @clp - nfs_client to initialize
+ *
+ * Returns zero on success, or a negative errno if some error occurred.
+ */
+int nfs40_init_client(struct nfs_client *clp)
+{
+ struct nfs4_slot_table *tbl;
+ int ret;
+
+ tbl = kzalloc(sizeof(*tbl), GFP_NOFS);
+ if (tbl == NULL)
+ return -ENOMEM;
+
+ ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE,
+ "NFSv4.0 transport Slot table");
+ if (ret) {
+ kfree(tbl);
+ return ret;
+ }
+
+ clp->cl_slot_tbl = tbl;
+ return 0;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/**
+ * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+
+ * @clp - nfs_client to initialize
+ *
+ * Returns zero on success, or a negative errno if some error occurred.
+ */
+int nfs41_init_client(struct nfs_client *clp)
+{
+ struct nfs4_session *session = NULL;
+
+ /*
+ * Create the session and mark it expired.
+ * When a SEQUENCE operation encounters the expired session
+ * it will do session recovery to initialize it.
+ */
+ session = nfs4_alloc_session(clp);
+ if (!session)
+ return -ENOMEM;
+
+ clp->cl_session = session;
+
+ /*
+ * The create session reply races with the server back
+ * channel probe. Mark the client NFS_CS_SESSION_INITING
+ * so that the client back channel can find the
+ * nfs_client struct
+ */
+ nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING);
+ return 0;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
/*
* Initialize the minor version specific parts of an NFS4 client record
*/
static int nfs4_init_client_minor_version(struct nfs_client *clp)
{
-#if defined(CONFIG_NFS_V4_1)
- if (clp->cl_mvops->minor_version) {
- struct nfs4_session *session = NULL;
- /*
- * Create the session and mark it expired.
- * When a SEQUENCE operation encounters the expired session
- * it will do session recovery to initialize it.
- */
- session = nfs4_alloc_session(clp);
- if (!session)
- return -ENOMEM;
-
- clp->cl_session = session;
- /*
- * The create session reply races with the server back
- * channel probe. Mark the client NFS_CS_SESSION_INITING
- * so that the client back channel can find the
- * nfs_client struct
- */
- nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING);
- }
-#endif /* CONFIG_NFS_V4_1 */
+ int ret;
+ ret = clp->cl_mvops->init_client(clp);
+ if (ret)
+ return ret;
return nfs4_init_callback(clp);
}
@@ -187,8 +350,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
*/
struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
- const char *ip_addr,
- rpc_authflavor_t authflavour)
+ const char *ip_addr)
{
char buf[INET6_ADDRSTRLEN + 1];
struct nfs_client *old;
@@ -723,7 +885,7 @@ static void nfs4_session_set_rwsize(struct nfs_server *server)
}
static int nfs4_server_common_setup(struct nfs_server *server,
- struct nfs_fh *mntfh)
+ struct nfs_fh *mntfh, bool auth_probe)
{
struct nfs_fattr *fattr;
int error;
@@ -755,7 +917,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
/* Probe the root fh to retrieve its FSID and filehandle */
- error = nfs4_get_rootfh(server, mntfh);
+ error = nfs4_get_rootfh(server, mntfh, auth_probe);
if (error < 0)
goto out;
@@ -787,6 +949,7 @@ out:
static int nfs4_init_server(struct nfs_server *server,
const struct nfs_parsed_mount_data *data)
{
+ rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
struct rpc_timeout timeparms;
int error;
@@ -799,13 +962,16 @@ static int nfs4_init_server(struct nfs_server *server,
server->flags = data->flags;
server->options = data->options;
+ if (data->auth_flavor_len >= 1)
+ pseudoflavor = data->auth_flavors[0];
+
/* Get a client record */
error = nfs4_set_client(server,
data->nfs_server.hostname,
(const struct sockaddr *)&data->nfs_server.address,
data->nfs_server.addrlen,
data->client_address,
- data->auth_flavors[0],
+ pseudoflavor,
data->nfs_server.protocol,
&timeparms,
data->minorversion,
@@ -825,7 +991,7 @@ static int nfs4_init_server(struct nfs_server *server,
server->port = data->nfs_server.port;
- error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
+ error = nfs_init_server_rpcclient(server, &timeparms, pseudoflavor);
error:
/* Done */
@@ -843,6 +1009,7 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
struct nfs_subversion *nfs_mod)
{
struct nfs_server *server;
+ bool auth_probe;
int error;
dprintk("--> nfs4_create_server()\n");
@@ -851,12 +1018,14 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
if (!server)
return ERR_PTR(-ENOMEM);
+ auth_probe = mount_info->parsed->auth_flavor_len < 1;
+
/* set up the general RPC client */
error = nfs4_init_server(server, mount_info->parsed);
if (error < 0)
goto error;
- error = nfs4_server_common_setup(server, mount_info->mntfh);
+ error = nfs4_server_common_setup(server, mount_info->mntfh, auth_probe);
if (error < 0)
goto error;
@@ -909,7 +1078,8 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
if (error < 0)
goto error;
- error = nfs4_server_common_setup(server, mntfh);
+ error = nfs4_server_common_setup(server, mntfh,
+ !(parent_server->flags & NFS_MOUNT_SECFLAVOUR));
if (error < 0)
goto error;
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 17ed87ef9de8..b86464ba25e1 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -39,6 +39,7 @@
#include "internal.h"
#include "delegation.h"
#include "nfs4filelayout.h"
+#include "nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -247,6 +248,7 @@ static int filelayout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr = data->header;
int err;
+ trace_nfs4_pnfs_read(data, task->tk_status);
err = filelayout_async_handle_error(task, data->args.context->state,
data->ds_clp, hdr->lseg);
@@ -363,6 +365,7 @@ static int filelayout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr = data->header;
int err;
+ trace_nfs4_pnfs_write(data, task->tk_status);
err = filelayout_async_handle_error(task, data->args.context->state,
data->ds_clp, hdr->lseg);
@@ -395,6 +398,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
{
int err;
+ trace_nfs4_pnfs_commit_ds(data, task->tk_status);
err = filelayout_async_handle_error(task, NULL, data->ds_clp,
data->lseg);
@@ -524,6 +528,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
struct nfs_pgio_header *hdr = data->header;
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
loff_t offset = data->args.offset;
u32 j, idx;
struct nfs_fh *fh;
@@ -538,6 +543,11 @@ filelayout_read_pagelist(struct nfs_read_data *data)
ds = nfs4_fl_prepare_ds(lseg, idx);
if (!ds)
return PNFS_NOT_ATTEMPTED;
+
+ ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
+ if (IS_ERR(ds_clnt))
+ return PNFS_NOT_ATTEMPTED;
+
dprintk("%s USE DS: %s cl_count %d\n", __func__,
ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
@@ -552,7 +562,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
data->mds_offset = offset;
/* Perform an asynchronous read to ds */
- nfs_initiate_read(ds->ds_clp->cl_rpcclient, data,
+ nfs_initiate_read(ds_clnt, data,
&filelayout_read_call_ops, RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
}
@@ -564,6 +574,7 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
struct nfs_pgio_header *hdr = data->header;
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
loff_t offset = data->args.offset;
u32 j, idx;
struct nfs_fh *fh;
@@ -574,6 +585,11 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
ds = nfs4_fl_prepare_ds(lseg, idx);
if (!ds)
return PNFS_NOT_ATTEMPTED;
+
+ ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode);
+ if (IS_ERR(ds_clnt))
+ return PNFS_NOT_ATTEMPTED;
+
dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
__func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
@@ -591,7 +607,7 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
data->args.offset = filelayout_get_dserver_offset(lseg, offset);
/* Perform an asynchronous write */
- nfs_initiate_write(ds->ds_clp->cl_rpcclient, data,
+ nfs_initiate_write(ds_clnt, data,
&filelayout_write_call_ops, sync,
RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
@@ -1101,16 +1117,19 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
{
struct pnfs_layout_segment *lseg = data->lseg;
struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
u32 idx;
struct nfs_fh *fh;
idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
ds = nfs4_fl_prepare_ds(lseg, idx);
- if (!ds) {
- prepare_to_resend_writes(data);
- filelayout_commit_release(data);
- return -EAGAIN;
- }
+ if (!ds)
+ goto out_err;
+
+ ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, data->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_err;
+
dprintk("%s ino %lu, how %d cl_count %d\n", __func__,
data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count));
data->commit_done_cb = filelayout_commit_done_cb;
@@ -1119,9 +1138,13 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
if (fh)
data->args.fh = fh;
- return nfs_initiate_commit(ds->ds_clp->cl_rpcclient, data,
+ return nfs_initiate_commit(ds_clnt, data,
&filelayout_commit_call_ops, how,
RPC_TASK_SOFTCONN);
+out_err:
+ prepare_to_resend_writes(data);
+ filelayout_commit_release(data);
+ return -EAGAIN;
}
static int
diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c
index 549462e5b9b0..c0b3a16b4a00 100644
--- a/fs/nfs/nfs4getroot.c
+++ b/fs/nfs/nfs4getroot.c
@@ -9,7 +9,7 @@
#define NFSDBG_FACILITY NFSDBG_CLIENT
-int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
+int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe)
{
struct nfs_fsinfo fsinfo;
int ret = -ENOMEM;
@@ -21,7 +21,7 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
goto out;
/* Start by getting the root filehandle from the server */
- ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo);
+ ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe);
if (ret < 0) {
dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
goto out;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index cdb0b41a4810..2288cd3c9278 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sunrpc/clnt.h>
@@ -369,21 +370,33 @@ out:
struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
struct nfs_fh *fh, struct nfs_fattr *fattr)
{
+ rpc_authflavor_t flavor = server->client->cl_auth->au_flavor;
struct dentry *parent = dget_parent(dentry);
+ struct inode *dir = parent->d_inode;
+ struct qstr *name = &dentry->d_name;
struct rpc_clnt *client;
struct vfsmount *mnt;
/* Look it up again to get its attributes and sec flavor */
- client = nfs4_proc_lookup_mountpoint(parent->d_inode, &dentry->d_name, fh, fattr);
+ client = nfs4_proc_lookup_mountpoint(dir, name, fh, fattr);
dput(parent);
if (IS_ERR(client))
return ERR_CAST(client);
- if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+ if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
mnt = nfs_do_refmount(client, dentry);
- else
- mnt = nfs_do_submount(dentry, fh, fattr, client->cl_auth->au_flavor);
+ goto out;
+ }
+ if (client->cl_auth->au_flavor != flavor)
+ flavor = client->cl_auth->au_flavor;
+ else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) {
+ rpc_authflavor_t new = nfs4_negotiate_security(dir, name);
+ if ((int)new >= 0)
+ flavor = new;
+ }
+ mnt = nfs_do_submount(dentry, fh, fattr, flavor);
+out:
rpc_shutdown_client(client);
return mnt;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cf11799297c4..39b6cf2d1683 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -66,6 +66,8 @@
#include "nfs4session.h"
#include "fscache.h"
+#include "nfs4trace.h"
+
#define NFSDBG_FACILITY NFSDBG_PROC
#define NFS4_POLL_RETRY_MIN (HZ/10)
@@ -150,6 +152,7 @@ static int nfs4_map_errors(int err)
case -NFS4ERR_RECALLCONFLICT:
return -EREMOTEIO;
case -NFS4ERR_WRONGSEC:
+ case -NFS4ERR_WRONG_CRED:
return -EPERM;
case -NFS4ERR_BADOWNER:
case -NFS4ERR_BADNAME:
@@ -433,6 +436,20 @@ wait_on_recovery:
return ret;
}
+/*
+ * Return 'true' if 'clp' is using an rpc_client that is integrity protected
+ * or 'false' otherwise.
+ */
+static bool _nfs4_is_integrity_protected(struct nfs_client *clp)
+{
+ rpc_authflavor_t flavor = clp->cl_rpcclient->cl_auth->au_flavor;
+
+ if (flavor == RPC_AUTH_GSS_KRB5I ||
+ flavor == RPC_AUTH_GSS_KRB5P)
+ return true;
+
+ return false;
+}
static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
{
@@ -447,6 +464,88 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
do_renew_lease(server->nfs_client, timestamp);
}
+struct nfs4_call_sync_data {
+ const struct nfs_server *seq_server;
+ struct nfs4_sequence_args *seq_args;
+ struct nfs4_sequence_res *seq_res;
+};
+
+static void nfs4_init_sequence(struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res, int cache_reply)
+{
+ args->sa_slot = NULL;
+ args->sa_cache_this = cache_reply;
+ args->sa_privileged = 0;
+
+ res->sr_slot = NULL;
+}
+
+static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
+{
+ args->sa_privileged = 1;
+}
+
+static int nfs40_setup_sequence(const struct nfs_server *server,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct rpc_task *task)
+{
+ struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl;
+ struct nfs4_slot *slot;
+
+ /* slot already allocated? */
+ if (res->sr_slot != NULL)
+ goto out_start;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
+ goto out_sleep;
+
+ slot = nfs4_alloc_slot(tbl);
+ if (IS_ERR(slot)) {
+ if (slot == ERR_PTR(-ENOMEM))
+ task->tk_timeout = HZ >> 2;
+ goto out_sleep;
+ }
+ spin_unlock(&tbl->slot_tbl_lock);
+
+ args->sa_slot = slot;
+ res->sr_slot = slot;
+
+out_start:
+ rpc_call_start(task);
+ return 0;
+
+out_sleep:
+ if (args->sa_privileged)
+ rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task,
+ NULL, RPC_PRIORITY_PRIVILEGED);
+ else
+ rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+ spin_unlock(&tbl->slot_tbl_lock);
+ return -EAGAIN;
+}
+
+static int nfs40_sequence_done(struct rpc_task *task,
+ struct nfs4_sequence_res *res)
+{
+ struct nfs4_slot *slot = res->sr_slot;
+ struct nfs4_slot_table *tbl;
+
+ if (!RPC_WAS_SENT(task))
+ goto out;
+
+ tbl = slot->table;
+ spin_lock(&tbl->slot_tbl_lock);
+ if (!nfs41_wake_and_assign_slot(tbl, slot))
+ nfs4_free_slot(tbl, slot);
+ spin_unlock(&tbl->slot_tbl_lock);
+
+ res->sr_slot = NULL;
+out:
+ return 1;
+}
+
#if defined(CONFIG_NFS_V4_1)
static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
@@ -506,6 +605,7 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
interrupted = true;
}
+ trace_nfs4_sequence_done(session, res);
/* Check the SEQUENCE operation status */
switch (res->sr_status) {
case 0:
@@ -591,25 +691,11 @@ static int nfs4_sequence_done(struct rpc_task *task,
{
if (res->sr_slot == NULL)
return 1;
+ if (!res->sr_slot->table->session)
+ return nfs40_sequence_done(task, res);
return nfs41_sequence_done(task, res);
}
-static void nfs41_init_sequence(struct nfs4_sequence_args *args,
- struct nfs4_sequence_res *res, int cache_reply)
-{
- args->sa_slot = NULL;
- args->sa_cache_this = 0;
- args->sa_privileged = 0;
- if (cache_reply)
- args->sa_cache_this = 1;
- res->sr_slot = NULL;
-}
-
-static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
-{
- args->sa_privileged = 1;
-}
-
int nfs41_setup_sequence(struct nfs4_session *session,
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res,
@@ -647,7 +733,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
args->sa_slot = slot;
- dprintk("<-- %s slotid=%d seqid=%d\n", __func__,
+ dprintk("<-- %s slotid=%u seqid=%u\n", __func__,
slot->slot_nr, slot->seq_nr);
res->sr_slot = slot;
@@ -658,6 +744,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
* set to 1 if an rpc level failure occurs.
*/
res->sr_status = 1;
+ trace_nfs4_setup_sequence(session, args);
out_success:
rpc_call_start(task);
return 0;
@@ -673,38 +760,30 @@ out_sleep:
}
EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
-int nfs4_setup_sequence(const struct nfs_server *server,
- struct nfs4_sequence_args *args,
- struct nfs4_sequence_res *res,
- struct rpc_task *task)
+static int nfs4_setup_sequence(const struct nfs_server *server,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct rpc_task *task)
{
struct nfs4_session *session = nfs4_get_session(server);
int ret = 0;
- if (session == NULL) {
- rpc_call_start(task);
- goto out;
- }
+ if (!session)
+ return nfs40_setup_sequence(server, args, res, task);
- dprintk("--> %s clp %p session %p sr_slot %d\n",
+ dprintk("--> %s clp %p session %p sr_slot %u\n",
__func__, session->clp, session, res->sr_slot ?
- res->sr_slot->slot_nr : -1);
+ res->sr_slot->slot_nr : NFS4_NO_SLOT);
ret = nfs41_setup_sequence(session, args, res, task);
-out:
+
dprintk("<-- %s status=%d\n", __func__, ret);
return ret;
}
-struct nfs41_call_sync_data {
- const struct nfs_server *seq_server;
- struct nfs4_sequence_args *seq_args;
- struct nfs4_sequence_res *seq_res;
-};
-
static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
{
- struct nfs41_call_sync_data *data = calldata;
+ struct nfs4_call_sync_data *data = calldata;
struct nfs4_session *session = nfs4_get_session(data->seq_server);
dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
@@ -714,7 +793,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
{
- struct nfs41_call_sync_data *data = calldata;
+ struct nfs4_call_sync_data *data = calldata;
nfs41_sequence_done(task, data->seq_res);
}
@@ -724,6 +803,42 @@ static const struct rpc_call_ops nfs41_call_sync_ops = {
.rpc_call_done = nfs41_call_sync_done,
};
+#else /* !CONFIG_NFS_V4_1 */
+
+static int nfs4_setup_sequence(const struct nfs_server *server,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct rpc_task *task)
+{
+ return nfs40_setup_sequence(server, args, res, task);
+}
+
+static int nfs4_sequence_done(struct rpc_task *task,
+ struct nfs4_sequence_res *res)
+{
+ return nfs40_sequence_done(task, res);
+}
+
+#endif /* !CONFIG_NFS_V4_1 */
+
+static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_call_sync_data *data = calldata;
+ nfs4_setup_sequence(data->seq_server,
+ data->seq_args, data->seq_res, task);
+}
+
+static void nfs40_call_sync_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_call_sync_data *data = calldata;
+ nfs4_sequence_done(task, data->seq_res);
+}
+
+static const struct rpc_call_ops nfs40_call_sync_ops = {
+ .rpc_call_prepare = nfs40_call_sync_prepare,
+ .rpc_call_done = nfs40_call_sync_done,
+};
+
static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
struct nfs_server *server,
struct rpc_message *msg,
@@ -732,7 +847,8 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
{
int ret;
struct rpc_task *task;
- struct nfs41_call_sync_data data = {
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_call_sync_data data = {
.seq_server = server,
.seq_args = args,
.seq_res = res,
@@ -740,7 +856,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
struct rpc_task_setup task_setup = {
.rpc_client = clnt,
.rpc_message = msg,
- .callback_ops = &nfs41_call_sync_ops,
+ .callback_ops = clp->cl_mvops->call_sync_ops,
.callback_data = &data
};
@@ -754,35 +870,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
return ret;
}
-#else
-static
-void nfs41_init_sequence(struct nfs4_sequence_args *args,
- struct nfs4_sequence_res *res, int cache_reply)
-{
-}
-
-static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
-{
-}
-
-
-static int nfs4_sequence_done(struct rpc_task *task,
- struct nfs4_sequence_res *res)
-{
- return 1;
-}
-#endif /* CONFIG_NFS_V4_1 */
-
-static
-int _nfs4_call_sync(struct rpc_clnt *clnt,
- struct nfs_server *server,
- struct rpc_message *msg,
- struct nfs4_sequence_args *args,
- struct nfs4_sequence_res *res)
-{
- return rpc_call_sync(clnt, msg, 0);
-}
-
static
int nfs4_call_sync(struct rpc_clnt *clnt,
struct nfs_server *server,
@@ -791,9 +878,8 @@ int nfs4_call_sync(struct rpc_clnt *clnt,
struct nfs4_sequence_res *res,
int cache_reply)
{
- nfs41_init_sequence(args, res, cache_reply);
- return server->nfs_client->cl_mvops->call_sync(clnt, server, msg,
- args, res);
+ nfs4_init_sequence(args, res, cache_reply);
+ return nfs4_call_sync_sequence(clnt, server, msg, args, res);
}
static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
@@ -933,7 +1019,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p->o_arg.fh = NFS_FH(dentry->d_inode);
}
if (attrs != NULL && attrs->ia_valid != 0) {
- __be32 verf[2];
+ __u32 verf[2];
p->o_arg.u.attrs = &p->attrs;
memcpy(&p->attrs, attrs, sizeof(p->attrs));
@@ -1103,7 +1189,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
goto no_delegation;
spin_lock(&deleg_cur->lock);
- if (nfsi->delegation != deleg_cur ||
+ if (rcu_dereference(nfsi->delegation) != deleg_cur ||
test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) ||
(deleg_cur->type & fmode) != fmode)
goto no_delegation_unlock;
@@ -1440,6 +1526,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
int err;
do {
err = _nfs4_do_open_reclaim(ctx, state);
+ trace_nfs4_open_reclaim(ctx, 0, err);
if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
continue;
if (err != -NFS4ERR_DELAY)
@@ -1524,10 +1611,20 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
return nfs4_handle_delegation_recall_error(server, state, stateid, err);
}
+static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_opendata *data = calldata;
+
+ nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args,
+ &data->o_res.seq_res, task);
+}
+
static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
{
struct nfs4_opendata *data = calldata;
+ nfs40_sequence_done(task, &data->o_res.seq_res);
+
data->rpc_status = task->tk_status;
if (data->rpc_status == 0) {
nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid);
@@ -1556,6 +1653,7 @@ out_free:
}
static const struct rpc_call_ops nfs4_open_confirm_ops = {
+ .rpc_call_prepare = nfs4_open_confirm_prepare,
.rpc_call_done = nfs4_open_confirm_done,
.rpc_release = nfs4_open_confirm_release,
};
@@ -1583,6 +1681,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
};
int status;
+ nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1);
kref_get(&data->kref);
data->rpc_done = 0;
data->rpc_status = 0;
@@ -1742,7 +1841,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
};
int status;
- nfs41_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1);
+ nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1);
kref_get(&data->kref);
data->rpc_done = 0;
data->rpc_status = 0;
@@ -1895,6 +1994,7 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
do {
err = _nfs4_open_expired(ctx, state);
+ trace_nfs4_open_expired(ctx, 0, err);
if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
continue;
switch (err) {
@@ -1944,6 +2044,7 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
cred = get_rpccred(delegation->cred);
rcu_read_unlock();
status = nfs41_test_stateid(server, stateid, cred);
+ trace_nfs4_test_delegation_stateid(state, NULL, status);
} else
rcu_read_unlock();
@@ -1986,6 +2087,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
return -NFS4ERR_BAD_STATEID;
status = nfs41_test_stateid(server, stateid, cred);
+ trace_nfs4_test_open_stateid(state, NULL, status);
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
@@ -2197,6 +2299,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
do {
status = _nfs4_do_open(dir, ctx, flags, sattr, label);
res = ctx->state;
+ trace_nfs4_open_file(ctx, flags, status);
if (status == 0)
break;
/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2310,6 +2413,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
int err;
do {
err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
+ trace_nfs4_setattr(inode, err);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -2387,6 +2491,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
dprintk("%s: begin!\n", __func__);
if (!nfs4_sequence_done(task, &calldata->res.seq_res))
return;
+ trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
/* hmm. we are done with the inode, and in the process of freeing
* the state_owner. we keep this around to process errors
*/
@@ -2511,10 +2616,13 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
};
int status = -ENOMEM;
+ nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
calldata = kzalloc(sizeof(*calldata), gfp_mask);
if (calldata == NULL)
goto out;
- nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1);
+ nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1);
calldata->inode = state->inode;
calldata->state = state;
calldata->arg.fh = NFS_FH(state->inode);
@@ -2690,6 +2798,7 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
int err;
do {
err = _nfs4_lookup_root(server, fhandle, info);
+ trace_nfs4_lookup_root(server, fhandle, info->fattr, err);
switch (err) {
case 0:
case -NFS4ERR_WRONGSEC:
@@ -2705,10 +2814,13 @@ out:
static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_fsinfo *info, rpc_authflavor_t flavor)
{
+ struct rpc_auth_create_args auth_args = {
+ .pseudoflavor = flavor,
+ };
struct rpc_auth *auth;
int ret;
- auth = rpcauth_create(flavor, server->client);
+ auth = rpcauth_create(&auth_args, server->client);
if (IS_ERR(auth)) {
ret = -EACCES;
goto out;
@@ -2772,18 +2884,27 @@ static int nfs4_do_find_root_sec(struct nfs_server *server,
* @server: initialized nfs_server handle
* @fhandle: we fill in the pseudo-fs root file handle
* @info: we fill in an FSINFO struct
+ * @auth_probe: probe the auth flavours
*
* Returns zero on success, or a negative errno.
*/
int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fsinfo *info)
+ struct nfs_fsinfo *info,
+ bool auth_probe)
{
int status;
- status = nfs4_lookup_root(server, fhandle, info);
- if ((status == -NFS4ERR_WRONGSEC) &&
- !(server->flags & NFS_MOUNT_SECFLAVOUR))
+ switch (auth_probe) {
+ case false:
+ status = nfs4_lookup_root(server, fhandle, info);
+ if (status != -NFS4ERR_WRONGSEC)
+ break;
+ /* Did user force a 'sec=' mount option? */
+ if (server->flags & NFS_MOUNT_SECFLAVOUR)
+ break;
+ default:
status = nfs4_do_find_root_sec(server, fhandle, info);
+ }
if (status == 0)
status = nfs4_server_capabilities(server, fhandle);
@@ -2899,8 +3020,9 @@ static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs4_exception exception = { };
int err;
do {
- err = nfs4_handle_exception(server,
- _nfs4_proc_getattr(server, fhandle, fattr, label),
+ err = _nfs4_proc_getattr(server, fhandle, fattr, label);
+ trace_nfs4_getattr(server, fhandle, fattr, err);
+ err = nfs4_handle_exception(server, err,
&exception);
} while (exception.retry);
return err;
@@ -2940,10 +3062,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
/* Deal with open(O_TRUNC) */
if (sattr->ia_valid & ATTR_OPEN)
- sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
+ sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME);
/* Optimization: if the end result is no change, don't RPC */
- if ((sattr->ia_valid & ~(ATTR_FILE)) == 0)
+ if ((sattr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
return 0;
/* Search for an existing open(O_WRITE) file */
@@ -3020,6 +3142,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
int err;
do {
err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
+ trace_nfs4_lookup(dir, name, err);
switch (err) {
case -NFS4ERR_BADNAME:
err = -ENOENT;
@@ -3031,7 +3154,9 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
err = -EPERM;
if (client != *clnt)
goto out;
-
+ /* No security negotiation if the user specified 'sec=' */
+ if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR)
+ goto out;
client = nfs4_create_sec_client(client, dir, name);
if (IS_ERR(client))
return PTR_ERR(client);
@@ -3071,15 +3196,13 @@ struct rpc_clnt *
nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
+ struct rpc_clnt *client = NFS_CLIENT(dir);
int status;
- struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir));
status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL);
- if (status < 0) {
- rpc_shutdown_client(client);
+ if (status < 0)
return ERR_PTR(status);
- }
- return client;
+ return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client;
}
static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
@@ -3136,8 +3259,9 @@ static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
struct nfs4_exception exception = { };
int err;
do {
- err = nfs4_handle_exception(NFS_SERVER(inode),
- _nfs4_proc_access(inode, entry),
+ err = _nfs4_proc_access(inode, entry);
+ trace_nfs4_access(inode, err);
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
} while (exception.retry);
return err;
@@ -3190,8 +3314,9 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,
struct nfs4_exception exception = { };
int err;
do {
- err = nfs4_handle_exception(NFS_SERVER(inode),
- _nfs4_proc_readlink(inode, page, pgbase, pglen),
+ err = _nfs4_proc_readlink(inode, page, pgbase, pglen);
+ trace_nfs4_readlink(inode, err);
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
} while (exception.retry);
return err;
@@ -3255,8 +3380,9 @@ static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
struct nfs4_exception exception = { };
int err;
do {
- err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_remove(dir, name),
+ err = _nfs4_proc_remove(dir, name);
+ trace_nfs4_remove(dir, name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
&exception);
} while (exception.retry);
return err;
@@ -3270,7 +3396,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
res->server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
- nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
+ nfs4_init_sequence(&args->seq_args, &res->seq_res, 1);
nfs_fattr_init(res->dir_attr);
}
@@ -3285,7 +3411,8 @@ static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlin
static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
{
- struct nfs_removeres *res = task->tk_msg.rpc_resp;
+ struct nfs_unlinkdata *data = task->tk_calldata;
+ struct nfs_removeres *res = &data->res;
if (!nfs4_sequence_done(task, &res->seq_res))
return 0;
@@ -3303,7 +3430,7 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
res->server = server;
- nfs41_init_sequence(&arg->seq_args, &res->seq_res, 1);
+ nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1);
}
static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
@@ -3317,7 +3444,8 @@ static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renam
static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
struct inode *new_dir)
{
- struct nfs_renameres *res = task->tk_msg.rpc_resp;
+ struct nfs_renamedata *data = task->tk_calldata;
+ struct nfs_renameres *res = &data->res;
if (!nfs4_sequence_done(task, &res->seq_res))
return 0;
@@ -3363,9 +3491,10 @@ static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
struct nfs4_exception exception = { };
int err;
do {
- err = nfs4_handle_exception(NFS_SERVER(old_dir),
- _nfs4_proc_rename(old_dir, old_name,
- new_dir, new_name),
+ err = _nfs4_proc_rename(old_dir, old_name,
+ new_dir, new_name);
+ trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err);
+ err = nfs4_handle_exception(NFS_SERVER(old_dir), err,
&exception);
} while (exception.retry);
return err;
@@ -3527,9 +3656,9 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
label = nfs4_label_init_security(dir, dentry, sattr, &l);
do {
- err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_symlink(dir, dentry, page,
- len, sattr, label),
+ err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label);
+ trace_nfs4_symlink(dir, &dentry->d_name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
&exception);
} while (exception.retry);
@@ -3566,8 +3695,9 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
sattr->ia_mode &= ~current_umask();
do {
- err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_mkdir(dir, dentry, sattr, label),
+ err = _nfs4_proc_mkdir(dir, dentry, sattr, label);
+ trace_nfs4_mkdir(dir, &dentry->d_name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
&exception);
} while (exception.retry);
nfs4_label_release_security(label);
@@ -3620,9 +3750,10 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
struct nfs4_exception exception = { };
int err;
do {
- err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
- _nfs4_proc_readdir(dentry, cred, cookie,
- pages, count, plus),
+ err = _nfs4_proc_readdir(dentry, cred, cookie,
+ pages, count, plus);
+ trace_nfs4_readdir(dentry->d_inode, err);
+ err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), err,
&exception);
} while (exception.retry);
return err;
@@ -3674,8 +3805,9 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
sattr->ia_mode &= ~current_umask();
do {
- err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_mknod(dir, dentry, sattr, label, rdev),
+ err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev);
+ trace_nfs4_mknod(dir, &dentry->d_name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
&exception);
} while (exception.retry);
@@ -3743,6 +3875,7 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str
do {
err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
+ trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err);
if (err == 0) {
struct nfs_client *clp = server->nfs_client;
@@ -3861,6 +3994,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
{
struct nfs_server *server = NFS_SERVER(data->header->inode);
+ trace_nfs4_read(data, task->tk_status);
if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
@@ -3904,24 +4038,29 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
data->timestamp = jiffies;
data->read_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
- nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
}
-static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
{
if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
&data->args.seq_args,
&data->res.seq_res,
task))
- return;
- nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
- data->args.lock_context, FMODE_READ);
+ return 0;
+ if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
+ data->args.lock_context, FMODE_READ) == -EIO)
+ return -EIO;
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+ return -EIO;
+ return 0;
}
static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
{
struct inode *inode = data->header->inode;
+ trace_nfs4_write(data, task->tk_status);
if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
@@ -3987,18 +4126,22 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
data->timestamp = jiffies;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
- nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
}
-static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
{
if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
&data->args.seq_args,
&data->res.seq_res,
task))
- return;
- nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
- data->args.lock_context, FMODE_WRITE);
+ return 0;
+ if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
+ data->args.lock_context, FMODE_WRITE) == -EIO)
+ return -EIO;
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+ return -EIO;
+ return 0;
}
static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -4013,6 +4156,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
{
struct inode *inode = data->inode;
+ trace_nfs4_commit(data, task->tk_status);
if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
rpc_restart_call_prepare(task);
return -EAGAIN;
@@ -4035,7 +4179,7 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess
data->commit_done_cb = nfs4_commit_done_cb;
data->res.server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
- nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
}
struct nfs4_renewdata {
@@ -4064,6 +4208,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
struct nfs_client *clp = data->client;
unsigned long timestamp = data->timestamp;
+ trace_nfs4_renew_async(clp, task->tk_status);
if (task->tk_status < 0) {
/* Unless we're shutting down, schedule state recovery! */
if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
@@ -4321,6 +4466,7 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl
ssize_t ret;
do {
ret = __nfs4_get_acl_uncached(inode, buf, buflen);
+ trace_nfs4_get_acl(inode, ret);
if (ret >= 0)
break;
ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception);
@@ -4400,8 +4546,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
struct nfs4_exception exception = { };
int err;
do {
- err = nfs4_handle_exception(NFS_SERVER(inode),
- __nfs4_proc_set_acl(inode, buf, buflen),
+ err = __nfs4_proc_set_acl(inode, buf, buflen);
+ trace_nfs4_set_acl(inode, err);
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
} while (exception.retry);
return err;
@@ -4454,8 +4601,9 @@ static int nfs4_get_security_label(struct inode *inode, void *buf,
return -EOPNOTSUPP;
do {
- err = nfs4_handle_exception(NFS_SERVER(inode),
- _nfs4_get_security_label(inode, buf, buflen),
+ err = _nfs4_get_security_label(inode, buf, buflen);
+ trace_nfs4_get_security_label(inode, err);
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
} while (exception.retry);
return err;
@@ -4507,9 +4655,10 @@ static int nfs4_do_set_security_label(struct inode *inode,
int err;
do {
- err = nfs4_handle_exception(NFS_SERVER(inode),
- _nfs4_do_set_security_label(inode, ilabel,
- fattr, olabel),
+ err = _nfs4_do_set_security_label(inode, ilabel,
+ fattr, olabel);
+ trace_nfs4_set_security_label(inode, err);
+ err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
} while (exception.retry);
return err;
@@ -4632,11 +4781,11 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
/* An impossible timestamp guarantees this value
* will never match a generated boot time. */
verf[0] = 0;
- verf[1] = (__be32)(NSEC_PER_SEC + 1);
+ verf[1] = cpu_to_be32(NSEC_PER_SEC + 1);
} else {
struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
- verf[0] = (__be32)nn->boot_time.tv_sec;
- verf[1] = (__be32)nn->boot_time.tv_nsec;
+ verf[0] = cpu_to_be32(nn->boot_time.tv_sec);
+ verf[1] = cpu_to_be32(nn->boot_time.tv_nsec);
}
memcpy(bootverf->data, verf, sizeof(bootverf->data));
}
@@ -4662,10 +4811,14 @@ static unsigned int
nfs4_init_uniform_client_string(const struct nfs_client *clp,
char *buf, size_t len)
{
- char *nodename = clp->cl_rpcclient->cl_nodename;
+ const char *nodename = clp->cl_rpcclient->cl_nodename;
if (nfs4_client_id_uniquifier[0] != '\0')
- nodename = nfs4_client_id_uniquifier;
+ return scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
+ clp->rpc_ops->version,
+ clp->cl_minorversion,
+ nfs4_client_id_uniquifier,
+ nodename);
return scnprintf(buf, len, "Linux NFSv%u.%u %s",
clp->rpc_ops->version, clp->cl_minorversion,
nodename);
@@ -4726,6 +4879,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
clp->cl_rpcclient->cl_auth->au_ops->au_name,
setclientid.sc_name_len, setclientid.sc_name);
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ trace_nfs4_setclientid(clp, status);
dprintk("NFS reply setclientid: %d\n", status);
return status;
}
@@ -4753,6 +4907,7 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
clp->cl_rpcclient->cl_auth->au_ops->au_name,
clp->cl_clientid);
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ trace_nfs4_setclientid_confirm(clp, status);
dprintk("NFS reply setclientid_confirm: %d\n", status);
return status;
}
@@ -4774,6 +4929,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
if (!nfs4_sequence_done(task, &data->res.seq_res))
return;
+ trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
switch (task->tk_status) {
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
@@ -4795,7 +4951,6 @@ static void nfs4_delegreturn_release(void *calldata)
kfree(calldata);
}
-#if defined(CONFIG_NFS_V4_1)
static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
{
struct nfs4_delegreturndata *d_data;
@@ -4807,12 +4962,9 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
&d_data->res.seq_res,
task);
}
-#endif /* CONFIG_NFS_V4_1 */
static const struct rpc_call_ops nfs4_delegreturn_ops = {
-#if defined(CONFIG_NFS_V4_1)
.rpc_call_prepare = nfs4_delegreturn_prepare,
-#endif /* CONFIG_NFS_V4_1 */
.rpc_call_done = nfs4_delegreturn_done,
.rpc_release = nfs4_delegreturn_release,
};
@@ -4837,7 +4989,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
data = kzalloc(sizeof(*data), GFP_NOFS);
if (data == NULL)
return -ENOMEM;
- nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
data->args.fhandle = &data->fh;
data->args.stateid = &data->stateid;
data->args.bitmask = server->cache_consistency_bitmask;
@@ -4877,6 +5029,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
int err;
do {
err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
+ trace_nfs4_delegreturn(inode, err);
switch (err) {
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
@@ -4951,8 +5104,9 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
int err;
do {
- err = nfs4_handle_exception(NFS_SERVER(state->inode),
- _nfs4_proc_getlk(state, cmd, request),
+ err = _nfs4_proc_getlk(state, cmd, request);
+ trace_nfs4_get_lock(request, state, cmd, err);
+ err = nfs4_handle_exception(NFS_SERVER(state->inode), err,
&exception);
} while (exception.retry);
return err;
@@ -5089,6 +5243,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
.flags = RPC_TASK_ASYNC,
};
+ nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client,
+ NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg);
+
/* Ensure this is an unlock - when canceling a lock, the
* canceled lock is passed in, and it won't be an unlock.
*/
@@ -5100,7 +5257,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
return ERR_PTR(-ENOMEM);
}
- nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
+ nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
msg.rpc_argp = &data->arg;
msg.rpc_resp = &data->res;
task_setup_data.callback_data = data;
@@ -5150,6 +5307,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
rpc_put_task(task);
out:
request->fl_flags = fl_flags;
+ trace_nfs4_unlock(request, state, F_SETLK, status);
return status;
}
@@ -5335,7 +5493,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
return -ENOMEM;
if (IS_SETLKW(cmd))
data->arg.block = 1;
- nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
+ nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
msg.rpc_argp = &data->arg;
msg.rpc_resp = &data->res;
task_setup_data.callback_data = data;
@@ -5373,6 +5531,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
+ trace_nfs4_lock_reclaim(request, state, F_SETLK, err);
if (err != -NFS4ERR_DELAY)
break;
nfs4_handle_exception(server, err, &exception);
@@ -5391,10 +5550,15 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
err = nfs4_set_lock_state(state, request);
if (err != 0)
return err;
+ if (!recover_lost_locks) {
+ set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags);
+ return 0;
+ }
do {
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
+ trace_nfs4_lock_expired(request, state, F_SETLK, err);
switch (err) {
default:
goto out;
@@ -5430,6 +5594,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
status = nfs41_test_stateid(server,
&lsp->ls_stateid,
cred);
+ trace_nfs4_test_lock_stateid(state, lsp, status);
if (status != NFS_OK) {
/* Free the stateid unless the server
* informs us the stateid is unrecognized. */
@@ -5517,6 +5682,7 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
do {
err = _nfs4_proc_setlk(state, cmd, request);
+ trace_nfs4_set_lock(request, state, cmd, err);
if (err == -NFS4ERR_DENIED)
err = -EAGAIN;
err = nfs4_handle_exception(NFS_SERVER(state->inode),
@@ -5599,8 +5765,23 @@ struct nfs_release_lockowner_data {
struct nfs4_lock_state *lsp;
struct nfs_server *server;
struct nfs_release_lockowner_args args;
+ struct nfs4_sequence_args seq_args;
+ struct nfs4_sequence_res seq_res;
};
+static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs_release_lockowner_data *data = calldata;
+ nfs40_setup_sequence(data->server,
+ &data->seq_args, &data->seq_res, task);
+}
+
+static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs_release_lockowner_data *data = calldata;
+ nfs40_sequence_done(task, &data->seq_res);
+}
+
static void nfs4_release_lockowner_release(void *calldata)
{
struct nfs_release_lockowner_data *data = calldata;
@@ -5609,6 +5790,8 @@ static void nfs4_release_lockowner_release(void *calldata)
}
static const struct rpc_call_ops nfs4_release_lockowner_ops = {
+ .rpc_call_prepare = nfs4_release_lockowner_prepare,
+ .rpc_call_done = nfs4_release_lockowner_done,
.rpc_release = nfs4_release_lockowner_release,
};
@@ -5621,14 +5804,17 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
if (server->nfs_client->cl_mvops->minor_version != 0)
return -EINVAL;
+
data = kmalloc(sizeof(*data), GFP_NOFS);
if (!data)
return -ENOMEM;
+ nfs4_init_sequence(&data->seq_args, &data->seq_res, 0);
data->lsp = lsp;
data->server = server;
data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
data->args.lock_owner.id = lsp->ls_seqid.owner_id;
data->args.lock_owner.s_dev = server->s_dev;
+
msg.rpc_argp = &data->args;
rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
return 0;
@@ -5783,14 +5969,23 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
struct nfs4_exception exception = { };
int err;
do {
- err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_fs_locations(client, dir, name, fs_locations, page),
+ err = _nfs4_proc_fs_locations(client, dir, name,
+ fs_locations, page);
+ trace_nfs4_get_fs_locations(dir, name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
&exception);
} while (exception.retry);
return err;
}
-static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
+/**
+ * If 'use_integrity' is true and the state managment nfs_client
+ * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
+ * and the machine credential as per RFC3530bis and RFC5661 Security
+ * Considerations sections. Otherwise, just use the user cred with the
+ * filesystem's rpc_client.
+ */
+static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity)
{
int status;
struct nfs4_secinfo_arg args = {
@@ -5805,10 +6000,25 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
.rpc_argp = &args,
.rpc_resp = &res,
};
+ struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
+
+ if (use_integrity) {
+ clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient;
+ msg.rpc_cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client);
+ }
dprintk("NFS call secinfo %s\n", name->name);
- status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
+
+ nfs4_state_protect(NFS_SERVER(dir)->nfs_client,
+ NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
+
+ status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args,
+ &res.seq_res, 0);
dprintk("NFS reply secinfo: %d\n", status);
+
+ if (msg.rpc_cred)
+ put_rpccred(msg.rpc_cred);
+
return status;
}
@@ -5818,8 +6028,23 @@ int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
struct nfs4_exception exception = { };
int err;
do {
- err = nfs4_handle_exception(NFS_SERVER(dir),
- _nfs4_proc_secinfo(dir, name, flavors),
+ err = -NFS4ERR_WRONGSEC;
+
+ /* try to use integrity protection with machine cred */
+ if (_nfs4_is_integrity_protected(NFS_SERVER(dir)->nfs_client))
+ err = _nfs4_proc_secinfo(dir, name, flavors, true);
+
+ /*
+ * if unable to use integrity protection, or SECINFO with
+ * integrity protection returns NFS4ERR_WRONGSEC (which is
+ * disallowed by spec, but exists in deployed servers) use
+ * the current filesystem's rpc_client and the user cred.
+ */
+ if (err == -NFS4ERR_WRONGSEC)
+ err = _nfs4_proc_secinfo(dir, name, flavors, false);
+
+ trace_nfs4_secinfo(dir, name, err);
+ err = nfs4_handle_exception(NFS_SERVER(dir), err,
&exception);
} while (exception.retry);
return err;
@@ -5883,6 +6108,7 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
}
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ trace_nfs4_bind_conn_to_session(clp, status);
if (status == 0) {
if (memcmp(res.session->sess_id.data,
clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
@@ -5911,16 +6137,124 @@ out:
}
/*
- * nfs4_proc_exchange_id()
+ * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map
+ * and operations we'd like to see to enable certain features in the allow map
+ */
+static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
+ .how = SP4_MACH_CRED,
+ .enforce.u.words = {
+ [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+ 1 << (OP_EXCHANGE_ID - 32) |
+ 1 << (OP_CREATE_SESSION - 32) |
+ 1 << (OP_DESTROY_SESSION - 32) |
+ 1 << (OP_DESTROY_CLIENTID - 32)
+ },
+ .allow.u.words = {
+ [0] = 1 << (OP_CLOSE) |
+ 1 << (OP_LOCKU),
+ [1] = 1 << (OP_SECINFO - 32) |
+ 1 << (OP_SECINFO_NO_NAME - 32) |
+ 1 << (OP_TEST_STATEID - 32) |
+ 1 << (OP_FREE_STATEID - 32)
+ }
+};
+
+/*
+ * Select the state protection mode for client `clp' given the server results
+ * from exchange_id in `sp'.
*
- * Returns zero, a negative errno, or a negative NFS4ERR status code.
+ * Returns 0 on success, negative errno otherwise.
+ */
+static int nfs4_sp4_select_mode(struct nfs_client *clp,
+ struct nfs41_state_protection *sp)
+{
+ static const u32 supported_enforce[NFS4_OP_MAP_NUM_WORDS] = {
+ [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+ 1 << (OP_EXCHANGE_ID - 32) |
+ 1 << (OP_CREATE_SESSION - 32) |
+ 1 << (OP_DESTROY_SESSION - 32) |
+ 1 << (OP_DESTROY_CLIENTID - 32)
+ };
+ unsigned int i;
+
+ if (sp->how == SP4_MACH_CRED) {
+ /* Print state protect result */
+ dfprintk(MOUNT, "Server SP4_MACH_CRED support:\n");
+ for (i = 0; i <= LAST_NFS4_OP; i++) {
+ if (test_bit(i, sp->enforce.u.longs))
+ dfprintk(MOUNT, " enforce op %d\n", i);
+ if (test_bit(i, sp->allow.u.longs))
+ dfprintk(MOUNT, " allow op %d\n", i);
+ }
+
+ /* make sure nothing is on enforce list that isn't supported */
+ for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) {
+ if (sp->enforce.u.words[i] & ~supported_enforce[i]) {
+ dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Minimal mode - state operations are allowed to use machine
+ * credential. Note this already happens by default, so the
+ * client doesn't have to do anything more than the negotiation.
+ *
+ * NOTE: we don't care if EXCHANGE_ID is in the list -
+ * we're already using the machine cred for exchange_id
+ * and will never use a different cred.
+ */
+ if (test_bit(OP_BIND_CONN_TO_SESSION, sp->enforce.u.longs) &&
+ test_bit(OP_CREATE_SESSION, sp->enforce.u.longs) &&
+ test_bit(OP_DESTROY_SESSION, sp->enforce.u.longs) &&
+ test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) {
+ dfprintk(MOUNT, "sp4_mach_cred:\n");
+ dfprintk(MOUNT, " minimal mode enabled\n");
+ set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags);
+ } else {
+ dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
+ return -EINVAL;
+ }
+
+ if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
+ test_bit(OP_LOCKU, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " cleanup mode enabled\n");
+ set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
+ }
+
+ if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
+ test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " secinfo mode enabled\n");
+ set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags);
+ }
+
+ if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) &&
+ test_bit(OP_FREE_STATEID, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " stateid mode enabled\n");
+ set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags);
+ }
+
+ if (test_bit(OP_WRITE, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " write mode enabled\n");
+ set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags);
+ }
+
+ if (test_bit(OP_COMMIT, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " commit mode enabled\n");
+ set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * _nfs4_proc_exchange_id()
*
- * Since the clientid has expired, all compounds using sessions
- * associated with the stale clientid will be returning
- * NFS4ERR_BADSESSION in the sequence operation, and will therefore
- * be in some phase of session reset.
+ * Wrapper for EXCHANGE_ID operation.
*/
-int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
+static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+ u32 sp4_how)
{
nfs4_verifier verifier;
struct nfs41_exchange_id_args args = {
@@ -5967,10 +6301,30 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
goto out_server_scope;
}
+ switch (sp4_how) {
+ case SP4_NONE:
+ args.state_protect.how = SP4_NONE;
+ break;
+
+ case SP4_MACH_CRED:
+ args.state_protect = nfs4_sp4_mach_cred_request;
+ break;
+
+ default:
+ /* unsupported! */
+ WARN_ON_ONCE(1);
+ status = -EINVAL;
+ goto out_server_scope;
+ }
+
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ trace_nfs4_exchange_id(clp, status);
if (status == 0)
status = nfs4_check_cl_exchange_flags(res.flags);
+ if (status == 0)
+ status = nfs4_sp4_select_mode(clp, &res.state_protect);
+
if (status == 0) {
clp->cl_clientid = res.clientid;
clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R);
@@ -6017,6 +6371,35 @@ out:
return status;
}
+/*
+ * nfs4_proc_exchange_id()
+ *
+ * Returns zero, a negative errno, or a negative NFS4ERR status code.
+ *
+ * Since the clientid has expired, all compounds using sessions
+ * associated with the stale clientid will be returning
+ * NFS4ERR_BADSESSION in the sequence operation, and will therefore
+ * be in some phase of session reset.
+ *
+ * Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used.
+ */
+int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
+{
+ rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor;
+ int status;
+
+ /* try SP4_MACH_CRED if krb5i/p */
+ if (authflavor == RPC_AUTH_GSS_KRB5I ||
+ authflavor == RPC_AUTH_GSS_KRB5P) {
+ status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
+ if (!status)
+ return 0;
+ }
+
+ /* try SP4_NONE */
+ return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
+}
+
static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
struct rpc_cred *cred)
{
@@ -6028,6 +6411,7 @@ static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
int status;
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ trace_nfs4_destroy_clientid(clp, status);
if (status)
dprintk("NFS: Got error %d from the server %s on "
"DESTROY_CLIENTID.", status, clp->cl_hostname);
@@ -6065,7 +6449,7 @@ int nfs4_destroy_clientid(struct nfs_client *clp)
goto out;
if (clp->cl_preserve_clid)
goto out;
- cred = nfs4_get_exchange_id_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
ret = nfs4_proc_destroy_clientid(clp, cred);
if (cred)
put_rpccred(cred);
@@ -6157,7 +6541,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
};
int status;
- nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
+ nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
nfs4_set_sequence_privileged(&args.la_seq_args);
dprintk("--> %s\n", __func__);
task = rpc_run_task(&task_setup);
@@ -6291,6 +6675,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ trace_nfs4_create_session(clp, status);
if (!status) {
/* Verify the session's negotiated channel_attrs values */
@@ -6354,6 +6739,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
return status;
status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ trace_nfs4_destroy_session(session->clp, status);
if (status)
dprintk("NFS: Got error %d from the server on DESTROY_SESSION. "
@@ -6403,6 +6789,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
return;
+ trace_nfs4_sequence(clp, task->tk_status);
if (task->tk_status < 0) {
dprintk("%s ERROR %d\n", __func__, task->tk_status);
if (atomic_read(&clp->cl_count) == 1)
@@ -6460,7 +6847,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
nfs_put_client(clp);
return ERR_PTR(-ENOMEM);
}
- nfs41_init_sequence(&calldata->args, &calldata->res, 0);
+ nfs4_init_sequence(&calldata->args, &calldata->res, 0);
if (is_privileged)
nfs4_set_sequence_privileged(&calldata->args);
msg.rpc_argp = &calldata->args;
@@ -6555,6 +6942,7 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
if (!nfs41_sequence_done(task, res))
return;
+ trace_nfs4_reclaim_complete(clp, task->tk_status);
if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
rpc_restart_call_prepare(task);
return;
@@ -6602,7 +6990,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
calldata->clp = clp;
calldata->arg.one_fs = 0;
- nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0);
+ nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0);
nfs4_set_sequence_privileged(&calldata->arg.seq_args);
msg.rpc_argp = &calldata->arg;
msg.rpc_resp = &calldata->res;
@@ -6793,7 +7181,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
lgp->res.layoutp = &lgp->args.layout;
lgp->res.seq_res.sr_slot = NULL;
- nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
+ nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
/* nfs4_layoutget_release calls pnfs_put_layout_hdr */
pnfs_get_layout_hdr(NFS_I(inode)->layout);
@@ -6804,6 +7192,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
status = nfs4_wait_for_completion_rpc_task(task);
if (status == 0)
status = task->tk_status;
+ trace_nfs4_layoutget(lgp->args.ctx,
+ &lgp->args.range,
+ &lgp->res.range,
+ status);
/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
if (status == 0 && lgp->res.layoutp->len)
lseg = pnfs_layout_process(lgp);
@@ -6876,7 +7268,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
.rpc_cred = lrp->cred,
};
struct rpc_task_setup task_setup_data = {
- .rpc_client = lrp->clp->cl_rpcclient,
+ .rpc_client = NFS_SERVER(lrp->args.inode)->client,
.rpc_message = &msg,
.callback_ops = &nfs4_layoutreturn_call_ops,
.callback_data = lrp,
@@ -6884,11 +7276,12 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
int status;
dprintk("--> %s\n", __func__);
- nfs41_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
+ nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
status = task->tk_status;
+ trace_nfs4_layoutreturn(lrp->args.inode, status);
dprintk("<-- %s status=%d\n", __func__, status);
rpc_put_task(task);
return status;
@@ -7065,7 +7458,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
data->args.lastbytewritten,
data->args.inode->i_ino);
- nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -7075,15 +7468,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
if (status != 0)
goto out;
status = task->tk_status;
+ trace_nfs4_layoutcommit(data->args.inode, status);
out:
dprintk("%s: status %d\n", __func__, status);
rpc_put_task(task);
return status;
}
+/**
+ * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if
+ * possible) as per RFC3530bis and RFC5661 Security Considerations sections
+ */
static int
_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
+ struct nfs_fsinfo *info,
+ struct nfs4_secinfo_flavors *flavors, bool use_integrity)
{
struct nfs41_secinfo_no_name_args args = {
.style = SECINFO_STYLE_CURRENT_FH,
@@ -7096,7 +7495,23 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_argp = &args,
.rpc_resp = &res,
};
- return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+ struct rpc_clnt *clnt = server->client;
+ int status;
+
+ if (use_integrity) {
+ clnt = server->nfs_client->cl_rpcclient;
+ msg.rpc_cred = nfs4_get_clid_cred(server->nfs_client);
+ }
+
+ dprintk("--> %s\n", __func__);
+ status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
+ &res.seq_res, 0);
+ dprintk("<-- %s status=%d\n", __func__, status);
+
+ if (msg.rpc_cred)
+ put_rpccred(msg.rpc_cred);
+
+ return status;
}
static int
@@ -7106,7 +7521,24 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs4_exception exception = { };
int err;
do {
- err = _nfs41_proc_secinfo_no_name(server, fhandle, info, flavors);
+ /* first try using integrity protection */
+ err = -NFS4ERR_WRONGSEC;
+
+ /* try to use integrity protection with machine cred */
+ if (_nfs4_is_integrity_protected(server->nfs_client))
+ err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
+ flavors, true);
+
+ /*
+ * if unable to use integrity protection, or SECINFO with
+ * integrity protection returns NFS4ERR_WRONGSEC (which is
+ * disallowed by spec, but exists in deployed servers) use
+ * the current filesystem's rpc_client and the user cred.
+ */
+ if (err == -NFS4ERR_WRONGSEC)
+ err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
+ flavors, false);
+
switch (err) {
case 0:
case -NFS4ERR_WRONGSEC:
@@ -7176,11 +7608,15 @@ static int _nfs41_test_stateid(struct nfs_server *server,
.rpc_resp = &res,
.rpc_cred = cred,
};
+ struct rpc_clnt *rpc_client = server->client;
+
+ nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
+ &rpc_client, &msg);
dprintk("NFS call test_stateid %p\n", stateid);
- nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
nfs4_set_sequence_privileged(&args.seq_args);
- status = nfs4_call_sync_sequence(server->client, server, &msg,
+ status = nfs4_call_sync_sequence(rpc_client, server, &msg,
&args.seq_args, &res.seq_res);
if (status != NFS_OK) {
dprintk("NFS reply test_stateid: failed, %d\n", status);
@@ -7249,7 +7685,7 @@ static void nfs41_free_stateid_release(void *calldata)
kfree(calldata);
}
-const struct rpc_call_ops nfs41_free_stateid_ops = {
+static const struct rpc_call_ops nfs41_free_stateid_ops = {
.rpc_call_prepare = nfs41_free_stateid_prepare,
.rpc_call_done = nfs41_free_stateid_done,
.rpc_release = nfs41_free_stateid_release,
@@ -7272,6 +7708,9 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
};
struct nfs_free_stateid_data *data;
+ nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID,
+ &task_setup.rpc_client, &msg);
+
dprintk("NFS call free_stateid %p\n", stateid);
data = kmalloc(sizeof(*data), GFP_NOFS);
if (!data)
@@ -7283,7 +7722,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
msg.rpc_argp = &data->args;
msg.rpc_resp = &data->res;
- nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
if (privileged)
nfs4_set_sequence_privileged(&data->args.seq_args);
@@ -7359,7 +7798,6 @@ static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
.recover_open = nfs4_open_reclaim,
.recover_lock = nfs4_lock_reclaim,
.establish_clid = nfs4_init_clientid,
- .get_clid_cred = nfs4_get_setclientid_cred,
.detect_trunking = nfs40_discover_server_trunking,
};
@@ -7370,7 +7808,6 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
.recover_open = nfs4_open_reclaim,
.recover_lock = nfs4_lock_reclaim,
.establish_clid = nfs41_init_clientid,
- .get_clid_cred = nfs4_get_exchange_id_cred,
.reclaim_complete = nfs41_proc_reclaim_complete,
.detect_trunking = nfs41_discover_server_trunking,
};
@@ -7382,7 +7819,6 @@ static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
.recover_open = nfs4_open_expired,
.recover_lock = nfs4_lock_expired,
.establish_clid = nfs4_init_clientid,
- .get_clid_cred = nfs4_get_setclientid_cred,
};
#if defined(CONFIG_NFS_V4_1)
@@ -7392,7 +7828,6 @@ static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
.recover_open = nfs41_open_expired,
.recover_lock = nfs41_lock_expired,
.establish_clid = nfs41_init_clientid,
- .get_clid_cred = nfs4_get_exchange_id_cred,
};
#endif /* CONFIG_NFS_V4_1 */
@@ -7416,10 +7851,12 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
| NFS_CAP_ATOMIC_OPEN
| NFS_CAP_CHANGE_ATTR
| NFS_CAP_POSIX_LOCK,
- .call_sync = _nfs4_call_sync,
+ .init_client = nfs40_init_client,
+ .shutdown_client = nfs40_shutdown_client,
.match_stateid = nfs4_match_stateid,
.find_root_sec = nfs4_find_root_sec,
.free_lock_state = nfs4_release_lockowner,
+ .call_sync_ops = &nfs40_call_sync_ops,
.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
.state_renewal_ops = &nfs40_state_renewal_ops,
@@ -7434,10 +7871,12 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
| NFS_CAP_ATOMIC_OPEN_V1,
- .call_sync = nfs4_call_sync_sequence,
+ .init_client = nfs41_init_client,
+ .shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
.find_root_sec = nfs41_find_root_sec,
.free_lock_state = nfs41_free_lock_state,
+ .call_sync_ops = &nfs41_call_sync_ops,
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
.state_renewal_ops = &nfs41_state_renewal_ops,
@@ -7453,10 +7892,12 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
| NFS_CAP_ATOMIC_OPEN_V1,
- .call_sync = nfs4_call_sync_sequence,
+ .init_client = nfs41_init_client,
+ .shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
.find_root_sec = nfs41_find_root_sec,
.free_lock_state = nfs41_free_lock_state,
+ .call_sync_ops = &nfs41_call_sync_ops,
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
.state_renewal_ops = &nfs41_state_renewal_ops,
@@ -7473,7 +7914,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
#endif
};
-const struct inode_operations nfs4_dir_inode_operations = {
+static const struct inode_operations nfs4_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
.atomic_open = nfs_atomic_open,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index 36e21cb29d65..cf883c7ae053 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -23,6 +23,14 @@
#define NFSDBG_FACILITY NFSDBG_STATE
+static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue)
+{
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
+ spin_lock_init(&tbl->slot_tbl_lock);
+ rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue);
+ init_completion(&tbl->complete);
+}
+
/*
* nfs4_shrink_slot_table - free retired slots from the slot table
*/
@@ -44,6 +52,17 @@ static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize)
}
}
+/**
+ * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete
+ * @tbl - controlling slot table
+ *
+ */
+void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
+{
+ if (nfs4_slot_tbl_draining(tbl))
+ complete(&tbl->complete);
+}
+
/*
* nfs4_free_slot - free a slot and efficiently update slot table.
*
@@ -76,7 +95,7 @@ void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
nfs4_slot_tbl_drain_complete(tbl);
}
}
- dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
+ dprintk("%s: slotid %u highest_used_slotid %u\n", __func__,
slotid, tbl->highest_used_slotid);
}
@@ -146,9 +165,9 @@ struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
ret->generation = tbl->generation;
out:
- dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n",
+ dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n",
__func__, tbl->used_slots[0], tbl->highest_used_slotid,
- !IS_ERR(ret) ? ret->slot_nr : -1);
+ !IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT);
return ret;
}
@@ -191,7 +210,7 @@ static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl,
{
int ret;
- dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
+ dprintk("--> %s: max_reqs=%u, tbl->max_slots %u\n", __func__,
max_reqs, tbl->max_slots);
if (max_reqs > NFS4_MAX_SLOT_TABLE)
@@ -205,18 +224,36 @@ static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl,
nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue);
spin_unlock(&tbl->slot_tbl_lock);
- dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
+ dprintk("%s: tbl=%p slots=%p max_slots=%u\n", __func__,
tbl, tbl->slots, tbl->max_slots);
out:
dprintk("<-- %s: return %d\n", __func__, ret);
return ret;
}
-/* Destroy the slot table */
-static void nfs4_destroy_slot_tables(struct nfs4_session *session)
+/**
+ * nfs4_release_slot_table - release resources attached to a slot table
+ * @tbl: slot table to shut down
+ *
+ */
+void nfs4_release_slot_table(struct nfs4_slot_table *tbl)
+{
+ nfs4_shrink_slot_table(tbl, 0);
+}
+
+/**
+ * nfs4_setup_slot_table - prepare a stand-alone slot table for use
+ * @tbl: slot table to set up
+ * @max_reqs: maximum number of requests allowed
+ * @queue: name to give RPC wait queue
+ *
+ * Returns zero on success, or a negative errno.
+ */
+int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs,
+ const char *queue)
{
- nfs4_shrink_slot_table(&session->fc_slot_table, 0);
- nfs4_shrink_slot_table(&session->bc_slot_table, 0);
+ nfs4_init_slot_table(tbl, queue);
+ return nfs4_realloc_slot_table(tbl, max_reqs, 0);
}
static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
@@ -273,6 +310,8 @@ void nfs41_wake_slot_table(struct nfs4_slot_table *tbl)
}
}
+#if defined(CONFIG_NFS_V4_1)
+
static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl,
u32 target_highest_slotid)
{
@@ -383,6 +422,12 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
spin_unlock(&tbl->slot_tbl_lock);
}
+static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
+{
+ nfs4_release_slot_table(&session->fc_slot_table);
+ nfs4_release_slot_table(&session->bc_slot_table);
+}
+
/*
* Initialize or reset the forechannel and backchannel tables
*/
@@ -405,31 +450,20 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
if (status && tbl->slots == NULL)
/* Fore and back channel share a connection so get
* both slot tables or neither */
- nfs4_destroy_slot_tables(ses);
+ nfs4_destroy_session_slot_tables(ses);
return status;
}
struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
{
struct nfs4_session *session;
- struct nfs4_slot_table *tbl;
session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
if (!session)
return NULL;
- tbl = &session->fc_slot_table;
- tbl->highest_used_slotid = NFS4_NO_SLOT;
- spin_lock_init(&tbl->slot_tbl_lock);
- rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
- init_completion(&tbl->complete);
-
- tbl = &session->bc_slot_table;
- tbl->highest_used_slotid = NFS4_NO_SLOT;
- spin_lock_init(&tbl->slot_tbl_lock);
- rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
- init_completion(&tbl->complete);
-
+ nfs4_init_slot_table(&session->fc_slot_table, "ForeChannel Slot table");
+ nfs4_init_slot_table(&session->bc_slot_table, "BackChannel Slot table");
session->session_state = 1<<NFS4_SESSION_INITING;
session->clp = clp;
@@ -441,7 +475,7 @@ void nfs4_destroy_session(struct nfs4_session *session)
struct rpc_xprt *xprt;
struct rpc_cred *cred;
- cred = nfs4_get_exchange_id_cred(session->clp);
+ cred = nfs4_get_clid_cred(session->clp);
nfs4_proc_destroy_session(session, cred);
if (cred)
put_rpccred(cred);
@@ -452,7 +486,7 @@ void nfs4_destroy_session(struct nfs4_session *session)
dprintk("%s Destroy backchannel for xprt %p\n",
__func__, xprt);
xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
- nfs4_destroy_slot_tables(session);
+ nfs4_destroy_session_slot_tables(session);
kfree(session);
}
@@ -513,4 +547,4 @@ int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
}
EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
-
+#endif /* defined(CONFIG_NFS_V4_1) */
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 3a153d82b90c..232306100651 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -8,7 +8,7 @@
#define __LINUX_FS_NFS_NFS4SESSION_H
/* maximum number of slots to use */
-#define NFS4_DEF_SLOT_TABLE_SIZE (16U)
+#define NFS4_DEF_SLOT_TABLE_SIZE (64U)
#define NFS4_MAX_SLOT_TABLE (1024U)
#define NFS4_NO_SLOT ((u32)-1)
@@ -72,10 +72,22 @@ enum nfs4_session_state {
NFS4_SESSION_INITING,
};
-#if defined(CONFIG_NFS_V4_1)
+extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
+ unsigned int max_reqs, const char *queue);
+extern void nfs4_release_slot_table(struct nfs4_slot_table *tbl);
extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
+extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
+bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot);
+void nfs41_wake_slot_table(struct nfs4_slot_table *tbl);
+
+static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
+{
+ return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
+}
+#if defined(CONFIG_NFS_V4_1)
extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
u32 target_highest_slotid);
extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
@@ -89,17 +101,6 @@ extern void nfs4_destroy_session(struct nfs4_session *session);
extern int nfs4_init_session(struct nfs_client *clp);
extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
-extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
-
-static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
-{
- return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
-}
-
-bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
- struct nfs4_slot *slot);
-void nfs41_wake_slot_table(struct nfs4_slot_table *tbl);
-
/*
* Determine if sessions are in use.
*/
@@ -117,6 +118,16 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
return 0;
}
+#ifdef CONFIG_CRC32
+/*
+ * nfs_session_id_hash - calculate the crc32 hash for the session id
+ * @session - pointer to session
+ */
+#define nfs_session_id_hash(sess_id) \
+ (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data)))
+#else
+#define nfs_session_id_hash(session) (0)
+#endif
#else /* defined(CONFIG_NFS_V4_1) */
static inline int nfs4_init_session(struct nfs_client *clp)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e22862f13564..cc14cbb78b73 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -154,6 +154,19 @@ struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
return cred;
}
+static void nfs4_root_machine_cred(struct nfs_client *clp)
+{
+ struct rpc_cred *cred, *new;
+
+ new = rpc_lookup_machine_cred(NULL);
+ spin_lock(&clp->cl_lock);
+ cred = clp->cl_machine_cred;
+ clp->cl_machine_cred = new;
+ spin_unlock(&clp->cl_lock);
+ if (cred != NULL)
+ put_rpccred(cred);
+}
+
static struct rpc_cred *
nfs4_get_renew_cred_server_locked(struct nfs_server *server)
{
@@ -202,32 +215,6 @@ out:
return cred;
}
-#if defined(CONFIG_NFS_V4_1)
-
-static int nfs41_setup_state_renewal(struct nfs_client *clp)
-{
- int status;
- struct nfs_fsinfo fsinfo;
-
- if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
- nfs4_schedule_state_renewal(clp);
- return 0;
- }
-
- status = nfs4_proc_get_lease_time(clp, &fsinfo);
- if (status == 0) {
- /* Update lease time and schedule renewal */
- spin_lock(&clp->cl_lock);
- clp->cl_lease_time = fsinfo.lease_time * HZ;
- clp->cl_last_renewal = jiffies;
- spin_unlock(&clp->cl_lock);
-
- nfs4_schedule_state_renewal(clp);
- }
-
- return status;
-}
-
static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
{
if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
@@ -241,20 +228,18 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
{
struct nfs4_session *ses = clp->cl_session;
+ if (clp->cl_slot_tbl) {
+ nfs4_end_drain_slot_table(clp->cl_slot_tbl);
+ return;
+ }
+
if (ses != NULL) {
nfs4_end_drain_slot_table(&ses->bc_slot_table);
nfs4_end_drain_slot_table(&ses->fc_slot_table);
}
}
-/*
- * Signal state manager thread if session fore channel is drained
- */
-void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
-{
- if (nfs4_slot_tbl_draining(tbl))
- complete(&tbl->complete);
-}
+#if defined(CONFIG_NFS_V4_1)
static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
{
@@ -274,6 +259,9 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
struct nfs4_session *ses = clp->cl_session;
int ret = 0;
+ if (clp->cl_slot_tbl)
+ return nfs4_drain_slot_tbl(clp->cl_slot_tbl);
+
/* back channel */
ret = nfs4_drain_slot_tbl(&ses->bc_slot_table);
if (ret)
@@ -282,6 +270,30 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
return nfs4_drain_slot_tbl(&ses->fc_slot_table);
}
+static int nfs41_setup_state_renewal(struct nfs_client *clp)
+{
+ int status;
+ struct nfs_fsinfo fsinfo;
+
+ if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
+ nfs4_schedule_state_renewal(clp);
+ return 0;
+ }
+
+ status = nfs4_proc_get_lease_time(clp, &fsinfo);
+ if (status == 0) {
+ /* Update lease time and schedule renewal */
+ spin_lock(&clp->cl_lock);
+ clp->cl_lease_time = fsinfo.lease_time * HZ;
+ clp->cl_last_renewal = jiffies;
+ spin_unlock(&clp->cl_lock);
+
+ nfs4_schedule_state_renewal(clp);
+ }
+
+ return status;
+}
+
static void nfs41_finish_session_reset(struct nfs_client *clp)
{
clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
@@ -339,62 +351,21 @@ int nfs41_discover_server_trunking(struct nfs_client *clp,
return nfs41_walk_client_list(clp, result, cred);
}
-struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp)
-{
- struct rpc_cred *cred;
-
- spin_lock(&clp->cl_lock);
- cred = nfs4_get_machine_cred_locked(clp);
- spin_unlock(&clp->cl_lock);
- return cred;
-}
-
#endif /* CONFIG_NFS_V4_1 */
-static struct rpc_cred *
-nfs4_get_setclientid_cred_server(struct nfs_server *server)
-{
- struct nfs_client *clp = server->nfs_client;
- struct rpc_cred *cred = NULL;
- struct nfs4_state_owner *sp;
- struct rb_node *pos;
-
- spin_lock(&clp->cl_lock);
- pos = rb_first(&server->state_owners);
- if (pos != NULL) {
- sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);
- cred = get_rpccred(sp->so_cred);
- }
- spin_unlock(&clp->cl_lock);
- return cred;
-}
-
/**
- * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation
+ * nfs4_get_clid_cred - Acquire credential for a setclientid operation
* @clp: client state handle
*
* Returns an rpc_cred with reference count bumped, or NULL.
*/
-struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
+struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp)
{
- struct nfs_server *server;
struct rpc_cred *cred;
spin_lock(&clp->cl_lock);
cred = nfs4_get_machine_cred_locked(clp);
spin_unlock(&clp->cl_lock);
- if (cred != NULL)
- goto out;
-
- rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- cred = nfs4_get_setclientid_cred_server(server);
- if (cred != NULL)
- break;
- }
- rcu_read_unlock();
-
-out:
return cred;
}
@@ -998,7 +969,9 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
fl_pid = lockowner->l_pid;
spin_lock(&state->state_lock);
lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
- if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
+ if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+ ret = -EIO;
+ else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
nfs4_stateid_copy(dst, &lsp->ls_stateid);
ret = 0;
smp_rmb();
@@ -1038,11 +1011,17 @@ static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
fmode_t fmode, const struct nfs_lockowner *lockowner)
{
- int ret = 0;
+ int ret = nfs4_copy_lock_stateid(dst, state, lockowner);
+ if (ret == -EIO)
+ /* A lost lock - don't even consider delegations */
+ goto out;
if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
goto out;
- ret = nfs4_copy_lock_stateid(dst, state, lockowner);
if (ret != -ENOENT)
+ /* nfs4_copy_delegation_stateid() didn't over-write
+ * dst, so it still has the lock stateid which we now
+ * choose to use.
+ */
goto out;
ret = nfs4_copy_open_stateid(dst, state);
out:
@@ -1443,14 +1422,16 @@ restart:
if (status >= 0) {
status = nfs4_reclaim_locks(state, ops);
if (status >= 0) {
- spin_lock(&state->state_lock);
- list_for_each_entry(lock, &state->lock_states, ls_locks) {
- if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
- pr_warn_ratelimited("NFS: "
- "%s: Lock reclaim "
- "failed!\n", __func__);
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) {
+ spin_lock(&state->state_lock);
+ list_for_each_entry(lock, &state->lock_states, ls_locks) {
+ if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
+ pr_warn_ratelimited("NFS: "
+ "%s: Lock reclaim "
+ "failed!\n", __func__);
+ }
+ spin_unlock(&state->state_lock);
}
- spin_unlock(&state->state_lock);
nfs4_put_open_state(state);
spin_lock(&sp->so_lock);
goto restart;
@@ -1618,7 +1599,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
if (!nfs4_state_clear_reclaim_reboot(clp))
return;
ops = clp->cl_mvops->reboot_recovery_ops;
- cred = ops->get_clid_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
nfs4_reclaim_complete(clp, ops, cred);
put_rpccred(cred);
}
@@ -1732,7 +1713,7 @@ static int nfs4_check_lease(struct nfs_client *clp)
cred = ops->get_state_renewal_cred_locked(clp);
spin_unlock(&clp->cl_lock);
if (cred == NULL) {
- cred = nfs4_get_setclientid_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
status = -ENOKEY;
if (cred == NULL)
goto out;
@@ -1804,7 +1785,7 @@ static int nfs4_establish_lease(struct nfs_client *clp)
clp->cl_mvops->reboot_recovery_ops;
int status;
- cred = ops->get_clid_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
if (cred == NULL)
return -ENOENT;
status = ops->establish_clid(clp, cred);
@@ -1878,7 +1859,7 @@ int nfs4_discover_server_trunking(struct nfs_client *clp,
mutex_lock(&nfs_clid_init_mutex);
again:
status = -ENOENT;
- cred = ops->get_clid_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
if (cred == NULL)
goto out_unlock;
@@ -1896,7 +1877,11 @@ again:
__func__, status);
goto again;
case -EACCES:
- if (i++)
+ if (i++ == 0) {
+ nfs4_root_machine_cred(clp);
+ goto again;
+ }
+ if (i > 2)
break;
case -NFS4ERR_CLID_INUSE:
case -NFS4ERR_WRONGSEC:
@@ -2052,7 +2037,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
if (!nfs4_has_session(clp))
return 0;
nfs4_begin_drain_session(clp);
- cred = nfs4_get_exchange_id_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
status = nfs4_proc_destroy_session(clp->cl_session, cred);
switch (status) {
case 0:
@@ -2095,7 +2080,7 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
if (!nfs4_has_session(clp))
return 0;
nfs4_begin_drain_session(clp);
- cred = nfs4_get_exchange_id_cred(clp);
+ cred = nfs4_get_clid_cred(clp);
ret = nfs4_proc_bind_conn_to_session(clp, cred);
if (cred)
put_rpccred(cred);
@@ -2116,7 +2101,6 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
}
#else /* CONFIG_NFS_V4_1 */
static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
-static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
static int nfs4_bind_conn_to_session(struct nfs_client *clp)
{
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 5dbe2d269210..e26acdd1a645 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -253,8 +253,6 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name,
dfprintk(MOUNT, "--> nfs4_try_mount()\n");
- if (data->auth_flavors[0] == RPC_AUTH_MAXFLAVOR)
- data->auth_flavors[0] = RPC_AUTH_UNIX;
export_path = data->nfs_server.export_path;
data->nfs_server.export_path = "/";
root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info,
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
new file mode 100644
index 000000000000..d774335cc8bc
--- /dev/null
+++ b/fs/nfs/nfs4trace.c
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+#include "internal.h"
+#include "nfs4session.h"
+#include "callback.h"
+
+#define CREATE_TRACE_POINTS
+#include "nfs4trace.h"
+
+#ifdef CONFIG_NFS_V4_1
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds);
+#endif
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
new file mode 100644
index 000000000000..849cf146db30
--- /dev/null
+++ b/fs/nfs/nfs4trace.h
@@ -0,0 +1,1148 @@
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfs4
+
+#if !defined(_TRACE_NFS4_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NFS4_H
+
+#include <linux/tracepoint.h>
+
+#define show_nfsv4_errors(error) \
+ __print_symbolic(error, \
+ { NFS4_OK, "OK" }, \
+ /* Mapped by nfs4_stat_to_errno() */ \
+ { -EPERM, "EPERM" }, \
+ { -ENOENT, "ENOENT" }, \
+ { -EIO, "EIO" }, \
+ { -ENXIO, "ENXIO" }, \
+ { -EACCES, "EACCES" }, \
+ { -EEXIST, "EEXIST" }, \
+ { -EXDEV, "EXDEV" }, \
+ { -ENOTDIR, "ENOTDIR" }, \
+ { -EISDIR, "EISDIR" }, \
+ { -EFBIG, "EFBIG" }, \
+ { -ENOSPC, "ENOSPC" }, \
+ { -EROFS, "EROFS" }, \
+ { -EMLINK, "EMLINK" }, \
+ { -ENAMETOOLONG, "ENAMETOOLONG" }, \
+ { -ENOTEMPTY, "ENOTEMPTY" }, \
+ { -EDQUOT, "EDQUOT" }, \
+ { -ESTALE, "ESTALE" }, \
+ { -EBADHANDLE, "EBADHANDLE" }, \
+ { -EBADCOOKIE, "EBADCOOKIE" }, \
+ { -ENOTSUPP, "ENOTSUPP" }, \
+ { -ETOOSMALL, "ETOOSMALL" }, \
+ { -EREMOTEIO, "EREMOTEIO" }, \
+ { -EBADTYPE, "EBADTYPE" }, \
+ { -EAGAIN, "EAGAIN" }, \
+ { -ELOOP, "ELOOP" }, \
+ { -EOPNOTSUPP, "EOPNOTSUPP" }, \
+ { -EDEADLK, "EDEADLK" }, \
+ /* RPC errors */ \
+ { -ENOMEM, "ENOMEM" }, \
+ { -EKEYEXPIRED, "EKEYEXPIRED" }, \
+ { -ETIMEDOUT, "ETIMEDOUT" }, \
+ { -ERESTARTSYS, "ERESTARTSYS" }, \
+ { -ECONNREFUSED, "ECONNREFUSED" }, \
+ { -ECONNRESET, "ECONNRESET" }, \
+ { -ENETUNREACH, "ENETUNREACH" }, \
+ { -EHOSTUNREACH, "EHOSTUNREACH" }, \
+ { -EHOSTDOWN, "EHOSTDOWN" }, \
+ { -EPIPE, "EPIPE" }, \
+ { -EPFNOSUPPORT, "EPFNOSUPPORT" }, \
+ { -EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \
+ /* NFSv4 native errors */ \
+ { -NFS4ERR_ACCESS, "ACCESS" }, \
+ { -NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \
+ { -NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \
+ { -NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \
+ { -NFS4ERR_BADCHAR, "BADCHAR" }, \
+ { -NFS4ERR_BADHANDLE, "BADHANDLE" }, \
+ { -NFS4ERR_BADIOMODE, "BADIOMODE" }, \
+ { -NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \
+ { -NFS4ERR_BADLABEL, "BADLABEL" }, \
+ { -NFS4ERR_BADNAME, "BADNAME" }, \
+ { -NFS4ERR_BADOWNER, "BADOWNER" }, \
+ { -NFS4ERR_BADSESSION, "BADSESSION" }, \
+ { -NFS4ERR_BADSLOT, "BADSLOT" }, \
+ { -NFS4ERR_BADTYPE, "BADTYPE" }, \
+ { -NFS4ERR_BADXDR, "BADXDR" }, \
+ { -NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \
+ { -NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \
+ { -NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \
+ { -NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \
+ { -NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \
+ { -NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \
+ { -NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
+ { -NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \
+ { -NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \
+ { -NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \
+ { -NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \
+ "CONN_NOT_BOUND_TO_SESSION" }, \
+ { -NFS4ERR_DEADLOCK, "DEADLOCK" }, \
+ { -NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \
+ { -NFS4ERR_DELAY, "DELAY" }, \
+ { -NFS4ERR_DELEG_ALREADY_WANTED, \
+ "DELEG_ALREADY_WANTED" }, \
+ { -NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \
+ { -NFS4ERR_DENIED, "DENIED" }, \
+ { -NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \
+ { -NFS4ERR_DQUOT, "DQUOT" }, \
+ { -NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \
+ { -NFS4ERR_EXIST, "EXIST" }, \
+ { -NFS4ERR_EXPIRED, "EXPIRED" }, \
+ { -NFS4ERR_FBIG, "FBIG" }, \
+ { -NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \
+ { -NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \
+ { -NFS4ERR_GRACE, "GRACE" }, \
+ { -NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \
+ { -NFS4ERR_INVAL, "INVAL" }, \
+ { -NFS4ERR_IO, "IO" }, \
+ { -NFS4ERR_ISDIR, "ISDIR" }, \
+ { -NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \
+ { -NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \
+ { -NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \
+ { -NFS4ERR_LOCKED, "LOCKED" }, \
+ { -NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \
+ { -NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \
+ { -NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \
+ { -NFS4ERR_MLINK, "MLINK" }, \
+ { -NFS4ERR_MOVED, "MOVED" }, \
+ { -NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \
+ { -NFS4ERR_NOENT, "NOENT" }, \
+ { -NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \
+ { -NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \
+ { -NFS4ERR_NOSPC, "NOSPC" }, \
+ { -NFS4ERR_NOTDIR, "NOTDIR" }, \
+ { -NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \
+ { -NFS4ERR_NOTSUPP, "NOTSUPP" }, \
+ { -NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \
+ { -NFS4ERR_NOT_SAME, "NOT_SAME" }, \
+ { -NFS4ERR_NO_GRACE, "NO_GRACE" }, \
+ { -NFS4ERR_NXIO, "NXIO" }, \
+ { -NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \
+ { -NFS4ERR_OPENMODE, "OPENMODE" }, \
+ { -NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \
+ { -NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \
+ { -NFS4ERR_PERM, "PERM" }, \
+ { -NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \
+ { -NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \
+ { -NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \
+ { -NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \
+ { -NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \
+ { -NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \
+ { -NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \
+ { -NFS4ERR_REP_TOO_BIG_TO_CACHE, \
+ "REP_TOO_BIG_TO_CACHE" }, \
+ { -NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \
+ { -NFS4ERR_RESOURCE, "RESOURCE" }, \
+ { -NFS4ERR_RESTOREFH, "RESTOREFH" }, \
+ { -NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \
+ { -NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \
+ { -NFS4ERR_ROFS, "ROFS" }, \
+ { -NFS4ERR_SAME, "SAME" }, \
+ { -NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \
+ { -NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \
+ { -NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \
+ { -NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \
+ { -NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \
+ { -NFS4ERR_STALE, "STALE" }, \
+ { -NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \
+ { -NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \
+ { -NFS4ERR_SYMLINK, "SYMLINK" }, \
+ { -NFS4ERR_TOOSMALL, "TOOSMALL" }, \
+ { -NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \
+ { -NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \
+ { -NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \
+ { -NFS4ERR_WRONGSEC, "WRONGSEC" }, \
+ { -NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \
+ { -NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \
+ { -NFS4ERR_XDEV, "XDEV" })
+
+#define show_open_flags(flags) \
+ __print_flags(flags, "|", \
+ { O_CREAT, "O_CREAT" }, \
+ { O_EXCL, "O_EXCL" }, \
+ { O_TRUNC, "O_TRUNC" }, \
+ { O_DIRECT, "O_DIRECT" })
+
+#define show_fmode_flags(mode) \
+ __print_flags(mode, "|", \
+ { ((__force unsigned long)FMODE_READ), "READ" }, \
+ { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
+ { ((__force unsigned long)FMODE_EXEC), "EXEC" })
+
+#define show_nfs_fattr_flags(valid) \
+ __print_flags((unsigned long)valid, "|", \
+ { NFS_ATTR_FATTR_TYPE, "TYPE" }, \
+ { NFS_ATTR_FATTR_MODE, "MODE" }, \
+ { NFS_ATTR_FATTR_NLINK, "NLINK" }, \
+ { NFS_ATTR_FATTR_OWNER, "OWNER" }, \
+ { NFS_ATTR_FATTR_GROUP, "GROUP" }, \
+ { NFS_ATTR_FATTR_RDEV, "RDEV" }, \
+ { NFS_ATTR_FATTR_SIZE, "SIZE" }, \
+ { NFS_ATTR_FATTR_FSID, "FSID" }, \
+ { NFS_ATTR_FATTR_FILEID, "FILEID" }, \
+ { NFS_ATTR_FATTR_ATIME, "ATIME" }, \
+ { NFS_ATTR_FATTR_MTIME, "MTIME" }, \
+ { NFS_ATTR_FATTR_CTIME, "CTIME" }, \
+ { NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \
+ { NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \
+ { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" })
+
+DECLARE_EVENT_CLASS(nfs4_clientid_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ int error
+ ),
+
+ TP_ARGS(clp, error),
+
+ TP_STRUCT__entry(
+ __string(dstaddr,
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR))
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __assign_str(dstaddr,
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR));
+ ),
+
+ TP_printk(
+ "error=%d (%s) dstaddr=%s",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ __get_str(dstaddr)
+ )
+);
+#define DEFINE_NFS4_CLIENTID_EVENT(name) \
+ DEFINE_EVENT(nfs4_clientid_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ int error \
+ ), \
+ TP_ARGS(clp, error))
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid_confirm);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew_async);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_exchange_id);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_create_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_clientid);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete);
+
+TRACE_EVENT(nfs4_setup_sequence,
+ TP_PROTO(
+ const struct nfs4_session *session,
+ const struct nfs4_sequence_args *args
+ ),
+ TP_ARGS(session, args),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, session)
+ __field(unsigned int, slot_nr)
+ __field(unsigned int, seq_nr)
+ __field(unsigned int, highest_used_slotid)
+ ),
+
+ TP_fast_assign(
+ const struct nfs4_slot *sa_slot = args->sa_slot;
+ __entry->session = nfs_session_id_hash(&session->sess_id);
+ __entry->slot_nr = sa_slot->slot_nr;
+ __entry->seq_nr = sa_slot->seq_nr;
+ __entry->highest_used_slotid =
+ sa_slot->table->highest_used_slotid;
+ ),
+ TP_printk(
+ "session=0x%08x slot_nr=%u seq_nr=%u "
+ "highest_used_slotid=%u",
+ __entry->session,
+ __entry->slot_nr,
+ __entry->seq_nr,
+ __entry->highest_used_slotid
+ )
+);
+
+#define show_nfs4_sequence_status_flags(status) \
+ __print_flags((unsigned long)status, "|", \
+ { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
+ { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING, \
+ "CB_GSS_CONTEXTS_EXPIRING" }, \
+ { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED, \
+ "CB_GSS_CONTEXTS_EXPIRED" }, \
+ { SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED, \
+ "EXPIRED_ALL_STATE_REVOKED" }, \
+ { SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED, \
+ "EXPIRED_SOME_STATE_REVOKED" }, \
+ { SEQ4_STATUS_ADMIN_STATE_REVOKED, \
+ "ADMIN_STATE_REVOKED" }, \
+ { SEQ4_STATUS_RECALLABLE_STATE_REVOKED, \
+ "RECALLABLE_STATE_REVOKED" }, \
+ { SEQ4_STATUS_LEASE_MOVED, "LEASE_MOVED" }, \
+ { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, \
+ "RESTART_RECLAIM_NEEDED" }, \
+ { SEQ4_STATUS_CB_PATH_DOWN_SESSION, \
+ "CB_PATH_DOWN_SESSION" }, \
+ { SEQ4_STATUS_BACKCHANNEL_FAULT, \
+ "BACKCHANNEL_FAULT" })
+
+TRACE_EVENT(nfs4_sequence_done,
+ TP_PROTO(
+ const struct nfs4_session *session,
+ const struct nfs4_sequence_res *res
+ ),
+ TP_ARGS(session, res),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, session)
+ __field(unsigned int, slot_nr)
+ __field(unsigned int, seq_nr)
+ __field(unsigned int, highest_slotid)
+ __field(unsigned int, target_highest_slotid)
+ __field(unsigned int, status_flags)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ const struct nfs4_slot *sr_slot = res->sr_slot;
+ __entry->session = nfs_session_id_hash(&session->sess_id);
+ __entry->slot_nr = sr_slot->slot_nr;
+ __entry->seq_nr = sr_slot->seq_nr;
+ __entry->highest_slotid = res->sr_highest_slotid;
+ __entry->target_highest_slotid =
+ res->sr_target_highest_slotid;
+ __entry->error = res->sr_status;
+ ),
+ TP_printk(
+ "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+ "highest_slotid=%u target_highest_slotid=%u "
+ "status_flags=%u (%s)",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ __entry->session,
+ __entry->slot_nr,
+ __entry->seq_nr,
+ __entry->highest_slotid,
+ __entry->target_highest_slotid,
+ __entry->status_flags,
+ show_nfs4_sequence_status_flags(__entry->status_flags)
+ )
+);
+
+struct cb_sequenceargs;
+struct cb_sequenceres;
+
+TRACE_EVENT(nfs4_cb_sequence,
+ TP_PROTO(
+ const struct cb_sequenceargs *args,
+ const struct cb_sequenceres *res,
+ __be32 status
+ ),
+ TP_ARGS(args, res, status),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, session)
+ __field(unsigned int, slot_nr)
+ __field(unsigned int, seq_nr)
+ __field(unsigned int, highest_slotid)
+ __field(unsigned int, cachethis)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->session = nfs_session_id_hash(&args->csa_sessionid);
+ __entry->slot_nr = args->csa_slotid;
+ __entry->seq_nr = args->csa_sequenceid;
+ __entry->highest_slotid = args->csa_highestslotid;
+ __entry->cachethis = args->csa_cachethis;
+ __entry->error = -be32_to_cpu(status);
+ ),
+
+ TP_printk(
+ "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u "
+ "highest_slotid=%u",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ __entry->session,
+ __entry->slot_nr,
+ __entry->seq_nr,
+ __entry->highest_slotid
+ )
+);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_open_event,
+ TP_PROTO(
+ const struct nfs_open_context *ctx,
+ int flags,
+ int error
+ ),
+
+ TP_ARGS(ctx, flags, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(unsigned int, flags)
+ __field(unsigned int, fmode)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, dir)
+ __string(name, ctx->dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ const struct nfs4_state *state = ctx->state;
+ const struct inode *inode = NULL;
+
+ __entry->error = error;
+ __entry->flags = flags;
+ __entry->fmode = (__force unsigned int)ctx->mode;
+ __entry->dev = ctx->dentry->d_sb->s_dev;
+ if (!IS_ERR(state))
+ inode = state->inode;
+ if (inode != NULL) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ } else {
+ __entry->fileid = 0;
+ __entry->fhandle = 0;
+ }
+ __entry->dir = NFS_FILEID(ctx->dentry->d_parent->d_inode);
+ __assign_str(name, ctx->dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%d (%s) flags=%d (%s) fmode=%s "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "name=%02x:%02x:%llu/%s",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ __entry->flags,
+ show_open_flags(__entry->flags),
+ show_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS4_OPEN_EVENT(name) \
+ DEFINE_EVENT(nfs4_open_event, name, \
+ TP_PROTO( \
+ const struct nfs_open_context *ctx, \
+ int flags, \
+ int error \
+ ), \
+ TP_ARGS(ctx, flags, error))
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim);
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired);
+DEFINE_NFS4_OPEN_EVENT(nfs4_open_file);
+
+TRACE_EVENT(nfs4_close,
+ TP_PROTO(
+ const struct nfs4_state *state,
+ const struct nfs_closeargs *args,
+ const struct nfs_closeres *res,
+ int error
+ ),
+
+ TP_ARGS(state, args, res, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->fmode = (__force unsigned int)state->state;
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu "
+ "fhandle=0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ __entry->fmode ? show_fmode_flags(__entry->fmode) :
+ "closed",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle
+ )
+);
+
+#define show_lock_cmd(type) \
+ __print_symbolic((int)type, \
+ { F_GETLK, "GETLK" }, \
+ { F_SETLK, "SETLK" }, \
+ { F_SETLKW, "SETLKW" })
+#define show_lock_type(type) \
+ __print_symbolic((int)type, \
+ { F_RDLCK, "RDLCK" }, \
+ { F_WRLCK, "WRLCK" }, \
+ { F_UNLCK, "UNLCK" })
+
+DECLARE_EVENT_CLASS(nfs4_lock_event,
+ TP_PROTO(
+ const struct file_lock *request,
+ const struct nfs4_state *state,
+ int cmd,
+ int error
+ ),
+
+ TP_ARGS(request, state, cmd, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(int, cmd)
+ __field(char, type)
+ __field(loff_t, start)
+ __field(loff_t, end)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->error = error;
+ __entry->cmd = cmd;
+ __entry->type = request->fl_type;
+ __entry->start = request->fl_start;
+ __entry->end = request->fl_end;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ ),
+
+ TP_printk(
+ "error=%d (%s) cmd=%s:%s range=%lld:%lld "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ show_lock_cmd(__entry->cmd),
+ show_lock_type(__entry->type),
+ (long long)__entry->start,
+ (long long)__entry->end,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle
+ )
+);
+
+#define DEFINE_NFS4_LOCK_EVENT(name) \
+ DEFINE_EVENT(nfs4_lock_event, name, \
+ TP_PROTO( \
+ const struct file_lock *request, \
+ const struct nfs4_state *state, \
+ int cmd, \
+ int error \
+ ), \
+ TP_ARGS(request, state, cmd, error))
+DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock);
+DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock);
+DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim);
+DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired);
+DEFINE_NFS4_LOCK_EVENT(nfs4_unlock);
+
+DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
+ TP_PROTO(
+ const struct inode *inode,
+ fmode_t fmode
+ ),
+
+ TP_ARGS(inode, fmode),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->fmode = (__force unsigned int)fmode;
+ ),
+
+ TP_printk(
+ "fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x",
+ show_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle
+ )
+);
+#define DEFINE_NFS4_SET_DELEGATION_EVENT(name) \
+ DEFINE_EVENT(nfs4_set_delegation_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ fmode_t fmode \
+ ), \
+ TP_ARGS(inode, fmode))
+DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation);
+DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation);
+
+TRACE_EVENT(nfs4_delegreturn_exit,
+ TP_PROTO(
+ const struct nfs4_delegreturnargs *args,
+ const struct nfs4_delegreturnres *res,
+ int error
+ ),
+
+ TP_ARGS(args, res, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = res->server->s_dev;
+ __entry->fhandle = nfs_fhandle_hash(args->fhandle);
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d (%s) dev=%02x:%02x fhandle=0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->fhandle
+ )
+);
+
+#ifdef CONFIG_NFS_V4_1
+DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
+ TP_PROTO(
+ const struct nfs4_state *state,
+ const struct nfs4_lock_state *lsp,
+ int error
+ ),
+
+ TP_ARGS(state, lsp, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->error = error;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle
+ )
+);
+
+#define DEFINE_NFS4_TEST_STATEID_EVENT(name) \
+ DEFINE_EVENT(nfs4_test_stateid_event, name, \
+ TP_PROTO( \
+ const struct nfs4_state *state, \
+ const struct nfs4_lock_state *lsp, \
+ int error \
+ ), \
+ TP_ARGS(state, lsp, error))
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_delegation_stateid);
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_open_stateid);
+DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_lock_stateid);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_lookup_event,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct qstr *name,
+ int error
+ ),
+
+ TP_ARGS(dir, name, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, error)
+ __field(u64, dir)
+ __string(name, name->name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = error;
+ __assign_str(name, name->name);
+ ),
+
+ TP_printk(
+ "error=%d (%s) name=%02x:%02x:%llu/%s",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS4_LOOKUP_EVENT(name) \
+ DEFINE_EVENT(nfs4_lookup_event, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct qstr *name, \
+ int error \
+ ), \
+ TP_ARGS(dir, name, error))
+
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_lookup);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_symlink);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_mkdir);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_mknod);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_remove);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_get_fs_locations);
+DEFINE_NFS4_LOOKUP_EVENT(nfs4_secinfo);
+
+TRACE_EVENT(nfs4_rename,
+ TP_PROTO(
+ const struct inode *olddir,
+ const struct qstr *oldname,
+ const struct inode *newdir,
+ const struct qstr *newname,
+ int error
+ ),
+
+ TP_ARGS(olddir, oldname, newdir, newname, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, error)
+ __field(u64, olddir)
+ __string(oldname, oldname->name)
+ __field(u64, newdir)
+ __string(newname, newname->name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = olddir->i_sb->s_dev;
+ __entry->olddir = NFS_FILEID(olddir);
+ __entry->newdir = NFS_FILEID(newdir);
+ __entry->error = error;
+ __assign_str(oldname, oldname->name);
+ __assign_str(newname, newname->name);
+ ),
+
+ TP_printk(
+ "error=%d (%s) oldname=%02x:%02x:%llu/%s "
+ "newname=%02x:%02x:%llu/%s",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->olddir,
+ __get_str(oldname),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->newdir,
+ __get_str(newname)
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs4_inode_event,
+ TP_PROTO(
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(inode, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle
+ )
+);
+
+#define DEFINE_NFS4_INODE_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ int error \
+ ), \
+ TP_ARGS(inode, error))
+
+DEFINE_NFS4_INODE_EVENT(nfs4_setattr);
+DEFINE_NFS4_INODE_EVENT(nfs4_access);
+DEFINE_NFS4_INODE_EVENT(nfs4_readlink);
+DEFINE_NFS4_INODE_EVENT(nfs4_readdir);
+DEFINE_NFS4_INODE_EVENT(nfs4_get_acl);
+DEFINE_NFS4_INODE_EVENT(nfs4_set_acl);
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label);
+DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label);
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation);
+DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn);
+
+DECLARE_EVENT_CLASS(nfs4_getattr_event,
+ TP_PROTO(
+ const struct nfs_server *server,
+ const struct nfs_fh *fhandle,
+ const struct nfs_fattr *fattr,
+ int error
+ ),
+
+ TP_ARGS(server, fhandle, fattr, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, valid)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = server->s_dev;
+ __entry->valid = fattr->valid;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ __entry->fileid = (fattr->valid & NFS_ATTR_FATTR_FILEID) ? fattr->fileid : 0;
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "valid=%s",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_nfs_fattr_flags(__entry->valid)
+ )
+);
+
+#define DEFINE_NFS4_GETATTR_EVENT(name) \
+ DEFINE_EVENT(nfs4_getattr_event, name, \
+ TP_PROTO( \
+ const struct nfs_server *server, \
+ const struct nfs_fh *fhandle, \
+ const struct nfs_fattr *fattr, \
+ int error \
+ ), \
+ TP_ARGS(server, fhandle, fattr, error))
+DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr);
+DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
+DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
+
+DECLARE_EVENT_CLASS(nfs4_idmap_event,
+ TP_PROTO(
+ const char *name,
+ int len,
+ u32 id,
+ int error
+ ),
+
+ TP_ARGS(name, len, id, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(u32, id)
+ __dynamic_array(char, name, len > 0 ? len + 1 : 1)
+ ),
+
+ TP_fast_assign(
+ if (len < 0)
+ len = 0;
+ __entry->error = error < 0 ? error : 0;
+ __entry->id = id;
+ memcpy(__get_dynamic_array(name), name, len);
+ ((char *)__get_dynamic_array(name))[len] = 0;
+ ),
+
+ TP_printk(
+ "error=%d id=%u name=%s",
+ __entry->error,
+ __entry->id,
+ __get_str(name)
+ )
+);
+#define DEFINE_NFS4_IDMAP_EVENT(name) \
+ DEFINE_EVENT(nfs4_idmap_event, name, \
+ TP_PROTO( \
+ const char *name, \
+ int len, \
+ u32 id, \
+ int error \
+ ), \
+ TP_ARGS(name, len, id, error))
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_name_to_uid);
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid);
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name);
+DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
+
+DECLARE_EVENT_CLASS(nfs4_read_event,
+ TP_PROTO(
+ const struct nfs_read_data *data,
+ int error
+ ),
+
+ TP_ARGS(data, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = data->header->inode;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->offset = data->args.offset;
+ __entry->count = data->args.count;
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%zu",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset,
+ __entry->count
+ )
+);
+#define DEFINE_NFS4_READ_EVENT(name) \
+ DEFINE_EVENT(nfs4_read_event, name, \
+ TP_PROTO( \
+ const struct nfs_read_data *data, \
+ int error \
+ ), \
+ TP_ARGS(data, error))
+DEFINE_NFS4_READ_EVENT(nfs4_read);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_write_event,
+ TP_PROTO(
+ const struct nfs_write_data *data,
+ int error
+ ),
+
+ TP_ARGS(data, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = data->header->inode;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->offset = data->args.offset;
+ __entry->count = data->args.count;
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%zu",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset,
+ __entry->count
+ )
+);
+
+#define DEFINE_NFS4_WRITE_EVENT(name) \
+ DEFINE_EVENT(nfs4_write_event, name, \
+ TP_PROTO( \
+ const struct nfs_write_data *data, \
+ int error \
+ ), \
+ TP_ARGS(data, error))
+DEFINE_NFS4_WRITE_EVENT(nfs4_write);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
+#endif /* CONFIG_NFS_V4_1 */
+
+DECLARE_EVENT_CLASS(nfs4_commit_event,
+ TP_PROTO(
+ const struct nfs_commit_data *data,
+ int error
+ ),
+
+ TP_ARGS(data, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = data->inode;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->offset = data->args.offset;
+ __entry->count = data->args.count;
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%zu",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (long long)__entry->offset,
+ __entry->count
+ )
+);
+#define DEFINE_NFS4_COMMIT_EVENT(name) \
+ DEFINE_EVENT(nfs4_commit_event, name, \
+ TP_PROTO( \
+ const struct nfs_commit_data *data, \
+ int error \
+ ), \
+ TP_ARGS(data, error))
+DEFINE_NFS4_COMMIT_EVENT(nfs4_commit);
+#ifdef CONFIG_NFS_V4_1
+DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds);
+
+#define show_pnfs_iomode(iomode) \
+ __print_symbolic(iomode, \
+ { IOMODE_READ, "READ" }, \
+ { IOMODE_RW, "RW" }, \
+ { IOMODE_ANY, "ANY" })
+
+TRACE_EVENT(nfs4_layoutget,
+ TP_PROTO(
+ const struct nfs_open_context *ctx,
+ const struct pnfs_layout_range *args,
+ const struct pnfs_layout_range *res,
+ int error
+ ),
+
+ TP_ARGS(ctx, args, res, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u32, iomode)
+ __field(u64, offset)
+ __field(u64, count)
+ __field(int, error)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = ctx->dentry->d_inode;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->iomode = args->iomode;
+ __entry->offset = args->offset;
+ __entry->count = args->length;
+ __entry->error = error;
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "iomode=%s offset=%llu count=%llu",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_pnfs_iomode(__entry->iomode),
+ (unsigned long long)__entry->offset,
+ (unsigned long long)__entry->count
+ )
+);
+
+DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
+DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+
+#endif /* CONFIG_NFS_V4_1 */
+
+#endif /* _TRACE_NFS4_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE nfs4trace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0abfb8466e79..fbdad9e1719f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -294,7 +294,9 @@ static int nfs4_stat_to_errno(int);
XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
1 /* flags */ + \
1 /* spa_how */ + \
- 0 /* SP4_NONE (for now) */ + \
+ /* max is SP4_MACH_CRED (for now) */ + \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
1 /* implementation id array of size 1 */ + \
1 /* nii_domain */ + \
XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
@@ -306,7 +308,9 @@ static int nfs4_stat_to_errno(int);
1 /* eir_sequenceid */ + \
1 /* eir_flags */ + \
1 /* spr_how */ + \
- 0 /* SP4_NONE (for now) */ + \
+ /* max is SP4_MACH_CRED (for now) */ + \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
+ 1 + NFS4_OP_MAP_NUM_WORDS + \
2 /* eir_server_owner.so_minor_id */ + \
/* eir_server_owner.so_major_id<> */ \
XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
@@ -997,11 +1001,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
int owner_namelen = 0;
int owner_grouplen = 0;
__be32 *p;
- __be32 *q;
- int len;
- uint32_t bmval0 = 0;
- uint32_t bmval1 = 0;
- uint32_t bmval2 = 0;
+ unsigned i;
+ uint32_t len = 0;
+ uint32_t bmval_len;
+ uint32_t bmval[3] = { 0 };
/*
* We reserve enough space to write the entire attribute buffer at once.
@@ -1010,13 +1013,14 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
* = 40 bytes, plus any contribution from variable-length fields
* such as owner/group.
*/
- len = 20;
-
- /* Sigh */
- if (iap->ia_valid & ATTR_SIZE)
+ if (iap->ia_valid & ATTR_SIZE) {
+ bmval[0] |= FATTR4_WORD0_SIZE;
len += 8;
- if (iap->ia_valid & ATTR_MODE)
+ }
+ if (iap->ia_valid & ATTR_MODE) {
+ bmval[1] |= FATTR4_WORD1_MODE;
len += 4;
+ }
if (iap->ia_valid & ATTR_UID) {
owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
if (owner_namelen < 0) {
@@ -1027,6 +1031,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
owner_namelen = sizeof("nobody") - 1;
/* goto out; */
}
+ bmval[1] |= FATTR4_WORD1_OWNER;
len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
}
if (iap->ia_valid & ATTR_GID) {
@@ -1038,86 +1043,73 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
owner_grouplen = sizeof("nobody") - 1;
/* goto out; */
}
+ bmval[1] |= FATTR4_WORD1_OWNER_GROUP;
len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
}
- if (label)
- len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
- if (iap->ia_valid & ATTR_ATIME_SET)
+ if (iap->ia_valid & ATTR_ATIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
len += 16;
- else if (iap->ia_valid & ATTR_ATIME)
+ } else if (iap->ia_valid & ATTR_ATIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
len += 4;
- if (iap->ia_valid & ATTR_MTIME_SET)
+ }
+ if (iap->ia_valid & ATTR_MTIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
len += 16;
- else if (iap->ia_valid & ATTR_MTIME)
+ } else if (iap->ia_valid & ATTR_MTIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
len += 4;
- p = reserve_space(xdr, len);
+ }
+ if (label) {
+ len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
+ bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
+ }
- /*
- * We write the bitmap length now, but leave the bitmap and the attribute
- * buffer length to be backfilled at the end of this routine.
- */
- *p++ = cpu_to_be32(3);
- q = p;
- p += 4;
+ if (bmval[2] != 0)
+ bmval_len = 3;
+ else if (bmval[1] != 0)
+ bmval_len = 2;
+ else
+ bmval_len = 1;
- if (iap->ia_valid & ATTR_SIZE) {
- bmval0 |= FATTR4_WORD0_SIZE;
+ p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len);
+
+ *p++ = cpu_to_be32(bmval_len);
+ for (i = 0; i < bmval_len; i++)
+ *p++ = cpu_to_be32(bmval[i]);
+ *p++ = cpu_to_be32(len);
+
+ if (bmval[0] & FATTR4_WORD0_SIZE)
p = xdr_encode_hyper(p, iap->ia_size);
- }
- if (iap->ia_valid & ATTR_MODE) {
- bmval1 |= FATTR4_WORD1_MODE;
+ if (bmval[1] & FATTR4_WORD1_MODE)
*p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
- }
- if (iap->ia_valid & ATTR_UID) {
- bmval1 |= FATTR4_WORD1_OWNER;
+ if (bmval[1] & FATTR4_WORD1_OWNER)
p = xdr_encode_opaque(p, owner_name, owner_namelen);
- }
- if (iap->ia_valid & ATTR_GID) {
- bmval1 |= FATTR4_WORD1_OWNER_GROUP;
+ if (bmval[1] & FATTR4_WORD1_OWNER_GROUP)
p = xdr_encode_opaque(p, owner_group, owner_grouplen);
+ if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+ if (iap->ia_valid & ATTR_ATIME_SET) {
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec);
+ *p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
+ } else
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
- if (iap->ia_valid & ATTR_ATIME_SET) {
- bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
- *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
- p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec);
- *p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
- }
- else if (iap->ia_valid & ATTR_ATIME) {
- bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
- *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
- }
- if (iap->ia_valid & ATTR_MTIME_SET) {
- bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
- *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
- p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec);
- *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
- }
- else if (iap->ia_valid & ATTR_MTIME) {
- bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
- *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
+ if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+ if (iap->ia_valid & ATTR_MTIME_SET) {
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec);
+ *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
+ } else
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
- if (label) {
- bmval2 |= FATTR4_WORD2_SECURITY_LABEL;
+ if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
*p++ = cpu_to_be32(label->lfs);
*p++ = cpu_to_be32(label->pi);
*p++ = cpu_to_be32(label->len);
p = xdr_encode_opaque_fixed(p, label->label, label->len);
}
- /*
- * Now we backfill the bitmap and the attribute buffer length.
- */
- if (len != ((char *)p - (char *)q) + 4) {
- printk(KERN_ERR "NFS: Attr length error, %u != %Zu\n",
- len, ((char *)p - (char *)q) + 4);
- BUG();
- }
- len = (char *)p - (char *)q - 16;
- *q++ = htonl(bmval0);
- *q++ = htonl(bmval1);
- *q++ = htonl(bmval2);
- *q = htonl(len);
-
/* out: */
}
@@ -1738,6 +1730,14 @@ static void encode_bind_conn_to_session(struct xdr_stream *xdr,
*p = 0; /* use_conn_in_rdma_mode = False */
}
+static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
+{
+ unsigned int i;
+ encode_uint32(xdr, NFS4_OP_MAP_NUM_WORDS);
+ for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++)
+ encode_uint32(xdr, op_map->u.words[i]);
+}
+
static void encode_exchange_id(struct xdr_stream *xdr,
struct nfs41_exchange_id_args *args,
struct compound_hdr *hdr)
@@ -1751,9 +1751,20 @@ static void encode_exchange_id(struct xdr_stream *xdr,
encode_string(xdr, args->id_len, args->id);
- p = reserve_space(xdr, 12);
- *p++ = cpu_to_be32(args->flags);
- *p++ = cpu_to_be32(0); /* zero length state_protect4_a */
+ encode_uint32(xdr, args->flags);
+ encode_uint32(xdr, args->state_protect.how);
+
+ switch (args->state_protect.how) {
+ case SP4_NONE:
+ break;
+ case SP4_MACH_CRED:
+ encode_op_map(xdr, &args->state_protect.enforce);
+ encode_op_map(xdr, &args->state_protect.allow);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
if (send_implementation_id &&
sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 &&
@@ -1764,7 +1775,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
utsname()->version, utsname()->machine);
if (len > 0) {
- *p = cpu_to_be32(1); /* implementation id array length=1 */
+ encode_uint32(xdr, 1); /* implementation id array length=1 */
encode_string(xdr,
sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1,
@@ -1775,7 +1786,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, 0);
*p = cpu_to_be32(0);
} else
- *p = cpu_to_be32(0); /* implementation id array length=0 */
+ encode_uint32(xdr, 0); /* implementation id array length=0 */
}
static void encode_create_session(struct xdr_stream *xdr,
@@ -1828,7 +1839,7 @@ static void encode_create_session(struct xdr_stream *xdr,
*p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */
/* authsys_parms rfc1831 */
- *p++ = (__be32)nn->boot_time.tv_nsec; /* stamp */
+ *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */
p = xdr_encode_opaque(p, machine_name, len);
*p++ = cpu_to_be32(0); /* UID */
*p++ = cpu_to_be32(0); /* GID */
@@ -1870,11 +1881,10 @@ static void encode_sequence(struct xdr_stream *xdr,
struct nfs4_slot *slot = args->sa_slot;
__be32 *p;
- if (slot == NULL)
- return;
-
tp = slot->table;
session = tp->session;
+ if (!session)
+ return;
encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);
@@ -2055,9 +2065,9 @@ static void encode_free_stateid(struct xdr_stream *xdr,
static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
{
#if defined(CONFIG_NFS_V4_1)
-
- if (args->sa_slot)
- return args->sa_slot->table->session->clp->cl_mvops->minor_version;
+ struct nfs4_session *session = args->sa_slot->table->session;
+ if (session)
+ return session->clp->cl_mvops->minor_version;
#endif /* CONFIG_NFS_V4_1 */
return 0;
}
@@ -4642,7 +4652,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
uint32_t *layouttype)
{
- uint32_t *p;
+ __be32 *p;
int num;
p = xdr_inline_decode(xdr, 4);
@@ -5387,6 +5397,23 @@ static int decode_secinfo_no_name(struct xdr_stream *xdr, struct nfs4_secinfo_re
return decode_secinfo_common(xdr, res);
}
+static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
+{
+ __be32 *p;
+ uint32_t bitmap_words;
+ unsigned int i;
+
+ p = xdr_inline_decode(xdr, 4);
+ bitmap_words = be32_to_cpup(p++);
+ if (bitmap_words > NFS4_OP_MAP_NUM_WORDS)
+ return -EIO;
+ p = xdr_inline_decode(xdr, 4 * bitmap_words);
+ for (i = 0; i < bitmap_words; i++)
+ op_map->u.words[i] = be32_to_cpup(p++);
+
+ return 0;
+}
+
static int decode_exchange_id(struct xdr_stream *xdr,
struct nfs41_exchange_id_res *res)
{
@@ -5410,10 +5437,22 @@ static int decode_exchange_id(struct xdr_stream *xdr,
res->seqid = be32_to_cpup(p++);
res->flags = be32_to_cpup(p++);
- /* We ask for SP4_NONE */
- dummy = be32_to_cpup(p);
- if (dummy != SP4_NONE)
+ res->state_protect.how = be32_to_cpup(p);
+ switch (res->state_protect.how) {
+ case SP4_NONE:
+ break;
+ case SP4_MACH_CRED:
+ status = decode_op_map(xdr, &res->state_protect.enforce);
+ if (status)
+ return status;
+ status = decode_op_map(xdr, &res->state_protect.allow);
+ if (status)
+ return status;
+ break;
+ default:
+ WARN_ON_ONCE(1);
return -EIO;
+ }
/* server_owner4.so_minor_id */
p = xdr_inline_decode(xdr, 8);
@@ -5607,6 +5646,8 @@ static int decode_sequence(struct xdr_stream *xdr,
if (res->sr_slot == NULL)
return 0;
+ if (!res->sr_slot->table->session)
+ return 0;
status = decode_op_hdr(xdr, OP_SEQUENCE);
if (!status)
diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c
new file mode 100644
index 000000000000..4eb0aead69b6
--- /dev/null
+++ b/fs/nfs/nfstrace.c
@@ -0,0 +1,9 @@
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/nfs_fs.h>
+#include <linux/namei.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include "nfstrace.h"
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
new file mode 100644
index 000000000000..89fe741e58b1
--- /dev/null
+++ b/fs/nfs/nfstrace.h
@@ -0,0 +1,729 @@
+/*
+ * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfs
+
+#if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NFS_H
+
+#include <linux/tracepoint.h>
+
+#define nfs_show_file_type(ftype) \
+ __print_symbolic(ftype, \
+ { DT_UNKNOWN, "UNKNOWN" }, \
+ { DT_FIFO, "FIFO" }, \
+ { DT_CHR, "CHR" }, \
+ { DT_DIR, "DIR" }, \
+ { DT_BLK, "BLK" }, \
+ { DT_REG, "REG" }, \
+ { DT_LNK, "LNK" }, \
+ { DT_SOCK, "SOCK" }, \
+ { DT_WHT, "WHT" })
+
+#define nfs_show_cache_validity(v) \
+ __print_flags(v, "|", \
+ { NFS_INO_INVALID_ATTR, "INVALID_ATTR" }, \
+ { NFS_INO_INVALID_DATA, "INVALID_DATA" }, \
+ { NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \
+ { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \
+ { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \
+ { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \
+ { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \
+ { NFS_INO_INVALID_LABEL, "INVALID_LABEL" })
+
+#define nfs_show_nfsi_flags(v) \
+ __print_flags(v, "|", \
+ { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
+ { 1 << NFS_INO_STALE, "STALE" }, \
+ { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
+ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
+ { 1 << NFS_INO_COMMIT, "COMMIT" }, \
+ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
+ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
+
+DECLARE_EVENT_CLASS(nfs_inode_event,
+ TP_PROTO(
+ const struct inode *inode
+ ),
+
+ TP_ARGS(inode),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(u64, version)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->version = inode->i_version;
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ (unsigned long long)__entry->version
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs_inode_event_done,
+ TP_PROTO(
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(inode, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(unsigned char, type)
+ __field(u64, fileid)
+ __field(u64, version)
+ __field(loff_t, size)
+ __field(unsigned long, nfsi_flags)
+ __field(unsigned long, cache_validity)
+ ),
+
+ TP_fast_assign(
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ __entry->error = error;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->type = nfs_umode_to_dtype(inode->i_mode);
+ __entry->version = inode->i_version;
+ __entry->size = i_size_read(inode);
+ __entry->nfsi_flags = nfsi->flags;
+ __entry->cache_validity = nfsi->cache_validity;
+ ),
+
+ TP_printk(
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "type=%u (%s) version=%llu size=%lld "
+ "cache_validity=%lu (%s) nfs_flags=%ld (%s)",
+ __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->type,
+ nfs_show_file_type(__entry->type),
+ (unsigned long long)__entry->version,
+ (long long)__entry->size,
+ __entry->cache_validity,
+ nfs_show_cache_validity(__entry->cache_validity),
+ __entry->nfsi_flags,
+ nfs_show_nfsi_flags(__entry->nfsi_flags)
+ )
+);
+
+#define DEFINE_NFS_INODE_EVENT(name) \
+ DEFINE_EVENT(nfs_inode_event, name, \
+ TP_PROTO( \
+ const struct inode *inode \
+ ), \
+ TP_ARGS(inode))
+#define DEFINE_NFS_INODE_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_inode_event_done, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ int error \
+ ), \
+ TP_ARGS(inode, error))
+DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_revalidate_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_invalidate_mapping_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_invalidate_mapping_exit);
+DEFINE_NFS_INODE_EVENT(nfs_getattr_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit);
+DEFINE_NFS_INODE_EVENT(nfs_setattr_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit);
+DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit);
+DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit);
+DEFINE_NFS_INODE_EVENT(nfs_fsync_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit);
+DEFINE_NFS_INODE_EVENT(nfs_access_enter);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_access_exit);
+
+#define show_lookup_flags(flags) \
+ __print_flags((unsigned long)flags, "|", \
+ { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \
+ { LOOKUP_DIRECTORY, "DIRECTORY" }, \
+ { LOOKUP_OPEN, "OPEN" }, \
+ { LOOKUP_CREATE, "CREATE" }, \
+ { LOOKUP_EXCL, "EXCL" })
+
+DECLARE_EVENT_CLASS(nfs_lookup_event,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags
+ ),
+
+ TP_ARGS(dir, dentry, flags),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "flags=%u (%s) name=%02x:%02x:%llu/%s",
+ __entry->flags,
+ show_lookup_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS_LOOKUP_EVENT(name) \
+ DEFINE_EVENT(nfs_lookup_event, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry, \
+ unsigned int flags \
+ ), \
+ TP_ARGS(dir, dentry, flags))
+
+DECLARE_EVENT_CLASS(nfs_lookup_event_done,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags,
+ int error
+ ),
+
+ TP_ARGS(dir, dentry, flags, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(unsigned int, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = error;
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s",
+ __entry->error,
+ __entry->flags,
+ show_lookup_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS_LOOKUP_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_lookup_event_done, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry, \
+ unsigned int flags, \
+ int error \
+ ), \
+ TP_ARGS(dir, dentry, flags, error))
+
+DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit);
+DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit);
+
+#define show_open_flags(flags) \
+ __print_flags((unsigned long)flags, "|", \
+ { O_CREAT, "O_CREAT" }, \
+ { O_EXCL, "O_EXCL" }, \
+ { O_TRUNC, "O_TRUNC" }, \
+ { O_APPEND, "O_APPEND" }, \
+ { O_DSYNC, "O_DSYNC" }, \
+ { O_DIRECT, "O_DIRECT" }, \
+ { O_DIRECTORY, "O_DIRECTORY" })
+
+#define show_fmode_flags(mode) \
+ __print_flags(mode, "|", \
+ { ((__force unsigned long)FMODE_READ), "READ" }, \
+ { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \
+ { ((__force unsigned long)FMODE_EXEC), "EXEC" })
+
+TRACE_EVENT(nfs_atomic_open_enter,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct nfs_open_context *ctx,
+ unsigned int flags
+ ),
+
+ TP_ARGS(dir, ctx, flags),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, flags)
+ __field(unsigned int, fmode)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, ctx->dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __entry->fmode = (__force unsigned int)ctx->mode;
+ __assign_str(name, ctx->dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "flags=%u (%s) fmode=%s name=%02x:%02x:%llu/%s",
+ __entry->flags,
+ show_open_flags(__entry->flags),
+ show_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfs_atomic_open_exit,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct nfs_open_context *ctx,
+ unsigned int flags,
+ int error
+ ),
+
+ TP_ARGS(dir, ctx, flags, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(unsigned int, flags)
+ __field(unsigned int, fmode)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, ctx->dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __entry->fmode = (__force unsigned int)ctx->mode;
+ __assign_str(name, ctx->dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%d flags=%u (%s) fmode=%s "
+ "name=%02x:%02x:%llu/%s",
+ __entry->error,
+ __entry->flags,
+ show_open_flags(__entry->flags),
+ show_fmode_flags(__entry->fmode),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfs_create_enter,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags
+ ),
+
+ TP_ARGS(dir, dentry, flags),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "flags=%u (%s) name=%02x:%02x:%llu/%s",
+ __entry->flags,
+ show_open_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfs_create_exit,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ unsigned int flags,
+ int error
+ ),
+
+ TP_ARGS(dir, dentry, flags, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(unsigned int, flags)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->flags = flags;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s",
+ __entry->error,
+ __entry->flags,
+ show_open_flags(__entry->flags),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs_directory_event,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry
+ ),
+
+ TP_ARGS(dir, dentry),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "name=%02x:%02x:%llu/%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS_DIRECTORY_EVENT(name) \
+ DEFINE_EVENT(nfs_directory_event, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry \
+ ), \
+ TP_ARGS(dir, dentry))
+
+DECLARE_EVENT_CLASS(nfs_directory_event_done,
+ TP_PROTO(
+ const struct inode *dir,
+ const struct dentry *dentry,
+ int error
+ ),
+
+ TP_ARGS(dir, dentry, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = error;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%d name=%02x:%02x:%llu/%s",
+ __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+#define DEFINE_NFS_DIRECTORY_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_directory_event_done, name, \
+ TP_PROTO( \
+ const struct inode *dir, \
+ const struct dentry *dentry, \
+ int error \
+ ), \
+ TP_ARGS(dir, dentry, error))
+
+DEFINE_NFS_DIRECTORY_EVENT(nfs_mknod_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mknod_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_mkdir_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mkdir_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_rmdir_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_rmdir_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_remove_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_remove_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_unlink_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_unlink_exit);
+DEFINE_NFS_DIRECTORY_EVENT(nfs_symlink_enter);
+DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_symlink_exit);
+
+TRACE_EVENT(nfs_link_enter,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct inode *dir,
+ const struct dentry *dentry
+ ),
+
+ TP_ARGS(inode, dir, dentry),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dir = NFS_FILEID(dir);
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->fileid,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfs_link_exit,
+ TP_PROTO(
+ const struct inode *inode,
+ const struct inode *dir,
+ const struct dentry *dentry,
+ int error
+ ),
+
+ TP_ARGS(inode, dir, dentry, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u64, dir)
+ __string(name, dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = error;
+ __assign_str(name, dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%d fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s",
+ __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->fileid,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs_rename_event,
+ TP_PROTO(
+ const struct inode *old_dir,
+ const struct dentry *old_dentry,
+ const struct inode *new_dir,
+ const struct dentry *new_dentry
+ ),
+
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, old_dir)
+ __field(u64, new_dir)
+ __string(old_name, old_dentry->d_name.name)
+ __string(new_name, new_dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = old_dir->i_sb->s_dev;
+ __entry->old_dir = NFS_FILEID(old_dir);
+ __entry->new_dir = NFS_FILEID(new_dir);
+ __assign_str(old_name, old_dentry->d_name.name);
+ __assign_str(new_name, new_dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "old_name=%02x:%02x:%llu/%s new_name=%02x:%02x:%llu/%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->old_dir,
+ __get_str(old_name),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->new_dir,
+ __get_str(new_name)
+ )
+);
+#define DEFINE_NFS_RENAME_EVENT(name) \
+ DEFINE_EVENT(nfs_rename_event, name, \
+ TP_PROTO( \
+ const struct inode *old_dir, \
+ const struct dentry *old_dentry, \
+ const struct inode *new_dir, \
+ const struct dentry *new_dentry \
+ ), \
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry))
+
+DECLARE_EVENT_CLASS(nfs_rename_event_done,
+ TP_PROTO(
+ const struct inode *old_dir,
+ const struct dentry *old_dentry,
+ const struct inode *new_dir,
+ const struct dentry *new_dentry,
+ int error
+ ),
+
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, error)
+ __field(u64, old_dir)
+ __string(old_name, old_dentry->d_name.name)
+ __field(u64, new_dir)
+ __string(new_name, new_dentry->d_name.name)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = old_dir->i_sb->s_dev;
+ __entry->old_dir = NFS_FILEID(old_dir);
+ __entry->new_dir = NFS_FILEID(new_dir);
+ __entry->error = error;
+ __assign_str(old_name, old_dentry->d_name.name);
+ __assign_str(new_name, new_dentry->d_name.name);
+ ),
+
+ TP_printk(
+ "error=%d old_name=%02x:%02x:%llu/%s "
+ "new_name=%02x:%02x:%llu/%s",
+ __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->old_dir,
+ __get_str(old_name),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->new_dir,
+ __get_str(new_name)
+ )
+);
+#define DEFINE_NFS_RENAME_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_rename_event_done, name, \
+ TP_PROTO( \
+ const struct inode *old_dir, \
+ const struct dentry *old_dentry, \
+ const struct inode *new_dir, \
+ const struct dentry *new_dentry, \
+ int error \
+ ), \
+ TP_ARGS(old_dir, old_dentry, new_dir, \
+ new_dentry, error))
+
+DEFINE_NFS_RENAME_EVENT(nfs_rename_enter);
+DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit);
+
+DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename);
+
+TRACE_EVENT(nfs_sillyrename_unlink,
+ TP_PROTO(
+ const struct nfs_unlinkdata *data,
+ int error
+ ),
+
+ TP_ARGS(data, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, error)
+ __field(u64, dir)
+ __dynamic_array(char, name, data->args.name.len + 1)
+ ),
+
+ TP_fast_assign(
+ struct inode *dir = data->dir;
+ size_t len = data->args.name.len;
+ __entry->dev = dir->i_sb->s_dev;
+ __entry->dir = NFS_FILEID(dir);
+ __entry->error = error;
+ memcpy(__get_dynamic_array(name),
+ data->args.name.name, len);
+ ((char *)__get_dynamic_array(name))[len] = 0;
+ ),
+
+ TP_printk(
+ "error=%d name=%02x:%02x:%llu/%s",
+ __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->dir,
+ __get_str(name)
+ )
+);
+#endif /* _TRACE_NFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE nfstrace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 29cfb7ade121..2ffebf2081ce 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -328,6 +328,19 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
}
EXPORT_SYMBOL_GPL(nfs_pageio_init);
+static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
+ const struct nfs_open_context *ctx2)
+{
+ return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
+}
+
+static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
+ const struct nfs_lock_context *l2)
+{
+ return l1->lockowner.l_owner == l2->lockowner.l_owner
+ && l1->lockowner.l_pid == l2->lockowner.l_pid;
+}
+
/**
* nfs_can_coalesce_requests - test two requests for compatibility
* @prev: pointer to nfs_page
@@ -343,13 +356,10 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
struct nfs_page *req,
struct nfs_pageio_descriptor *pgio)
{
- if (req->wb_context->cred != prev->wb_context->cred)
- return false;
- if (req->wb_lock_context->lockowner.l_owner != prev->wb_lock_context->lockowner.l_owner)
- return false;
- if (req->wb_lock_context->lockowner.l_pid != prev->wb_lock_context->lockowner.l_pid)
+ if (!nfs_match_open_context(req->wb_context, prev->wb_context))
return false;
- if (req->wb_context->state != prev->wb_context->state)
+ if (req->wb_context->dentry->d_inode->i_flock != NULL &&
+ !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context))
return false;
if (req->wb_pgbase != 0)
return false;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 3a3a79d6bf15..d75d938d36cb 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -33,6 +33,7 @@
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"
+#include "nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -1526,6 +1527,7 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
{
struct nfs_pgio_header *hdr = data->header;
+ trace_nfs4_pnfs_write(data, hdr->pnfs_error);
if (!hdr->pnfs_error) {
pnfs_set_layoutcommit(data);
hdr->mds_ops->rpc_call_done(&data->task, data);
@@ -1680,6 +1682,7 @@ void pnfs_ld_read_done(struct nfs_read_data *data)
{
struct nfs_pgio_header *hdr = data->header;
+ trace_nfs4_pnfs_read(data, hdr->pnfs_error);
if (likely(!hdr->pnfs_error)) {
__nfs4_read_done_cb(data);
hdr->mds_ops->rpc_call_done(&data->task, data);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c041c41f7a52..a8f57c728df5 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -623,9 +623,10 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
}
-static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
{
rpc_call_start(task);
+ return 0;
}
static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -644,9 +645,10 @@ static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message
msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
}
-static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
{
rpc_call_start(task);
+ return 0;
}
static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 70a26c651f09..31db5c366b81 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -513,9 +513,10 @@ static void nfs_readpage_release_common(void *calldata)
void nfs_read_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_read_data *data = calldata;
- NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
- rpc_exit(task, -EIO);
+ int err;
+ err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
+ if (err)
+ rpc_exit(task, err);
}
static const struct rpc_call_ops nfs_read_common_ops = {
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 71fdc0dfa0d2..5793f24613c8 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -923,7 +923,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
data->nfs_server.port = NFS_UNSPEC_PORT;
data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR;
- data->auth_flavor_len = 1;
+ data->auth_flavor_len = 0;
data->minorversion = 0;
data->need_mount = true;
data->net = current->nsproxy->net_ns;
@@ -1018,6 +1018,13 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
}
}
+static void nfs_set_auth_parsed_mount_data(struct nfs_parsed_mount_data *data,
+ rpc_authflavor_t pseudoflavor)
+{
+ data->auth_flavors[0] = pseudoflavor;
+ data->auth_flavor_len = 1;
+}
+
/*
* Parse the value of the 'sec=' option.
*/
@@ -1025,49 +1032,50 @@ static int nfs_parse_security_flavors(char *value,
struct nfs_parsed_mount_data *mnt)
{
substring_t args[MAX_OPT_ARGS];
+ rpc_authflavor_t pseudoflavor;
dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
switch (match_token(value, nfs_secflavor_tokens, args)) {
case Opt_sec_none:
- mnt->auth_flavors[0] = RPC_AUTH_NULL;
+ pseudoflavor = RPC_AUTH_NULL;
break;
case Opt_sec_sys:
- mnt->auth_flavors[0] = RPC_AUTH_UNIX;
+ pseudoflavor = RPC_AUTH_UNIX;
break;
case Opt_sec_krb5:
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
+ pseudoflavor = RPC_AUTH_GSS_KRB5;
break;
case Opt_sec_krb5i:
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
+ pseudoflavor = RPC_AUTH_GSS_KRB5I;
break;
case Opt_sec_krb5p:
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
+ pseudoflavor = RPC_AUTH_GSS_KRB5P;
break;
case Opt_sec_lkey:
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
+ pseudoflavor = RPC_AUTH_GSS_LKEY;
break;
case Opt_sec_lkeyi:
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
+ pseudoflavor = RPC_AUTH_GSS_LKEYI;
break;
case Opt_sec_lkeyp:
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
+ pseudoflavor = RPC_AUTH_GSS_LKEYP;
break;
case Opt_sec_spkm:
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
+ pseudoflavor = RPC_AUTH_GSS_SPKM;
break;
case Opt_sec_spkmi:
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
+ pseudoflavor = RPC_AUTH_GSS_SPKMI;
break;
case Opt_sec_spkmp:
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
+ pseudoflavor = RPC_AUTH_GSS_SPKMP;
break;
default:
return 0;
}
mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
+ nfs_set_auth_parsed_mount_data(mnt, pseudoflavor);
return 1;
}
@@ -1729,7 +1737,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
* Was a sec= authflavor specified in the options? First, verify
* whether the server supports it, and then just try to use it if so.
*/
- if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
+ if (args->auth_flavor_len > 0) {
status = nfs_verify_authflavor(args, authlist, authlist_len);
dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
if (status)
@@ -1760,7 +1768,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
/* Fallthrough */
}
dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
- args->auth_flavors[0] = flavor;
+ nfs_set_auth_parsed_mount_data(args, flavor);
server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
if (!IS_ERR(server))
return server;
@@ -1776,7 +1784,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
/* Last chance! Try AUTH_UNIX */
dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
- args->auth_flavors[0] = RPC_AUTH_UNIX;
+ nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
}
@@ -1893,6 +1901,7 @@ static int nfs23_validate_mount_data(void *options,
{
struct nfs_mount_data *data = (struct nfs_mount_data *)options;
struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
+ int extra_flags = NFS_MOUNT_LEGACY_INTERFACE;
if (data == NULL)
goto out_no_data;
@@ -1908,6 +1917,8 @@ static int nfs23_validate_mount_data(void *options,
goto out_no_v3;
data->root.size = NFS2_FHSIZE;
memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
+ /* Turn off security negotiation */
+ extra_flags |= NFS_MOUNT_SECFLAVOUR;
case 4:
if (data->flags & NFS_MOUNT_SECFLAVOUR)
goto out_no_sec;
@@ -1935,7 +1946,7 @@ static int nfs23_validate_mount_data(void *options,
* can deal with.
*/
args->flags = data->flags & NFS_MOUNT_FLAGMASK;
- args->flags |= NFS_MOUNT_LEGACY_INTERFACE;
+ args->flags |= extra_flags;
args->rsize = data->rsize;
args->wsize = data->wsize;
args->timeo = data->timeo;
@@ -1959,9 +1970,10 @@ static int nfs23_validate_mount_data(void *options,
args->namlen = data->namlen;
args->bsize = data->bsize;
- args->auth_flavors[0] = RPC_AUTH_UNIX;
if (data->flags & NFS_MOUNT_SECFLAVOUR)
- args->auth_flavors[0] = data->pseudoflavor;
+ nfs_set_auth_parsed_mount_data(args, data->pseudoflavor);
+ else
+ nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
if (!args->nfs_server.hostname)
goto out_nomem;
@@ -2084,6 +2096,8 @@ static int nfs_validate_text_mount_data(void *options,
max_namelen = NFS4_MAXNAMLEN;
max_pathlen = NFS4_MAXPATHLEN;
nfs_validate_transport_protocol(args);
+ if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+ goto out_invalid_transport_udp;
nfs4_validate_mount_flags(args);
#else
goto out_v4_not_compiled;
@@ -2106,6 +2120,10 @@ static int nfs_validate_text_mount_data(void *options,
out_v4_not_compiled:
dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
return -EPROTONOSUPPORT;
+#else
+out_invalid_transport_udp:
+ dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n");
+ return -EINVAL;
#endif /* !CONFIG_NFS_V4 */
out_no_address:
@@ -2170,7 +2188,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
data->rsize = nfss->rsize;
data->wsize = nfss->wsize;
data->retrans = nfss->client->cl_timeout->to_retries;
- data->auth_flavors[0] = nfss->client->cl_auth->au_flavor;
+ nfs_set_auth_parsed_mount_data(data, nfss->client->cl_auth->au_flavor);
data->acregmin = nfss->acregmin / HZ;
data->acregmax = nfss->acregmax / HZ;
data->acdirmin = nfss->acdirmin / HZ;
@@ -2277,6 +2295,18 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
nfs_initialise_sb(sb);
}
+#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+ | NFS_MOUNT_SECURE \
+ | NFS_MOUNT_TCP \
+ | NFS_MOUNT_VER3 \
+ | NFS_MOUNT_KERBEROS \
+ | NFS_MOUNT_NONLM \
+ | NFS_MOUNT_BROKEN_SUID \
+ | NFS_MOUNT_STRICTLOCK \
+ | NFS_MOUNT_UNSHARED \
+ | NFS_MOUNT_NORESVPORT \
+ | NFS_MOUNT_LEGACY_INTERFACE)
+
static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
{
const struct nfs_server *a = s->s_fs_info;
@@ -2287,7 +2317,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
goto Ebusy;
if (a->nfs_client != b->nfs_client)
goto Ebusy;
- if (a->flags != b->flags)
+ if ((a->flags ^ b->flags) & NFS_MOUNT_CMP_FLAGMASK)
goto Ebusy;
if (a->wsize != b->wsize)
goto Ebusy;
@@ -2301,7 +2331,8 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
goto Ebusy;
if (a->acdirmax != b->acdirmax)
goto Ebusy;
- if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
+ if (b->flags & NFS_MOUNT_SECFLAVOUR &&
+ clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
goto Ebusy;
return 1;
Ebusy:
@@ -2478,6 +2509,10 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
if (server->flags & NFS_MOUNT_NOAC)
sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+ if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL)
+ if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS)
+ sb_mntdata.mntflags |= MS_SYNCHRONOUS;
+
/* Get a superblock - note that we may end up sharing one that already exists */
s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata);
if (IS_ERR(s)) {
@@ -2669,15 +2704,17 @@ static int nfs4_validate_mount_data(void *options,
goto out_no_address;
args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
- args->auth_flavors[0] = RPC_AUTH_UNIX;
if (data->auth_flavourlen) {
+ rpc_authflavor_t pseudoflavor;
if (data->auth_flavourlen > 1)
goto out_inval_auth;
- if (copy_from_user(&args->auth_flavors[0],
+ if (copy_from_user(&pseudoflavor,
data->auth_flavours,
- sizeof(args->auth_flavors[0])))
+ sizeof(pseudoflavor)))
return -EFAULT;
- }
+ nfs_set_auth_parsed_mount_data(args, pseudoflavor);
+ } else
+ nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
if (IS_ERR(c))
@@ -2711,6 +2748,8 @@ static int nfs4_validate_mount_data(void *options,
args->acdirmax = data->acdirmax;
args->nfs_server.protocol = data->proto;
nfs_validate_transport_protocol(args);
+ if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+ goto out_invalid_transport_udp;
break;
default:
@@ -2731,6 +2770,10 @@ out_inval_auth:
out_no_address:
dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
return -EINVAL;
+
+out_invalid_transport_udp:
+ dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n");
+ return -EINVAL;
}
/*
@@ -2746,6 +2789,7 @@ bool nfs4_disable_idmapping = true;
unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
unsigned short send_implementation_id = 1;
char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
+bool recover_lost_locks = false;
EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
EXPORT_SYMBOL_GPL(nfs_callback_tcpport);
@@ -2754,6 +2798,7 @@ EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
EXPORT_SYMBOL_GPL(max_session_slots);
EXPORT_SYMBOL_GPL(send_implementation_id);
EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
+EXPORT_SYMBOL_GPL(recover_lost_locks);
#define NFS_CALLBACK_MAXPORTNR (65535U)
@@ -2791,4 +2836,10 @@ MODULE_PARM_DESC(send_implementation_id,
"Send implementation ID with NFSv4.1 exchange_id");
MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string");
+module_param(recover_lost_locks, bool, 0644);
+MODULE_PARM_DESC(recover_lost_locks,
+ "If the server reports that a lock might be lost, "
+ "try to recover it risking data corruption.");
+
+
#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 60395ad3a2e4..bb939edd4c99 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -20,6 +20,8 @@
#include "iostat.h"
#include "delegation.h"
+#include "nfstrace.h"
+
/**
* nfs_free_unlinkdata - release data from a sillydelete operation.
* @data: pointer to unlink structure.
@@ -77,6 +79,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
struct nfs_unlinkdata *data = calldata;
struct inode *dir = data->dir;
+ trace_nfs_sillyrename_unlink(data, task->tk_status);
if (!NFS_PROTO(dir)->unlink_done(task, dir))
rpc_restart_call_prepare(task);
}
@@ -204,6 +207,13 @@ out_free:
return ret;
}
+void nfs_wait_on_sillyrename(struct dentry *dentry)
+{
+ struct nfs_inode *nfsi = NFS_I(dentry->d_inode);
+
+ wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1);
+}
+
void nfs_block_sillyrename(struct dentry *dentry)
{
struct nfs_inode *nfsi = NFS_I(dentry->d_inode);
@@ -336,6 +346,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
struct inode *new_dir = data->new_dir;
struct dentry *old_dentry = data->old_dentry;
+ trace_nfs_sillyrename_rename(old_dir, old_dentry,
+ new_dir, data->new_dentry, task->tk_status);
if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
rpc_restart_call_prepare(task);
return;
@@ -444,6 +456,14 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
return rpc_run_task(&task_setup_data);
}
+#define SILLYNAME_PREFIX ".nfs"
+#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1)
+#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1)
+#define SILLYNAME_COUNTER_LEN ((unsigned)sizeof(unsigned int) << 1)
+#define SILLYNAME_LEN (SILLYNAME_PREFIX_LEN + \
+ SILLYNAME_FILEID_LEN + \
+ SILLYNAME_COUNTER_LEN)
+
/**
* nfs_sillyrename - Perform a silly-rename of a dentry
* @dir: inode of directory that contains dentry
@@ -469,10 +489,8 @@ int
nfs_sillyrename(struct inode *dir, struct dentry *dentry)
{
static unsigned int sillycounter;
- const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2;
- const int countersize = sizeof(sillycounter)*2;
- const int slen = sizeof(".nfs")+fileidsize+countersize-1;
- char silly[slen+1];
+ unsigned char silly[SILLYNAME_LEN + 1];
+ unsigned long long fileid;
struct dentry *sdentry;
struct rpc_task *task;
int error = -EIO;
@@ -489,20 +507,20 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
goto out;
- sprintf(silly, ".nfs%*.*Lx",
- fileidsize, fileidsize,
- (unsigned long long)NFS_FILEID(dentry->d_inode));
+ fileid = NFS_FILEID(dentry->d_inode);
/* Return delegation in anticipation of the rename */
NFS_PROTO(dentry->d_inode)->return_delegation(dentry->d_inode);
sdentry = NULL;
do {
- char *suffix = silly + slen - countersize;
-
+ int slen;
dput(sdentry);
sillycounter++;
- sprintf(suffix, "%*.*x", countersize, countersize, sillycounter);
+ slen = scnprintf(silly, sizeof(silly),
+ SILLYNAME_PREFIX "%0*llx%0*x",
+ SILLYNAME_FILEID_LEN, fileid,
+ SILLYNAME_COUNTER_LEN, sillycounter);
dfprintk(VFS, "NFS: trying to rename %s to %s\n",
dentry->d_name.name, silly);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f1bdb7254776..ac1dc331ba31 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -31,6 +31,8 @@
#include "fscache.h"
#include "pnfs.h"
+#include "nfstrace.h"
+
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
#define MIN_POOL_WRITE (32)
@@ -861,7 +863,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
return 0;
l_ctx = req->wb_lock_context;
do_flush = req->wb_page != page || req->wb_context != ctx;
- if (l_ctx) {
+ if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
do_flush |= l_ctx->lockowner.l_owner != current->files
|| l_ctx->lockowner.l_pid != current->tgid;
}
@@ -874,6 +876,33 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
}
/*
+ * Avoid buffered writes when a open context credential's key would
+ * expire soon.
+ *
+ * Returns -EACCES if the key will expire within RPC_KEY_EXPIRE_FAIL.
+ *
+ * Return 0 and set a credential flag which triggers the inode to flush
+ * and performs NFS_FILE_SYNC writes if the key will expired within
+ * RPC_KEY_EXPIRE_TIMEO.
+ */
+int
+nfs_key_timeout_notify(struct file *filp, struct inode *inode)
+{
+ struct nfs_open_context *ctx = nfs_file_open_context(filp);
+ struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
+
+ return rpcauth_key_timeout_notify(auth, ctx->cred);
+}
+
+/*
+ * Test if the open context credential key is marked to expire soon.
+ */
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
+{
+ return rpcauth_cred_key_to_expire(ctx->cred);
+}
+
+/*
* If the page cache is marked as unsafe or invalid, then we can't rely on
* the PageUptodate() flag. In this case, we will need to turn off
* write optimisations that depend on the page contents being correct.
@@ -993,6 +1022,9 @@ int nfs_initiate_write(struct rpc_clnt *clnt,
data->args.count,
(unsigned long long)data->args.offset);
+ nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
+ &task_setup_data.rpc_client, &msg, data);
+
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task)) {
ret = PTR_ERR(task);
@@ -1265,9 +1297,10 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
void nfs_write_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_write_data *data = calldata;
- NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
- rpc_exit(task, -EIO);
+ int err;
+ err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
+ if (err)
+ rpc_exit(task, err);
}
void nfs_commit_prepare(struct rpc_task *task, void *calldata)
@@ -1458,6 +1491,9 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+ nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client,
+ NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg);
+
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -1732,8 +1768,14 @@ int nfs_wb_all(struct inode *inode)
.range_start = 0,
.range_end = LLONG_MAX,
};
+ int ret;
- return sync_inode(inode, &wbc);
+ trace_nfs_writeback_inode_enter(inode);
+
+ ret = sync_inode(inode, &wbc);
+
+ trace_nfs_writeback_inode_exit(inode, ret);
+ return ret;
}
EXPORT_SYMBOL_GPL(nfs_wb_all);
@@ -1781,6 +1823,8 @@ int nfs_wb_page(struct inode *inode, struct page *page)
};
int ret;
+ trace_nfs_writeback_page_enter(inode);
+
for (;;) {
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
@@ -1789,14 +1833,15 @@ int nfs_wb_page(struct inode *inode, struct page *page)
goto out_error;
continue;
}
+ ret = 0;
if (!PagePrivate(page))
break;
ret = nfs_commit_inode(inode, FLUSH_SYNC);
if (ret < 0)
goto out_error;
}
- return 0;
out_error:
+ trace_nfs_writeback_page_exit(inode, ret);
return ret;
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a7cee864e7b2..419572f33b72 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1293,7 +1293,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
* According to RFC3010, this takes precedence over all other errors.
*/
status = nfserr_minor_vers_mismatch;
- if (args->minorversion > nfsd_supported_minorversion)
+ if (nfsd_minorversion(args->minorversion, NFSD_TEST) <= 0)
goto out;
status = nfs41_check_op_ordering(args);
@@ -1524,7 +1524,7 @@ static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
- 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\
+ 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\
2 + /*eir_server_owner.so_minor_id */\
/* eir_server_owner.so_major_id<> */\
XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 280acef6f0dc..43f42290e5df 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1264,6 +1264,8 @@ static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
struct svc_cred *cr = &rqstp->rq_cred;
u32 service;
+ if (!cr->cr_gss_mech)
+ return false;
service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor);
return service == RPC_GSS_SVC_INTEGRITY ||
service == RPC_GSS_SVC_PRIVACY;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0c0f3ea90de5..d9454fe5653f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1816,10 +1816,7 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
static __be32 nfsd4_encode_path(const struct path *root,
const struct path *path, __be32 **pp, int *buflen)
{
- struct path cur = {
- .mnt = path->mnt,
- .dentry = path->dentry,
- };
+ struct path cur = *path;
__be32 *p = *pp;
struct dentry **components = NULL;
unsigned int ncomponents = 0;
@@ -1859,14 +1856,19 @@ static __be32 nfsd4_encode_path(const struct path *root,
while (ncomponents) {
struct dentry *dentry = components[ncomponents - 1];
- unsigned int len = dentry->d_name.len;
+ unsigned int len;
+ spin_lock(&dentry->d_lock);
+ len = dentry->d_name.len;
*buflen -= 4 + (XDR_QUADLEN(len) << 2);
- if (*buflen < 0)
+ if (*buflen < 0) {
+ spin_unlock(&dentry->d_lock);
goto out_free;
+ }
WRITE32(len);
WRITEMEM(dentry->d_name.name, len);
dprintk("/%s", dentry->d_name.name);
+ spin_unlock(&dentry->d_lock);
dput(dentry);
ncomponents--;
}
@@ -3360,7 +3362,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
8 /* eir_clientid */ +
4 /* eir_sequenceid */ +
4 /* eir_flags */ +
- 4 /* spr_how (SP4_NONE) */ +
+ 4 /* spr_how */ +
+ 8 /* spo_must_enforce, spo_must_allow */ +
8 /* so_minor_id */ +
4 /* so_major_id.len */ +
(XDR_QUADLEN(major_id_sz) * 4) +
@@ -3372,8 +3375,6 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
WRITE32(exid->seqid);
WRITE32(exid->flags);
- /* state_protect4_r. Currently only support SP4_NONE */
- BUG_ON(exid->spa_how != SP4_NONE);
WRITE32(exid->spa_how);
switch (exid->spa_how) {
case SP4_NONE:
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 2bbd94e51efc..30f34ab02137 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -53,7 +53,6 @@ struct readdir_cd {
extern struct svc_program nfsd_program;
extern struct svc_version nfsd_version2, nfsd_version3,
nfsd_version4;
-extern u32 nfsd_supported_minorversion;
extern struct mutex nfsd_mutex;
extern spinlock_t nfsd_drc_lock;
extern unsigned long nfsd_drc_max_mem;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 6b9f48ca4c25..760c85a6f534 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -116,7 +116,10 @@ struct svc_program nfsd_program = {
};
-u32 nfsd_supported_minorversion = 1;
+static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
+ [0] = 1,
+ [1] = 1,
+};
int nfsd_vers(int vers, enum vers_op change)
{
@@ -151,15 +154,13 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
return -1;
switch(change) {
case NFSD_SET:
- nfsd_supported_minorversion = minorversion;
+ nfsd_supported_minorversions[minorversion] = true;
break;
case NFSD_CLEAR:
- if (minorversion == 0)
- return -1;
- nfsd_supported_minorversion = minorversion - 1;
+ nfsd_supported_minorversions[minorversion] = false;
break;
case NFSD_TEST:
- return minorversion <= nfsd_supported_minorversion;
+ return nfsd_supported_minorversions[minorversion];
case NFSD_AVAIL:
return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 8ff6a0019b0b..c827acb0e943 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -830,9 +830,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
flags = O_WRONLY|O_LARGEFILE;
}
*filp = dentry_open(&path, flags, current_cred());
- if (IS_ERR(*filp))
+ if (IS_ERR(*filp)) {
host_err = PTR_ERR(*filp);
- else {
+ *filp = NULL;
+ } else {
host_err = ima_file_check(*filp, may_flags);
if (may_flags & NFSD_MAY_64BIT_COOKIE)
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index dc9a913784ab..2d8be51f90dc 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -345,8 +345,7 @@ static void nilfs_end_bio_write(struct bio *bio, int err)
if (err == -EOPNOTSUPP) {
set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- bio_put(bio);
- /* to be detected by submit_seg_bio() */
+ /* to be detected by nilfs_segbuf_submit_bio() */
}
if (!uptodate)
@@ -377,12 +376,12 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
bio->bi_private = segbuf;
bio_get(bio);
submit_bio(mode, bio);
+ segbuf->sb_nbio++;
if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
bio_put(bio);
err = -EOPNOTSUPP;
goto failed;
}
- segbuf->sb_nbio++;
bio_put(bio);
wi->bio = NULL;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index af3ba0478cdf..7ac2a122ca1d 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -994,23 +994,16 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
return ret;
}
-static int nilfs_tree_was_touched(struct dentry *root_dentry)
-{
- return d_count(root_dentry) > 1;
-}
-
/**
- * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint
+ * nilfs_tree_is_busy() - try to shrink dentries of a checkpoint
* @root_dentry: root dentry of the tree to be shrunk
*
* This function returns true if the tree was in-use.
*/
-static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
+static bool nilfs_tree_is_busy(struct dentry *root_dentry)
{
- if (have_submounts(root_dentry))
- return true;
shrink_dcache_parent(root_dentry);
- return nilfs_tree_was_touched(root_dentry);
+ return d_count(root_dentry) > 1;
}
int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
@@ -1034,8 +1027,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
if (inode) {
dentry = d_find_alias(inode);
if (dentry) {
- if (nilfs_tree_was_touched(dentry))
- ret = nilfs_try_to_shrink_tree(dentry);
+ ret = nilfs_tree_is_busy(dentry);
dput(dentry);
}
iput(inode);
@@ -1331,11 +1323,8 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
s->s_flags |= MS_ACTIVE;
} else if (!sd.cno) {
- int busy = false;
-
- if (nilfs_tree_was_touched(s->s_root)) {
- busy = nilfs_try_to_shrink_tree(s->s_root);
- if (busy && (flags ^ s->s_flags) & MS_RDONLY) {
+ if (nilfs_tree_is_busy(s->s_root)) {
+ if ((flags ^ s->s_flags) & MS_RDONLY) {
printk(KERN_ERR "NILFS: the device already "
"has a %s mount.\n",
(s->s_flags & MS_RDONLY) ?
@@ -1343,8 +1332,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
err = -EBUSY;
goto failed_super;
}
- }
- if (!busy) {
+ } else {
/*
* Try remount to setup mount states if the current
* tree is not mounted and only snapshots use this sb.
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 79736a28d84f..94417a85ce6e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -565,9 +565,7 @@ bail:
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
ssize_t bytes,
- void *private,
- int ret,
- bool is_async)
+ void *private)
{
struct inode *inode = file_inode(iocb->ki_filp);
int level;
@@ -592,10 +590,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
level = ocfs2_iocb_rw_locked_level(iocb);
ocfs2_rw_unlock(inode, level);
-
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
}
/*
@@ -1757,7 +1751,7 @@ try_again:
goto out;
} else if (ret == 1) {
clusters_need = wc->w_clen;
- ret = ocfs2_refcount_cow(inode, filp, di_bh,
+ ret = ocfs2_refcount_cow(inode, di_bh,
wc->w_cpos, wc->w_clen, UINT_MAX);
if (ret) {
mlog_errno(ret);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index eb760d8acd50..30544ce8e9f7 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2153,11 +2153,9 @@ int ocfs2_empty_dir(struct inode *inode)
{
int ret;
struct ocfs2_empty_dir_priv priv = {
- .ctx.actor = ocfs2_empty_dir_filldir
+ .ctx.actor = ocfs2_empty_dir_filldir,
};
- memset(&priv, 0, sizeof(priv));
-
if (ocfs2_dir_indexed(inode)) {
ret = ocfs2_empty_dir_dx(inode, &priv);
if (ret)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 41000f223ca4..3261d71319ee 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -370,7 +370,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
goto out;
- return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
+ return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
out:
return status;
@@ -899,7 +899,7 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
zero_clusters = last_cpos - zero_cpos;
if (needs_cow) {
- rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
+ rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
zero_clusters, UINT_MAX);
if (rc) {
mlog_errno(rc);
@@ -2078,7 +2078,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
*meta_level = 1;
- ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
+ ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
if (ret)
mlog_errno(ret);
out:
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 96f9ac237e86..0a992737dcaf 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -537,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
- ocfs2_quota_trans_credits(sb) + bits_wanted;
+ ocfs2_quota_trans_credits(sb);
}
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index f1fc172175b6..452068b45749 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -69,7 +69,7 @@ static int __ocfs2_move_extent(handle_t *handle,
u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
- ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
+ ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
p_cpos, new_p_cpos, len);
if (ret) {
mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 998b17eda09d..a70d604593b6 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,7 +49,6 @@
struct ocfs2_cow_context {
struct inode *inode;
- struct file *file;
u32 cow_start;
u32 cow_len;
struct ocfs2_extent_tree data_et;
@@ -66,7 +65,7 @@ struct ocfs2_cow_context {
u32 *num_clusters,
unsigned int *extent_flags);
int (*cow_duplicate_clusters)(handle_t *handle,
- struct file *file,
+ struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
};
@@ -2922,14 +2921,12 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
}
int ocfs2_duplicate_clusters_by_page(handle_t *handle,
- struct file *file,
+ struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
{
int ret = 0, partial;
- struct inode *inode = file_inode(file);
- struct ocfs2_caching_info *ci = INODE_CACHE(inode);
- struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct super_block *sb = inode->i_sb;
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct page *page;
pgoff_t page_index;
@@ -2965,6 +2962,11 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
to = map_end & (PAGE_CACHE_SIZE - 1);
page = find_or_create_page(mapping, page_index, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ break;
+ }
/*
* In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
@@ -2973,13 +2975,6 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
BUG_ON(PageDirty(page));
- if (PageReadahead(page)) {
- page_cache_async_readahead(mapping,
- &file->f_ra, file,
- page, page_index,
- readahead_pages);
- }
-
if (!PageUptodate(page)) {
ret = block_read_full_page(page, ocfs2_get_block);
if (ret) {
@@ -2999,7 +2994,8 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
}
}
- ocfs2_map_and_dirty_page(inode, handle, from, to,
+ ocfs2_map_and_dirty_page(inode,
+ handle, from, to,
page, 0, &new_block);
mark_page_accessed(page);
unlock:
@@ -3015,12 +3011,11 @@ unlock:
}
int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
- struct file *file,
+ struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
{
int ret = 0;
- struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct ocfs2_caching_info *ci = INODE_CACHE(inode);
int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
@@ -3145,7 +3140,7 @@ static int ocfs2_replace_clusters(handle_t *handle,
/*If the old clusters is unwritten, no need to duplicate. */
if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = context->cow_duplicate_clusters(handle, context->file,
+ ret = context->cow_duplicate_clusters(handle, context->inode,
cpos, old, new, len);
if (ret) {
mlog_errno(ret);
@@ -3423,35 +3418,12 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
return ret;
}
-static void ocfs2_readahead_for_cow(struct inode *inode,
- struct file *file,
- u32 start, u32 len)
-{
- struct address_space *mapping;
- pgoff_t index;
- unsigned long num_pages;
- int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
-
- if (!file)
- return;
-
- mapping = file->f_mapping;
- num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
- if (!num_pages)
- num_pages = 1;
-
- index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
- page_cache_sync_readahead(mapping, &file->f_ra, file,
- index, num_pages);
-}
-
/*
* Starting at cpos, try to CoW write_len clusters. Don't CoW
* past max_cpos. This will stop when it runs into a hole or an
* unrefcounted extent.
*/
static int ocfs2_refcount_cow_hunk(struct inode *inode,
- struct file *file,
struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos)
{
@@ -3480,8 +3452,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
BUG_ON(cow_len == 0);
- ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
-
context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
if (!context) {
ret = -ENOMEM;
@@ -3503,7 +3473,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
context->ref_root_bh = ref_root_bh;
context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
context->get_clusters = ocfs2_di_get_clusters;
- context->file = file;
ocfs2_init_dinode_extent_tree(&context->data_et,
INODE_CACHE(inode), di_bh);
@@ -3532,7 +3501,6 @@ out:
* clusters between cpos and cpos+write_len are safe to modify.
*/
int ocfs2_refcount_cow(struct inode *inode,
- struct file *file,
struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos)
{
@@ -3552,7 +3520,7 @@ int ocfs2_refcount_cow(struct inode *inode,
num_clusters = write_len;
if (ext_flags & OCFS2_EXT_REFCOUNTED) {
- ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
+ ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
num_clusters, max_cpos);
if (ret) {
mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 7754608c83a4..6422bbcdb525 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -53,7 +53,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
int *credits,
int *ref_blocks);
int ocfs2_refcount_cow(struct inode *inode,
- struct file *filep, struct buffer_head *di_bh,
+ struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos);
typedef int (ocfs2_post_refcount_func)(struct inode *inode,
@@ -85,11 +85,11 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
u32 cpos, u32 write_len,
struct ocfs2_post_refcount *post);
int ocfs2_duplicate_clusters_by_page(handle_t *handle,
- struct file *file,
+ struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
- struct file *file,
+ struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
int ocfs2_cow_sync_writeback(struct super_block *sb,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 854d80955bf8..121da2dc3be8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1022,7 +1022,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
struct inode *inode = NULL;
struct ocfs2_super *osb = NULL;
struct buffer_head *bh = NULL;
- char nodestr[8];
+ char nodestr[12];
struct ocfs2_blockcheck_stats stats;
trace_ocfs2_fill_super(sb, data, silent);
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index e5c7f15465b4..19f134e896a9 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -32,7 +32,7 @@ enum ocfs2_xattr_type {
struct ocfs2_security_xattr_info {
int enable;
- char *name;
+ const char *name;
void *value;
size_t value_len;
};
diff --git a/fs/open.c b/fs/open.c
index 9156cb050d08..2a731b0d08bc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -443,7 +443,7 @@ retry:
goto dput_and_out;
error = -EPERM;
- if (!nsown_capable(CAP_SYS_CHROOT))
+ if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
goto dput_and_out;
error = security_path_chroot(&path);
if (error)
@@ -485,14 +485,13 @@ out_unlock:
SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
- struct file * file;
+ struct fd f = fdget(fd);
int err = -EBADF;
- file = fget(fd);
- if (file) {
- audit_inode(NULL, file->f_path.dentry, 0);
- err = chmod_common(&file->f_path, mode);
- fput(file);
+ if (f.file) {
+ audit_inode(NULL, f.file->f_path.dentry, 0);
+ err = chmod_common(&f.file->f_path, mode);
+ fdput(f);
}
return err;
}
@@ -823,7 +822,7 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
int lookup_flags = 0;
int acc_mode;
- if (flags & O_CREAT)
+ if (flags & (O_CREAT | __O_TMPFILE))
op->mode = (mode & S_IALLUGO) | S_IFREG;
else
op->mode = 0;
@@ -844,6 +843,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
return -EINVAL;
acc_mode = MAY_OPEN | ACC_MODE(flags);
+ if (!(acc_mode & MAY_WRITE))
+ return -EINVAL;
} else if (flags & O_PATH) {
/*
* If we have O_PATH in the open flag. Then we
diff --git a/fs/pnode.h b/fs/pnode.h
index b091445c1c4a..59e7eda1851e 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -19,11 +19,14 @@
#define CL_EXPIRE 0x01
#define CL_SLAVE 0x02
-#define CL_COPY_ALL 0x04
+#define CL_COPY_UNBINDABLE 0x04
#define CL_MAKE_SHARED 0x08
#define CL_PRIVATE 0x10
#define CL_SHARED_TO_SLAVE 0x20
#define CL_UNPRIVILEGED 0x40
+#define CL_COPY_MNT_NS_FILE 0x80
+
+#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)
static inline void set_mnt_shared(struct mount *mnt)
{
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 75f2890abbd8..0ff80f9b930f 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -230,8 +230,6 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!dir_emit_dots(file, ctx))
goto out;
- if (!dir_emit_dots(file, ctx))
- goto out;
files = get_files_struct(p);
if (!files)
goto out;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 94441a407337..737e15615b04 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -271,7 +271,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
de = next;
} while (de);
spin_unlock(&proc_subdir_lock);
- return 0;
+ return 1;
}
int proc_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 073aea60cf8f..9f8ef9b7674d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -285,6 +285,20 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
return rv;
}
+static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ int rv = -EIO;
+ unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ if (use_pde(pde)) {
+ get_unmapped_area = pde->proc_fops->get_unmapped_area;
+ if (get_unmapped_area)
+ rv = get_unmapped_area(file, orig_addr, len, pgoff, flags);
+ unuse_pde(pde);
+ }
+ return rv;
+}
+
static int proc_reg_open(struct inode *inode, struct file *file)
{
struct proc_dir_entry *pde = PDE(inode);
@@ -356,6 +370,7 @@ static const struct file_operations proc_reg_file_ops = {
.compat_ioctl = proc_reg_compat_ioctl,
#endif
.mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
.release = proc_reg_release,
};
@@ -368,6 +383,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
.mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
.release = proc_reg_release,
};
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 229e366598da..87dbcbef7fe4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,11 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
ns = task_active_pid_ns(current);
options = data;
- if (!current_user_ns()->may_mount_proc)
+ if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
+ return ERR_PTR(-EPERM);
+
+ /* Does the mounter have privilege over the pid namespace? */
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
}
@@ -205,7 +209,9 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr
static int proc_root_readdir(struct file *file, struct dir_context *ctx)
{
if (ctx->pos < FIRST_PROCESS_ENTRY) {
- proc_readdir(file, ctx);
+ int error = proc_readdir(file, ctx);
+ if (unlikely(error <= 0))
+ return error;
ctx->pos = FIRST_PROCESS_ENTRY;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dbf61f6174f0..107d026f5d6e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -730,8 +730,16 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
* of how soft-dirty works.
*/
pte_t ptent = *pte;
- ptent = pte_wrprotect(ptent);
- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+
+ if (pte_present(ptent)) {
+ ptent = pte_wrprotect(ptent);
+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+ } else if (is_swap_pte(ptent)) {
+ ptent = pte_swp_clear_soft_dirty(ptent);
+ } else if (pte_file(ptent)) {
+ ptent = pte_file_clear_soft_dirty(ptent);
+ }
+
set_pte_at(vma->vm_mm, addr, pte, ptent);
#endif
}
@@ -752,14 +760,15 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
ptent = *pte;
- if (!pte_present(ptent))
- continue;
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
clear_soft_dirty(vma, addr, pte);
continue;
}
+ if (!pte_present(ptent))
+ continue;
+
page = vm_normal_page(vma, addr, ptent);
if (!page)
continue;
@@ -859,7 +868,7 @@ typedef struct {
} pagemap_entry_t;
struct pagemapread {
- int pos, len;
+ int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
pagemap_entry_t *buffer;
bool v2;
};
@@ -867,7 +876,7 @@ struct pagemapread {
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
#define PAGEMAP_WALK_MASK (PMD_MASK)
-#define PM_ENTRY_BYTES sizeof(u64)
+#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
#define PM_STATUS_BITS 3
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
@@ -930,8 +939,10 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
flags = PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
} else if (is_swap_pte(pte)) {
- swp_entry_t entry = pte_to_swp_entry(pte);
-
+ swp_entry_t entry;
+ if (pte_swp_soft_dirty(pte))
+ flags2 |= __PM_SOFT_DIRTY;
+ entry = pte_to_swp_entry(pte);
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags = PM_SWAP;
@@ -1116,8 +1127,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
goto out_task;
pm.v2 = soft_dirty_cleared;
- pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
- pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
+ pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+ pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
ret = -ENOMEM;
if (!pm.buffer)
goto out_task;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 28503172f2e4..a1a16eb97c7b 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -223,7 +223,7 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz)
* regions in the 1st kernel pointed to by PT_LOAD entries) into
* virtually contiguous user-space in ELF layout.
*/
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) && !defined(CONFIG_S390)
static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
{
size_t size = vma->vm_end - vma->vm_start;
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index ca71db69da07..983d9510becc 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -1,6 +1,8 @@
config PSTORE
bool "Persistent store support"
default n
+ select ZLIB_DEFLATE
+ select ZLIB_INFLATE
help
This option enables generic access to platform level
persistent storage via "pstore" filesystem that can
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 71bf5f4ae84c..12823845d324 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -275,8 +275,8 @@ int pstore_is_mounted(void)
* Set the mtime & ctime to the date that this record was originally stored.
*/
int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
- char *data, size_t size, struct timespec time,
- struct pstore_info *psi)
+ char *data, bool compressed, size_t size,
+ struct timespec time, struct pstore_info *psi)
{
struct dentry *root = pstore_sb->s_root;
struct dentry *dentry;
@@ -315,7 +315,8 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
switch (type) {
case PSTORE_TYPE_DMESG:
- sprintf(name, "dmesg-%s-%lld", psname, id);
+ sprintf(name, "dmesg-%s-%lld%s", psname, id,
+ compressed ? ".enc.z" : "");
break;
case PSTORE_TYPE_CONSOLE:
sprintf(name, "console-%s", psname);
@@ -345,9 +346,8 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
mutex_lock(&root->d_inode->i_mutex);
- rc = -ENOSPC;
dentry = d_alloc_name(root, name);
- if (IS_ERR(dentry))
+ if (!dentry)
goto fail_lockedalloc;
memcpy(private->data, data, size);
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 937d820f273c..3b3d305277c4 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -50,8 +50,9 @@ extern struct pstore_info *psinfo;
extern void pstore_set_kmsg_bytes(int);
extern void pstore_get_records(int);
extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
- int count, char *data, size_t size,
- struct timespec time, struct pstore_info *psi);
+ int count, char *data, bool compressed,
+ size_t size, struct timespec time,
+ struct pstore_info *psi);
extern int pstore_is_mounted(void);
#endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 422962ae9fc2..4ffb7ab5e397 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -26,6 +26,7 @@
#include <linux/console.h>
#include <linux/module.h>
#include <linux/pstore.h>
+#include <linux/zlib.h>
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/slab.h>
@@ -65,6 +66,15 @@ struct pstore_info *psinfo;
static char *backend;
+/* Compression parameters */
+#define COMPR_LEVEL 6
+#define WINDOW_BITS 12
+#define MEM_LEVEL 4
+static struct z_stream_s stream;
+
+static char *big_oops_buf;
+static size_t big_oops_buf_sz;
+
/* How much of the console log to snapshot */
static unsigned long kmsg_bytes = 10240;
@@ -117,6 +127,121 @@ bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
}
EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
+/* Derived from logfs_compress() */
+static int pstore_compress(const void *in, void *out, size_t inlen,
+ size_t outlen)
+{
+ int err, ret;
+
+ ret = -EIO;
+ err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
+ MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ if (err != Z_OK)
+ goto error;
+
+ stream.next_in = in;
+ stream.avail_in = inlen;
+ stream.total_in = 0;
+ stream.next_out = out;
+ stream.avail_out = outlen;
+ stream.total_out = 0;
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END)
+ goto error;
+
+ err = zlib_deflateEnd(&stream);
+ if (err != Z_OK)
+ goto error;
+
+ if (stream.total_out >= stream.total_in)
+ goto error;
+
+ ret = stream.total_out;
+error:
+ return ret;
+}
+
+/* Derived from logfs_uncompress */
+static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen)
+{
+ int err, ret;
+
+ ret = -EIO;
+ err = zlib_inflateInit(&stream);
+ if (err != Z_OK)
+ goto error;
+
+ stream.next_in = in;
+ stream.avail_in = inlen;
+ stream.total_in = 0;
+ stream.next_out = out;
+ stream.avail_out = outlen;
+ stream.total_out = 0;
+
+ err = zlib_inflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END)
+ goto error;
+
+ err = zlib_inflateEnd(&stream);
+ if (err != Z_OK)
+ goto error;
+
+ ret = stream.total_out;
+error:
+ return ret;
+}
+
+static void allocate_buf_for_compression(void)
+{
+ size_t size;
+
+ big_oops_buf_sz = (psinfo->bufsize * 100) / 45;
+ big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+ if (big_oops_buf) {
+ size = max(zlib_deflate_workspacesize(WINDOW_BITS, MEM_LEVEL),
+ zlib_inflate_workspacesize());
+ stream.workspace = kmalloc(size, GFP_KERNEL);
+ if (!stream.workspace) {
+ pr_err("pstore: No memory for compression workspace; "
+ "skipping compression\n");
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+ }
+ } else {
+ pr_err("No memory for uncompressed data; "
+ "skipping compression\n");
+ stream.workspace = NULL;
+ }
+
+}
+
+/*
+ * Called when compression fails, since the printk buffer
+ * would be fetched for compression calling it again when
+ * compression fails would have moved the iterator of
+ * printk buffer which results in fetching old contents.
+ * Copy the recent messages from big_oops_buf to psinfo->buf
+ */
+static size_t copy_kmsg_to_buffer(int hsize, size_t len)
+{
+ size_t total_len;
+ size_t diff;
+
+ total_len = hsize + len;
+
+ if (total_len > psinfo->bufsize) {
+ diff = total_len - psinfo->bufsize + hsize;
+ memcpy(psinfo->buf, big_oops_buf, hsize);
+ memcpy(psinfo->buf + hsize, big_oops_buf + diff,
+ psinfo->bufsize - hsize);
+ total_len = psinfo->bufsize;
+ } else
+ memcpy(psinfo->buf, big_oops_buf, total_len);
+
+ return total_len;
+}
+
/*
* callback from kmsg_dump. (s2,l2) has the most recently
* written bytes, older bytes are in (s1,l1). Save as much
@@ -148,22 +273,56 @@ static void pstore_dump(struct kmsg_dumper *dumper,
char *dst;
unsigned long size;
int hsize;
+ int zipped_len = -1;
size_t len;
+ bool compressed;
+ size_t total_len;
- dst = psinfo->buf;
- hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part);
- size = psinfo->bufsize - hsize;
- dst += hsize;
+ if (big_oops_buf) {
+ dst = big_oops_buf;
+ hsize = sprintf(dst, "%s#%d Part%d\n", why,
+ oopscount, part);
+ size = big_oops_buf_sz - hsize;
- if (!kmsg_dump_get_buffer(dumper, true, dst, size, &len))
- break;
+ if (!kmsg_dump_get_buffer(dumper, true, dst + hsize,
+ size, &len))
+ break;
+
+ zipped_len = pstore_compress(dst, psinfo->buf,
+ hsize + len, psinfo->bufsize);
+
+ if (zipped_len > 0) {
+ compressed = true;
+ total_len = zipped_len;
+ } else {
+ pr_err("pstore: compression failed for Part %d"
+ " returned %d\n", part, zipped_len);
+ pr_err("pstore: Capture uncompressed"
+ " oops/panic report of Part %d\n", part);
+ compressed = false;
+ total_len = copy_kmsg_to_buffer(hsize, len);
+ }
+ } else {
+ dst = psinfo->buf;
+ hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount,
+ part);
+ size = psinfo->bufsize - hsize;
+ dst += hsize;
+
+ if (!kmsg_dump_get_buffer(dumper, true, dst,
+ size, &len))
+ break;
+
+ compressed = false;
+ total_len = hsize + len;
+ }
ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
- oopscount, hsize, hsize + len, psinfo);
+ oopscount, compressed, total_len, psinfo);
if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
pstore_new_entry = 1;
- total += hsize + len;
+ total += total_len;
part++;
}
if (pstore_cannot_block_path(reason)) {
@@ -221,10 +380,10 @@ static void pstore_register_console(void) {}
static int pstore_write_compat(enum pstore_type_id type,
enum kmsg_dump_reason reason,
u64 *id, unsigned int part, int count,
- size_t hsize, size_t size,
+ bool compressed, size_t size,
struct pstore_info *psi)
{
- return psi->write_buf(type, reason, id, part, psinfo->buf, hsize,
+ return psi->write_buf(type, reason, id, part, psinfo->buf, compressed,
size, psi);
}
@@ -261,6 +420,8 @@ int pstore_register(struct pstore_info *psi)
return -EINVAL;
}
+ allocate_buf_for_compression();
+
if (pstore_is_mounted())
pstore_get_records(0);
@@ -297,6 +458,8 @@ void pstore_get_records(int quiet)
enum pstore_type_id type;
struct timespec time;
int failed = 0, rc;
+ bool compressed;
+ int unzipped_len = -1;
if (!psi)
return;
@@ -305,11 +468,32 @@ void pstore_get_records(int quiet)
if (psi->open && psi->open(psi))
goto out;
- while ((size = psi->read(&id, &type, &count, &time, &buf, psi)) > 0) {
+ while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed,
+ psi)) > 0) {
+ if (compressed && (type == PSTORE_TYPE_DMESG)) {
+ if (big_oops_buf)
+ unzipped_len = pstore_decompress(buf,
+ big_oops_buf, size,
+ big_oops_buf_sz);
+
+ if (unzipped_len > 0) {
+ buf = big_oops_buf;
+ size = unzipped_len;
+ compressed = false;
+ } else {
+ pr_err("pstore: decompression failed;"
+ "returned %d\n", unzipped_len);
+ compressed = true;
+ }
+ }
rc = pstore_mkfile(type, psi->name, id, count, buf,
- (size_t)size, time, psi);
- kfree(buf);
- buf = NULL;
+ compressed, (size_t)size, time, psi);
+ if (unzipped_len < 0) {
+ /* Free buffer other than big oops */
+ kfree(buf);
+ buf = NULL;
+ } else
+ unzipped_len = -1;
if (rc && (rc != -EEXIST || !quiet))
failed++;
}
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index a6119f9469e2..fa8cef2cca3a 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -131,9 +131,31 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
return prz;
}
+static void ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
+ bool *compressed)
+{
+ char data_type;
+
+ if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n",
+ &time->tv_sec, &time->tv_nsec, &data_type) == 3) {
+ if (data_type == 'C')
+ *compressed = true;
+ else
+ *compressed = false;
+ } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n",
+ &time->tv_sec, &time->tv_nsec) == 2) {
+ *compressed = false;
+ } else {
+ time->tv_sec = 0;
+ time->tv_nsec = 0;
+ *compressed = false;
+ }
+}
+
static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
int *count, struct timespec *time,
- char **buf, struct pstore_info *psi)
+ char **buf, bool *compressed,
+ struct pstore_info *psi)
{
ssize_t size;
ssize_t ecc_notice_size;
@@ -152,10 +174,6 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
if (!prz)
return 0;
- /* TODO(kees): Bogus time for the moment. */
- time->tv_sec = 0;
- time->tv_nsec = 0;
-
size = persistent_ram_old_size(prz);
/* ECC correction notice */
@@ -166,12 +184,14 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
return -ENOMEM;
memcpy(*buf, persistent_ram_old(prz), size);
+ ramoops_read_kmsg_hdr(*buf, time, compressed);
persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1);
return size + ecc_notice_size;
}
-static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
+static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
+ bool compressed)
{
char *hdr;
struct timespec timestamp;
@@ -182,8 +202,9 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
timestamp.tv_sec = 0;
timestamp.tv_nsec = 0;
}
- hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n",
- (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000));
+ hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n",
+ (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000),
+ compressed ? 'C' : 'D');
WARN_ON_ONCE(!hdr);
len = hdr ? strlen(hdr) : 0;
persistent_ram_write(prz, hdr, len);
@@ -196,7 +217,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
enum kmsg_dump_reason reason,
u64 *id, unsigned int part,
const char *buf,
- size_t hsize, size_t size,
+ bool compressed, size_t size,
struct pstore_info *psi)
{
struct ramoops_context *cxt = psi->data;
@@ -242,7 +263,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
prz = cxt->przs[cxt->dump_write_cnt];
- hlen = ramoops_write_kmsg_hdr(prz);
+ hlen = ramoops_write_kmsg_hdr(prz, compressed);
if (size + hlen > prz->buffer_size)
size = prz->buffer_size - hlen;
persistent_ram_write(prz, buf, size);
@@ -400,11 +421,11 @@ static int ramoops_probe(struct platform_device *pdev)
goto fail_out;
}
- if (!is_power_of_2(pdata->record_size))
+ if (pdata->record_size && !is_power_of_2(pdata->record_size))
pdata->record_size = rounddown_pow_of_two(pdata->record_size);
- if (!is_power_of_2(pdata->console_size))
+ if (pdata->console_size && !is_power_of_2(pdata->console_size))
pdata->console_size = rounddown_pow_of_two(pdata->console_size);
- if (!is_power_of_2(pdata->ftrace_size))
+ if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
cxt->dump_read_cnt = 0;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index fbad622841f9..9a702e193538 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1094,6 +1094,14 @@ static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
dquot->dq_dqb.dqb_rsvspace -= number;
}
+static void dquot_reclaim_reserved_space(struct dquot *dquot, qsize_t number)
+{
+ if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
+ number = dquot->dq_dqb.dqb_curspace;
+ dquot->dq_dqb.dqb_rsvspace += number;
+ dquot->dq_dqb.dqb_curspace -= number;
+}
+
static inline
void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
{
@@ -1528,6 +1536,15 @@ void inode_claim_rsv_space(struct inode *inode, qsize_t number)
}
EXPORT_SYMBOL(inode_claim_rsv_space);
+void inode_reclaim_rsv_space(struct inode *inode, qsize_t number)
+{
+ spin_lock(&inode->i_lock);
+ *inode_reserved_space(inode) += number;
+ __inode_sub_bytes(inode, number);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(inode_reclaim_rsv_space);
+
void inode_sub_rsv_space(struct inode *inode, qsize_t number)
{
spin_lock(&inode->i_lock);
@@ -1702,6 +1719,35 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
EXPORT_SYMBOL(dquot_claim_space_nodirty);
/*
+ * Convert allocated space back to in-memory reserved quotas
+ */
+void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
+{
+ int cnt;
+
+ if (!dquot_active(inode)) {
+ inode_reclaim_rsv_space(inode, number);
+ return;
+ }
+
+ down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ spin_lock(&dq_data_lock);
+ /* Claim reserved quotas to allocated quotas */
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (inode->i_dquot[cnt])
+ dquot_reclaim_reserved_space(inode->i_dquot[cnt],
+ number);
+ }
+ /* Update inode bytes */
+ inode_reclaim_rsv_space(inode, number);
+ spin_unlock(&dq_data_lock);
+ mark_all_dquot_dirty(inode->i_dquot);
+ up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ return;
+}
+EXPORT_SYMBOL(dquot_reclaim_space_nodirty);
+
+/*
* This operation can block, but only after everything is updated
*/
void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index a98b7740a0fc..dc9a6829f7c6 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -423,8 +423,11 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
journal_mark_dirty(th, s, sbh);
- if (for_unformatted)
+ if (for_unformatted) {
+ int depth = reiserfs_write_unlock_nested(s);
dquot_free_block_nodirty(inode, 1);
+ reiserfs_write_lock_nested(s, depth);
+ }
}
void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -1128,6 +1131,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
int passno = 0;
int nr_allocated = 0;
+ int depth;
determine_prealloc_size(hint);
if (!hint->formatted_node) {
@@ -1137,10 +1141,13 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
"reiserquota: allocating %d blocks id=%u",
amount_needed, hint->inode->i_uid);
#endif
+ depth = reiserfs_write_unlock_nested(s);
quota_ret =
dquot_alloc_block_nodirty(hint->inode, amount_needed);
- if (quota_ret) /* Quota exceeded? */
+ if (quota_ret) { /* Quota exceeded? */
+ reiserfs_write_lock_nested(s, depth);
return QUOTA_EXCEEDED;
+ }
if (hint->preallocate && hint->prealloc_size) {
#ifdef REISERQUOTA_DEBUG
reiserfs_debug(s, REISERFS_DEBUG_CODE,
@@ -1153,6 +1160,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
hint->preallocate = hint->prealloc_size = 0;
}
/* for unformatted nodes, force large allocations */
+ reiserfs_write_lock_nested(s, depth);
}
do {
@@ -1181,9 +1189,11 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
hint->inode->i_uid);
#endif
/* Free not allocated blocks */
+ depth = reiserfs_write_unlock_nested(s);
dquot_free_block_nodirty(hint->inode,
amount_needed + hint->prealloc_size -
nr_allocated);
+ reiserfs_write_lock_nested(s, depth);
}
while (nr_allocated--)
reiserfs_free_block(hint->th, hint->inode,
@@ -1214,10 +1224,13 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
REISERFS_I(hint->inode)->i_prealloc_count,
hint->inode->i_uid);
#endif
+
+ depth = reiserfs_write_unlock_nested(s);
dquot_free_block_nodirty(hint->inode, amount_needed +
hint->prealloc_size - nr_allocated -
REISERFS_I(hint->inode)->
i_prealloc_count);
+ reiserfs_write_lock_nested(s, depth);
}
return CARRY_ON;
@@ -1340,10 +1353,11 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
"reading failed", __func__, block);
else {
if (buffer_locked(bh)) {
+ int depth;
PROC_INFO_INC(sb, scan_bitmap.wait);
- reiserfs_write_unlock(sb);
+ depth = reiserfs_write_unlock_nested(sb);
__wait_on_buffer(bh);
- reiserfs_write_lock(sb);
+ reiserfs_write_lock_nested(sb, depth);
}
BUG_ON(!buffer_uptodate(bh));
BUG_ON(atomic_read(&bh->b_count) == 0);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 03e4ca5624d6..1fd2051109a3 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -71,6 +71,7 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
char small_buf[32]; /* avoid kmalloc if we can */
struct reiserfs_dir_entry de;
int ret = 0;
+ int depth;
reiserfs_write_lock(inode->i_sb);
@@ -181,17 +182,17 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
* Since filldir might sleep, we can release
* the write lock here for other waiters
*/
- reiserfs_write_unlock(inode->i_sb);
+ depth = reiserfs_write_unlock_nested(inode->i_sb);
if (!dir_emit
(ctx, local_buf, d_reclen, d_ino,
DT_UNKNOWN)) {
- reiserfs_write_lock(inode->i_sb);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
if (local_buf != small_buf) {
kfree(local_buf);
}
goto end;
}
- reiserfs_write_lock(inode->i_sb);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
if (local_buf != small_buf) {
kfree(local_buf);
}
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 430e0658704c..dc4d41530316 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -1022,9 +1022,9 @@ static int get_far_parent(struct tree_balance *tb,
if (buffer_locked(*pcom_father)) {
/* Release the write lock while the buffer is busy */
- reiserfs_write_unlock(tb->tb_sb);
+ int depth = reiserfs_write_unlock_nested(tb->tb_sb);
__wait_on_buffer(*pcom_father);
- reiserfs_write_lock(tb->tb_sb);
+ reiserfs_write_lock_nested(tb->tb_sb, depth);
if (FILESYSTEM_CHANGED_TB(tb)) {
brelse(*pcom_father);
return REPEAT_SEARCH;
@@ -1929,9 +1929,9 @@ static int get_direct_parent(struct tree_balance *tb, int h)
return REPEAT_SEARCH;
if (buffer_locked(bh)) {
- reiserfs_write_unlock(tb->tb_sb);
+ int depth = reiserfs_write_unlock_nested(tb->tb_sb);
__wait_on_buffer(bh);
- reiserfs_write_lock(tb->tb_sb);
+ reiserfs_write_lock_nested(tb->tb_sb, depth);
if (FILESYSTEM_CHANGED_TB(tb))
return REPEAT_SEARCH;
}
@@ -1952,6 +1952,7 @@ static int get_neighbors(struct tree_balance *tb, int h)
unsigned long son_number;
struct super_block *sb = tb->tb_sb;
struct buffer_head *bh;
+ int depth;
PROC_INFO_INC(sb, get_neighbors[h]);
@@ -1969,9 +1970,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
FL[h]);
son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
- reiserfs_write_unlock(sb);
+ depth = reiserfs_write_unlock_nested(tb->tb_sb);
bh = sb_bread(sb, son_number);
- reiserfs_write_lock(sb);
+ reiserfs_write_lock_nested(tb->tb_sb, depth);
if (!bh)
return IO_ERROR;
if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2009,9 +2010,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
child_position =
(bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
- reiserfs_write_unlock(sb);
+ depth = reiserfs_write_unlock_nested(tb->tb_sb);
bh = sb_bread(sb, son_number);
- reiserfs_write_lock(sb);
+ reiserfs_write_lock_nested(tb->tb_sb, depth);
if (!bh)
return IO_ERROR;
if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2272,6 +2273,7 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
}
if (locked) {
+ int depth;
#ifdef CONFIG_REISERFS_CHECK
repeat_counter++;
if ((repeat_counter % 10000) == 0) {
@@ -2286,9 +2288,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
REPEAT_SEARCH : CARRY_ON;
}
#endif
- reiserfs_write_unlock(tb->tb_sb);
+ depth = reiserfs_write_unlock_nested(tb->tb_sb);
__wait_on_buffer(locked);
- reiserfs_write_lock(tb->tb_sb);
+ reiserfs_write_lock_nested(tb->tb_sb, depth);
if (FILESYSTEM_CHANGED_TB(tb))
return REPEAT_SEARCH;
}
@@ -2359,9 +2361,9 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
/* if it possible in indirect_to_direct conversion */
if (buffer_locked(tbS0)) {
- reiserfs_write_unlock(tb->tb_sb);
+ int depth = reiserfs_write_unlock_nested(tb->tb_sb);
__wait_on_buffer(tbS0);
- reiserfs_write_lock(tb->tb_sb);
+ reiserfs_write_lock_nested(tb->tb_sb, depth);
if (FILESYSTEM_CHANGED_TB(tb))
return REPEAT_SEARCH;
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0048cc16a6a8..ad62bdbb451e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -30,7 +30,6 @@ void reiserfs_evict_inode(struct inode *inode)
JOURNAL_PER_BALANCE_CNT * 2 +
2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
struct reiserfs_transaction_handle th;
- int depth;
int err;
if (!inode->i_nlink && !is_bad_inode(inode))
@@ -40,12 +39,13 @@ void reiserfs_evict_inode(struct inode *inode)
if (inode->i_nlink)
goto no_delete;
- depth = reiserfs_write_lock_once(inode->i_sb);
-
/* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
+
reiserfs_delete_xattrs(inode);
+ reiserfs_write_lock(inode->i_sb);
+
if (journal_begin(&th, inode->i_sb, jbegin_count))
goto out;
reiserfs_update_inode_transaction(inode);
@@ -57,8 +57,11 @@ void reiserfs_evict_inode(struct inode *inode)
/* Do quota update inside a transaction for journaled quotas. We must do that
* after delete_object so that quota updates go into the same transaction as
* stat data deletion */
- if (!err)
+ if (!err) {
+ int depth = reiserfs_write_unlock_nested(inode->i_sb);
dquot_free_inode(inode);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
+ }
if (journal_end(&th, inode->i_sb, jbegin_count))
goto out;
@@ -72,12 +75,12 @@ void reiserfs_evict_inode(struct inode *inode)
/* all items of file are deleted, so we can remove "save" link */
remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything
* about an error here */
+out:
+ reiserfs_write_unlock(inode->i_sb);
} else {
/* no object items are in the tree */
;
}
- out:
- reiserfs_write_unlock_once(inode->i_sb, depth);
clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */
dquot_drop(inode);
inode->i_blocks = 0;
@@ -610,7 +613,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
__le32 *item;
int done;
int fs_gen;
- int lock_depth;
struct reiserfs_transaction_handle *th = NULL;
/* space reserved in transaction batch:
. 3 balancings in direct->indirect conversion
@@ -626,11 +628,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
loff_t new_offset =
(((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
- lock_depth = reiserfs_write_lock_once(inode->i_sb);
+ reiserfs_write_lock(inode->i_sb);
version = get_inode_item_key_version(inode);
if (!file_capable(inode, block)) {
- reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+ reiserfs_write_unlock(inode->i_sb);
return -EFBIG;
}
@@ -642,7 +644,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
/* find number of block-th logical block of the file */
ret = _get_block_create_0(inode, block, bh_result,
create | GET_BLOCK_READ_DIRECT);
- reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+ reiserfs_write_unlock(inode->i_sb);
return ret;
}
/*
@@ -760,7 +762,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
if (!dangle && th)
retval = reiserfs_end_persistent_transaction(th);
- reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+ reiserfs_write_unlock(inode->i_sb);
/* the item was found, so new blocks were not added to the file
** there is no need to make sure the inode is updated with this
@@ -1011,11 +1013,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
* long time. reschedule if needed and also release the write
* lock for others.
*/
- if (need_resched()) {
- reiserfs_write_unlock_once(inode->i_sb, lock_depth);
- schedule();
- lock_depth = reiserfs_write_lock_once(inode->i_sb);
- }
+ reiserfs_cond_resched(inode->i_sb);
retval = search_for_position_by_key(inode->i_sb, &key, &path);
if (retval == IO_ERROR) {
@@ -1050,7 +1048,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
retval = err;
}
- reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+ reiserfs_write_unlock(inode->i_sb);
reiserfs_check_path(&path);
return retval;
}
@@ -1509,14 +1507,15 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
{
struct inode *inode;
struct reiserfs_iget_args args;
+ int depth;
args.objectid = key->on_disk_key.k_objectid;
args.dirid = key->on_disk_key.k_dir_id;
- reiserfs_write_unlock(s);
+ depth = reiserfs_write_unlock_nested(s);
inode = iget5_locked(s, key->on_disk_key.k_objectid,
reiserfs_find_actor, reiserfs_init_locked_inode,
(void *)(&args));
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(s, depth);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -1772,7 +1771,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
struct inode *inode,
struct reiserfs_security_handle *security)
{
- struct super_block *sb;
+ struct super_block *sb = dir->i_sb;
struct reiserfs_iget_args args;
INITIALIZE_PATH(path_to_key);
struct cpu_key key;
@@ -1780,12 +1779,13 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
struct stat_data sd;
int retval;
int err;
+ int depth;
BUG_ON(!th->t_trans_id);
- reiserfs_write_unlock(inode->i_sb);
+ depth = reiserfs_write_unlock_nested(sb);
err = dquot_alloc_inode(inode);
- reiserfs_write_lock(inode->i_sb);
+ reiserfs_write_lock_nested(sb, depth);
if (err)
goto out_end_trans;
if (!dir->i_nlink) {
@@ -1793,8 +1793,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
goto out_bad_inode;
}
- sb = dir->i_sb;
-
/* item head of new item */
ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
@@ -1812,10 +1810,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
- reiserfs_write_unlock(inode->i_sb);
+ depth = reiserfs_write_unlock_nested(inode->i_sb);
err = insert_inode_locked4(inode, args.objectid,
reiserfs_find_actor, &args);
- reiserfs_write_lock(inode->i_sb);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
if (err) {
err = -EINVAL;
goto out_bad_inode;
@@ -1941,7 +1939,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
}
if (reiserfs_posixacl(inode->i_sb)) {
+ reiserfs_write_unlock(inode->i_sb);
retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
+ reiserfs_write_lock(inode->i_sb);
if (retval) {
err = retval;
reiserfs_check_path(&path_to_key);
@@ -1956,7 +1956,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
inode->i_flags |= S_PRIVATE;
if (security->name) {
+ reiserfs_write_unlock(inode->i_sb);
retval = reiserfs_security_write(th, inode, security);
+ reiserfs_write_lock(inode->i_sb);
if (retval) {
err = retval;
reiserfs_check_path(&path_to_key);
@@ -1982,14 +1984,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
INODE_PKEY(inode)->k_objectid = 0;
/* Quota change must be inside a transaction for journaling */
+ depth = reiserfs_write_unlock_nested(inode->i_sb);
dquot_free_inode(inode);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
out_end_trans:
journal_end(th, th->t_super, th->t_blocks_allocated);
- reiserfs_write_unlock(inode->i_sb);
/* Drop can be outside and it needs more credits so it's better to have it outside */
+ depth = reiserfs_write_unlock_nested(inode->i_sb);
dquot_drop(inode);
- reiserfs_write_lock(inode->i_sb);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
inode->i_flags |= S_NOQUOTA;
make_bad_inode(inode);
@@ -2103,9 +2107,8 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
int error;
struct buffer_head *bh = NULL;
int err2;
- int lock_depth;
- lock_depth = reiserfs_write_lock_once(inode->i_sb);
+ reiserfs_write_lock(inode->i_sb);
if (inode->i_size > 0) {
error = grab_tail_page(inode, &page, &bh);
@@ -2174,7 +2177,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
page_cache_release(page);
}
- reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+ reiserfs_write_unlock(inode->i_sb);
return 0;
out:
@@ -2183,7 +2186,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
page_cache_release(page);
}
- reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+ reiserfs_write_unlock(inode->i_sb);
return error;
}
@@ -2648,10 +2651,11 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
struct inode *inode = page->mapping->host;
int ret;
int old_ref = 0;
+ int depth;
- reiserfs_write_unlock(inode->i_sb);
+ depth = reiserfs_write_unlock_nested(inode->i_sb);
reiserfs_wait_on_write_block(inode->i_sb);
- reiserfs_write_lock(inode->i_sb);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
fix_tail_page_for_writing(page);
if (reiserfs_transaction_running(inode->i_sb)) {
@@ -2708,7 +2712,6 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
int update_sd = 0;
struct reiserfs_transaction_handle *th;
unsigned start;
- int lock_depth = 0;
bool locked = false;
if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
@@ -2737,7 +2740,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
*/
if (pos + copied > inode->i_size) {
struct reiserfs_transaction_handle myth;
- lock_depth = reiserfs_write_lock_once(inode->i_sb);
+ reiserfs_write_lock(inode->i_sb);
locked = true;
/* If the file have grown beyond the border where it
can have a tail, unmark it as needing a tail
@@ -2768,7 +2771,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
}
if (th) {
if (!locked) {
- lock_depth = reiserfs_write_lock_once(inode->i_sb);
+ reiserfs_write_lock(inode->i_sb);
locked = true;
}
if (!update_sd)
@@ -2780,7 +2783,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
out:
if (locked)
- reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+ reiserfs_write_unlock(inode->i_sb);
unlock_page(page);
page_cache_release(page);
@@ -2790,7 +2793,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
return ret == 0 ? copied : ret;
journal_error:
- reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+ reiserfs_write_unlock(inode->i_sb);
locked = false;
if (th) {
if (!update_sd)
@@ -2808,10 +2811,11 @@ int reiserfs_commit_write(struct file *f, struct page *page,
int ret = 0;
int update_sd = 0;
struct reiserfs_transaction_handle *th = NULL;
+ int depth;
- reiserfs_write_unlock(inode->i_sb);
+ depth = reiserfs_write_unlock_nested(inode->i_sb);
reiserfs_wait_on_write_block(inode->i_sb);
- reiserfs_write_lock(inode->i_sb);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
if (reiserfs_transaction_running(inode->i_sb)) {
th = current->journal_info;
@@ -3110,7 +3114,6 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
unsigned int ia_valid;
- int depth;
int error;
error = inode_change_ok(inode, attr);
@@ -3122,13 +3125,14 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
if (is_quota_modification(inode, attr))
dquot_initialize(inode);
- depth = reiserfs_write_lock_once(inode->i_sb);
+ reiserfs_write_lock(inode->i_sb);
if (attr->ia_valid & ATTR_SIZE) {
/* version 2 items will be caught by the s_maxbytes check
** done for us in vmtruncate
*/
if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
attr->ia_size > MAX_NON_LFS) {
+ reiserfs_write_unlock(inode->i_sb);
error = -EFBIG;
goto out;
}
@@ -3150,8 +3154,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
error = err;
}
- if (error)
+ if (error) {
+ reiserfs_write_unlock(inode->i_sb);
goto out;
+ }
/*
* file size is changed, ctime and mtime are
* to be updated
@@ -3159,6 +3165,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
}
}
+ reiserfs_write_unlock(inode->i_sb);
if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) ||
((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) &&
@@ -3183,14 +3190,16 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
return error;
/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
+ reiserfs_write_lock(inode->i_sb);
error = journal_begin(&th, inode->i_sb, jbegin_count);
+ reiserfs_write_unlock(inode->i_sb);
if (error)
goto out;
- reiserfs_write_unlock_once(inode->i_sb, depth);
error = dquot_transfer(inode, attr);
- depth = reiserfs_write_lock_once(inode->i_sb);
+ reiserfs_write_lock(inode->i_sb);
if (error) {
journal_end(&th, inode->i_sb, jbegin_count);
+ reiserfs_write_unlock(inode->i_sb);
goto out;
}
@@ -3202,17 +3211,11 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
inode->i_gid = attr->ia_gid;
mark_inode_dirty(inode);
error = journal_end(&th, inode->i_sb, jbegin_count);
+ reiserfs_write_unlock(inode->i_sb);
if (error)
goto out;
}
- /*
- * Relax the lock here, as it might truncate the
- * inode pages and wait for inode pages locks.
- * To release such page lock, the owner needs the
- * reiserfs lock
- */
- reiserfs_write_unlock_once(inode->i_sb, depth);
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
error = inode_newsize_ok(inode, attr->ia_size);
@@ -3226,16 +3229,13 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
setattr_copy(inode, attr);
mark_inode_dirty(inode);
}
- depth = reiserfs_write_lock_once(inode->i_sb);
if (!error && reiserfs_posixacl(inode->i_sb)) {
if (attr->ia_valid & ATTR_MODE)
error = reiserfs_acl_chmod(inode);
}
- out:
- reiserfs_write_unlock_once(inode->i_sb, depth);
-
+out:
return error;
}
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 15cb5fe6b425..946ccbf5b5a1 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -167,7 +167,6 @@ int reiserfs_commit_write(struct file *f, struct page *page,
int reiserfs_unpack(struct inode *inode, struct file *filp)
{
int retval = 0;
- int depth;
int index;
struct page *page;
struct address_space *mapping;
@@ -183,11 +182,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
return 0;
}
- depth = reiserfs_write_lock_once(inode->i_sb);
-
/* we need to make sure nobody is changing the file size beneath us */
reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
+ reiserfs_write_lock(inode->i_sb);
+
write_from = inode->i_size & (blocksize - 1);
/* if we are on a block boundary, we are already unpacked. */
if (write_from == 0) {
@@ -221,6 +220,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
out:
mutex_unlock(&inode->i_mutex);
- reiserfs_write_unlock_once(inode->i_sb, depth);
+ reiserfs_write_unlock(inode->i_sb);
return retval;
}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 742fdd4c209a..73feacc49b2e 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -947,9 +947,11 @@ static int reiserfs_async_progress_wait(struct super_block *s)
struct reiserfs_journal *j = SB_JOURNAL(s);
if (atomic_read(&j->j_async_throttle)) {
- reiserfs_write_unlock(s);
+ int depth;
+
+ depth = reiserfs_write_unlock_nested(s);
congestion_wait(BLK_RW_ASYNC, HZ / 10);
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(s, depth);
}
return 0;
@@ -972,6 +974,7 @@ static int flush_commit_list(struct super_block *s,
struct reiserfs_journal *journal = SB_JOURNAL(s);
int retval = 0;
int write_len;
+ int depth;
reiserfs_check_lock_depth(s, "flush_commit_list");
@@ -1018,12 +1021,12 @@ static int flush_commit_list(struct super_block *s,
* We might sleep in numerous places inside
* write_ordered_buffers. Relax the write lock.
*/
- reiserfs_write_unlock(s);
+ depth = reiserfs_write_unlock_nested(s);
ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
journal, jl, &jl->j_bh_list);
if (ret < 0 && retval == 0)
retval = ret;
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(s, depth);
}
BUG_ON(!list_empty(&jl->j_bh_list));
/*
@@ -1043,9 +1046,9 @@ static int flush_commit_list(struct super_block *s,
tbh = journal_find_get_block(s, bn);
if (tbh) {
if (buffer_dirty(tbh)) {
- reiserfs_write_unlock(s);
+ depth = reiserfs_write_unlock_nested(s);
ll_rw_block(WRITE, 1, &tbh);
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(s, depth);
}
put_bh(tbh) ;
}
@@ -1057,17 +1060,17 @@ static int flush_commit_list(struct super_block *s,
(jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
tbh = journal_find_get_block(s, bn);
- reiserfs_write_unlock(s);
- wait_on_buffer(tbh);
- reiserfs_write_lock(s);
+ depth = reiserfs_write_unlock_nested(s);
+ __wait_on_buffer(tbh);
+ reiserfs_write_lock_nested(s, depth);
// since we're using ll_rw_blk above, it might have skipped over
// a locked buffer. Double check here
//
/* redundant, sync_dirty_buffer() checks */
if (buffer_dirty(tbh)) {
- reiserfs_write_unlock(s);
+ depth = reiserfs_write_unlock_nested(s);
sync_dirty_buffer(tbh);
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(s, depth);
}
if (unlikely(!buffer_uptodate(tbh))) {
#ifdef CONFIG_REISERFS_CHECK
@@ -1091,12 +1094,12 @@ static int flush_commit_list(struct super_block *s,
if (buffer_dirty(jl->j_commit_bh))
BUG();
mark_buffer_dirty(jl->j_commit_bh) ;
- reiserfs_write_unlock(s);
+ depth = reiserfs_write_unlock_nested(s);
if (reiserfs_barrier_flush(s))
__sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
else
sync_dirty_buffer(jl->j_commit_bh);
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(s, depth);
}
/* If there was a write error in the journal - we can't commit this
@@ -1228,15 +1231,16 @@ static int _update_journal_header_block(struct super_block *sb,
{
struct reiserfs_journal_header *jh;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
+ int depth;
if (reiserfs_is_journal_aborted(journal))
return -EIO;
if (trans_id >= journal->j_last_flush_trans_id) {
if (buffer_locked((journal->j_header_bh))) {
- reiserfs_write_unlock(sb);
- wait_on_buffer((journal->j_header_bh));
- reiserfs_write_lock(sb);
+ depth = reiserfs_write_unlock_nested(sb);
+ __wait_on_buffer(journal->j_header_bh);
+ reiserfs_write_lock_nested(sb, depth);
if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
#ifdef CONFIG_REISERFS_CHECK
reiserfs_warning(sb, "journal-699",
@@ -1254,14 +1258,14 @@ static int _update_journal_header_block(struct super_block *sb,
jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
set_buffer_dirty(journal->j_header_bh);
- reiserfs_write_unlock(sb);
+ depth = reiserfs_write_unlock_nested(sb);
if (reiserfs_barrier_flush(sb))
__sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
else
sync_dirty_buffer(journal->j_header_bh);
- reiserfs_write_lock(sb);
+ reiserfs_write_lock_nested(sb, depth);
if (!buffer_uptodate(journal->j_header_bh)) {
reiserfs_warning(sb, "journal-837",
"IO error during journal replay");
@@ -1341,6 +1345,7 @@ static int flush_journal_list(struct super_block *s,
unsigned long j_len_saved = jl->j_len;
struct reiserfs_journal *journal = SB_JOURNAL(s);
int err = 0;
+ int depth;
BUG_ON(j_len_saved <= 0);
@@ -1495,9 +1500,9 @@ static int flush_journal_list(struct super_block *s,
"cn->bh is NULL");
}
- reiserfs_write_unlock(s);
- wait_on_buffer(cn->bh);
- reiserfs_write_lock(s);
+ depth = reiserfs_write_unlock_nested(s);
+ __wait_on_buffer(cn->bh);
+ reiserfs_write_lock_nested(s, depth);
if (!cn->bh) {
reiserfs_panic(s, "journal-1012",
@@ -1974,6 +1979,7 @@ static int journal_compare_desc_commit(struct super_block *sb,
/* returns 0 if it did not find a description block
** returns -1 if it found a corrupt commit block
** returns 1 if both desc and commit were valid
+** NOTE: only called during fs mount
*/
static int journal_transaction_is_valid(struct super_block *sb,
struct buffer_head *d_bh,
@@ -2073,8 +2079,9 @@ static void brelse_array(struct buffer_head **heads, int num)
/*
** given the start, and values for the oldest acceptable transactions,
-** this either reads in a replays a transaction, or returns because the transaction
-** is invalid, or too old.
+** this either reads in a replays a transaction, or returns because the
+** transaction is invalid, or too old.
+** NOTE: only called during fs mount
*/
static int journal_read_transaction(struct super_block *sb,
unsigned long cur_dblock,
@@ -2208,10 +2215,7 @@ static int journal_read_transaction(struct super_block *sb,
ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
for (i = 0; i < get_desc_trans_len(desc); i++) {
- reiserfs_write_unlock(sb);
wait_on_buffer(log_blocks[i]);
- reiserfs_write_lock(sb);
-
if (!buffer_uptodate(log_blocks[i])) {
reiserfs_warning(sb, "journal-1212",
"REPLAY FAILURE fsck required! "
@@ -2318,12 +2322,13 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
/*
** read and replay the log
-** on a clean unmount, the journal header's next unflushed pointer will be to an invalid
-** transaction. This tests that before finding all the transactions in the log, which makes normal mount times fast.
-**
-** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid.
-**
+** on a clean unmount, the journal header's next unflushed pointer will
+** be to an invalid transaction. This tests that before finding all the
+** transactions in the log, which makes normal mount times fast.
+** After a crash, this starts with the next unflushed transaction, and
+** replays until it finds one too old, or invalid.
** On exit, it sets things up so the first transaction will work correctly.
+** NOTE: only called during fs mount
*/
static int journal_read(struct super_block *sb)
{
@@ -2501,14 +2506,18 @@ static int journal_read(struct super_block *sb)
"replayed %d transactions in %lu seconds\n",
replay_count, get_seconds() - start);
}
+ /* needed to satisfy the locking in _update_journal_header_block */
+ reiserfs_write_lock(sb);
if (!bdev_read_only(sb->s_bdev) &&
_update_journal_header_block(sb, journal->j_start,
journal->j_last_flush_trans_id)) {
+ reiserfs_write_unlock(sb);
/* replay failed, caller must call free_journal_ram and abort
** the mount
*/
return -1;
}
+ reiserfs_write_unlock(sb);
return 0;
}
@@ -2828,13 +2837,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
goto free_and_return;
}
- /*
- * Journal_read needs to be inspected in order to push down
- * the lock further inside (or even remove it).
- */
- reiserfs_write_lock(sb);
ret = journal_read(sb);
- reiserfs_write_unlock(sb);
if (ret < 0) {
reiserfs_warning(sb, "reiserfs-2006",
"Replay Failure, unable to mount");
@@ -2923,9 +2926,9 @@ static void queue_log_writer(struct super_block *s)
add_wait_queue(&journal->j_join_wait, &wait);
set_current_state(TASK_UNINTERRUPTIBLE);
if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
- reiserfs_write_unlock(s);
+ int depth = reiserfs_write_unlock_nested(s);
schedule();
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(s, depth);
}
__set_current_state(TASK_RUNNING);
remove_wait_queue(&journal->j_join_wait, &wait);
@@ -2943,9 +2946,12 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
struct reiserfs_journal *journal = SB_JOURNAL(sb);
unsigned long bcount = journal->j_bcount;
while (1) {
- reiserfs_write_unlock(sb);
+ int depth;
+
+ depth = reiserfs_write_unlock_nested(sb);
schedule_timeout_uninterruptible(1);
- reiserfs_write_lock(sb);
+ reiserfs_write_lock_nested(sb, depth);
+
journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
while ((atomic_read(&journal->j_wcount) > 0 ||
atomic_read(&journal->j_jlock)) &&
@@ -2976,6 +2982,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
struct reiserfs_transaction_handle myth;
int sched_count = 0;
int retval;
+ int depth;
reiserfs_check_lock_depth(sb, "journal_begin");
BUG_ON(nblocks > journal->j_trans_max);
@@ -2996,9 +3003,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
unlock_journal(sb);
- reiserfs_write_unlock(sb);
+ depth = reiserfs_write_unlock_nested(sb);
reiserfs_wait_on_write_block(sb);
- reiserfs_write_lock(sb);
+ reiserfs_write_lock_nested(sb, depth);
PROC_INFO_INC(sb, journal.journal_relock_writers);
goto relock;
}
@@ -3821,6 +3828,7 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb,
if (test_clear_buffer_journal_restore_dirty(bh) &&
buffer_journal_dirty(bh)) {
struct reiserfs_journal_cnode *cn;
+ reiserfs_write_lock(sb);
cn = get_journal_hash_dev(sb,
journal->j_list_hash_table,
bh->b_blocknr);
@@ -3828,6 +3836,7 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb,
set_buffer_journal_test(bh);
mark_buffer_dirty(bh);
}
+ reiserfs_write_unlock(sb);
}
clear_buffer_journal_prepared(bh);
}
@@ -3911,6 +3920,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
unsigned long jindex;
unsigned int commit_trans_id;
int trans_half;
+ int depth;
BUG_ON(th->t_refcount > 1);
BUG_ON(!th->t_trans_id);
@@ -4116,9 +4126,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
next = cn->next;
free_cnode(sb, cn);
cn = next;
- reiserfs_write_unlock(sb);
- cond_resched();
- reiserfs_write_lock(sb);
+ reiserfs_cond_resched(sb);
}
/* we are done with both the c_bh and d_bh, but
@@ -4165,10 +4173,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
* is lost.
*/
if (!list_empty(&jl->j_tail_bh_list)) {
- reiserfs_write_unlock(sb);
+ depth = reiserfs_write_unlock_nested(sb);
write_ordered_buffers(&journal->j_dirty_buffers_lock,
journal, jl, &jl->j_tail_bh_list);
- reiserfs_write_lock(sb);
+ reiserfs_write_lock_nested(sb, depth);
}
BUG_ON(!list_empty(&jl->j_tail_bh_list));
mutex_unlock(&jl->j_commit_mutex);
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
index d735bc8470e3..045b83ef9fd9 100644
--- a/fs/reiserfs/lock.c
+++ b/fs/reiserfs/lock.c
@@ -48,30 +48,35 @@ void reiserfs_write_unlock(struct super_block *s)
}
}
-/*
- * If we already own the lock, just exit and don't increase the depth.
- * Useful when we don't want to lock more than once.
- *
- * We always return the lock_depth we had before calling
- * this function.
- */
-int reiserfs_write_lock_once(struct super_block *s)
+int __must_check reiserfs_write_unlock_nested(struct super_block *s)
{
struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+ int depth;
- if (sb_i->lock_owner != current) {
- mutex_lock(&sb_i->lock);
- sb_i->lock_owner = current;
- return sb_i->lock_depth++;
- }
+ /* this can happen when the lock isn't always held */
+ if (sb_i->lock_owner != current)
+ return -1;
+
+ depth = sb_i->lock_depth;
+
+ sb_i->lock_depth = -1;
+ sb_i->lock_owner = NULL;
+ mutex_unlock(&sb_i->lock);
- return sb_i->lock_depth;
+ return depth;
}
-void reiserfs_write_unlock_once(struct super_block *s, int lock_depth)
+void reiserfs_write_lock_nested(struct super_block *s, int depth)
{
- if (lock_depth == -1)
- reiserfs_write_unlock(s);
+ struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
+
+ /* this can happen when the lock isn't always held */
+ if (depth == -1)
+ return;
+
+ mutex_lock(&sb_i->lock);
+ sb_i->lock_owner = current;
+ sb_i->lock_depth = depth;
}
/*
@@ -82,9 +87,7 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
{
struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
- if (sb_i->lock_depth < 0)
- reiserfs_panic(sb, "%s called without kernel lock held %d",
- caller);
+ WARN_ON(sb_i->lock_depth < 0);
}
#ifdef CONFIG_REISERFS_CHECK
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 8567fb847601..dc5236f6de1b 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -325,7 +325,6 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
int retval;
- int lock_depth;
struct inode *inode = NULL;
struct reiserfs_dir_entry de;
INITIALIZE_PATH(path_to_entry);
@@ -333,12 +332,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
return ERR_PTR(-ENAMETOOLONG);
- /*
- * Might be called with or without the write lock, must be careful
- * to not recursively hold it in case we want to release the lock
- * before rescheduling.
- */
- lock_depth = reiserfs_write_lock_once(dir->i_sb);
+ reiserfs_write_lock(dir->i_sb);
de.de_gen_number_bit_string = NULL;
retval =
@@ -349,7 +343,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
inode = reiserfs_iget(dir->i_sb,
(struct cpu_key *)&(de.de_dir_id));
if (!inode || IS_ERR(inode)) {
- reiserfs_write_unlock_once(dir->i_sb, lock_depth);
+ reiserfs_write_unlock(dir->i_sb);
return ERR_PTR(-EACCES);
}
@@ -358,7 +352,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
if (IS_PRIVATE(dir))
inode->i_flags |= S_PRIVATE;
}
- reiserfs_write_unlock_once(dir->i_sb, lock_depth);
+ reiserfs_write_unlock(dir->i_sb);
if (retval == IO_ERROR) {
return ERR_PTR(-EIO);
}
@@ -727,7 +721,6 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
struct inode *inode;
struct reiserfs_transaction_handle th;
struct reiserfs_security_handle security;
- int lock_depth;
/* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 +
@@ -753,7 +746,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
return retval;
}
jbegin_count += retval;
- lock_depth = reiserfs_write_lock_once(dir->i_sb);
+ reiserfs_write_lock(dir->i_sb);
retval = journal_begin(&th, dir->i_sb, jbegin_count);
if (retval) {
@@ -804,7 +797,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
d_instantiate(dentry, inode);
retval = journal_end(&th, dir->i_sb, jbegin_count);
out_failed:
- reiserfs_write_unlock_once(dir->i_sb, lock_depth);
+ reiserfs_write_unlock(dir->i_sb);
return retval;
}
@@ -920,7 +913,6 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
struct reiserfs_transaction_handle th;
int jbegin_count;
unsigned long savelink;
- int depth;
dquot_initialize(dir);
@@ -934,7 +926,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
- depth = reiserfs_write_lock_once(dir->i_sb);
+ reiserfs_write_lock(dir->i_sb);
retval = journal_begin(&th, dir->i_sb, jbegin_count);
if (retval)
goto out_unlink;
@@ -995,7 +987,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
retval = journal_end(&th, dir->i_sb, jbegin_count);
reiserfs_check_path(&path);
- reiserfs_write_unlock_once(dir->i_sb, depth);
+ reiserfs_write_unlock(dir->i_sb);
return retval;
end_unlink:
@@ -1005,7 +997,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
if (err)
retval = err;
out_unlink:
- reiserfs_write_unlock_once(dir->i_sb, depth);
+ reiserfs_write_unlock(dir->i_sb);
return retval;
}
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index c0b1112ab7e3..54944d5a4a6e 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -358,12 +358,13 @@ void __reiserfs_panic(struct super_block *sb, const char *id,
dump_stack();
#endif
if (sb)
- panic(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n",
+ printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n",
sb->s_id, id ? id : "", id ? " " : "",
function, error_buf);
else
- panic(KERN_WARNING "REISERFS panic: %s%s%s: %s\n",
+ printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n",
id ? id : "", id ? " " : "", function, error_buf);
+ BUG();
}
void __reiserfs_error(struct super_block *sb, const char *id,
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 33532f79b4f7..a958444a75fc 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -19,12 +19,13 @@
/*
* LOCKING:
*
- * We rely on new Alexander Viro's super-block locking.
+ * These guys are evicted from procfs as the very first step in ->kill_sb().
*
*/
-static int show_version(struct seq_file *m, struct super_block *sb)
+static int show_version(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
char *format;
if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
@@ -66,8 +67,9 @@ static int show_version(struct seq_file *m, struct super_block *sb)
#define DJP( x ) le32_to_cpu( jp -> x )
#define JF( x ) ( r -> s_journal -> x )
-static int show_super(struct seq_file *m, struct super_block *sb)
+static int show_super(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *r = REISERFS_SB(sb);
seq_printf(m, "state: \t%s\n"
@@ -128,8 +130,9 @@ static int show_super(struct seq_file *m, struct super_block *sb)
return 0;
}
-static int show_per_level(struct seq_file *m, struct super_block *sb)
+static int show_per_level(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *r = REISERFS_SB(sb);
int level;
@@ -186,8 +189,9 @@ static int show_per_level(struct seq_file *m, struct super_block *sb)
return 0;
}
-static int show_bitmap(struct seq_file *m, struct super_block *sb)
+static int show_bitmap(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *r = REISERFS_SB(sb);
seq_printf(m, "free_block: %lu\n"
@@ -218,8 +222,9 @@ static int show_bitmap(struct seq_file *m, struct super_block *sb)
return 0;
}
-static int show_on_disk_super(struct seq_file *m, struct super_block *sb)
+static int show_on_disk_super(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
struct reiserfs_super_block *rs = sb_info->s_rs;
int hash_code = DFL(s_hash_function_code);
@@ -261,8 +266,9 @@ static int show_on_disk_super(struct seq_file *m, struct super_block *sb)
return 0;
}
-static int show_oidmap(struct seq_file *m, struct super_block *sb)
+static int show_oidmap(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
struct reiserfs_super_block *rs = sb_info->s_rs;
unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
@@ -291,8 +297,9 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb)
return 0;
}
-static int show_journal(struct seq_file *m, struct super_block *sb)
+static int show_journal(struct seq_file *m, void *unused)
{
+ struct super_block *sb = m->private;
struct reiserfs_sb_info *r = REISERFS_SB(sb);
struct reiserfs_super_block *rs = r->s_rs;
struct journal_params *jp = &rs->s_v1.s_journal;
@@ -383,92 +390,24 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
return 0;
}
-/* iterator */
-static int test_sb(struct super_block *sb, void *data)
-{
- return data == sb;
-}
-
-static int set_sb(struct super_block *sb, void *data)
-{
- return -ENOENT;
-}
-
-struct reiserfs_seq_private {
- struct super_block *sb;
- int (*show) (struct seq_file *, struct super_block *);
-};
-
-static void *r_start(struct seq_file *m, loff_t * pos)
-{
- struct reiserfs_seq_private *priv = m->private;
- loff_t l = *pos;
-
- if (l)
- return NULL;
-
- if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb)))
- return NULL;
-
- up_write(&priv->sb->s_umount);
- return priv->sb;
-}
-
-static void *r_next(struct seq_file *m, void *v, loff_t * pos)
-{
- ++*pos;
- if (v)
- deactivate_super(v);
- return NULL;
-}
-
-static void r_stop(struct seq_file *m, void *v)
-{
- if (v)
- deactivate_super(v);
-}
-
-static int r_show(struct seq_file *m, void *v)
-{
- struct reiserfs_seq_private *priv = m->private;
- return priv->show(m, v);
-}
-
-static const struct seq_operations r_ops = {
- .start = r_start,
- .next = r_next,
- .stop = r_stop,
- .show = r_show,
-};
-
static int r_open(struct inode *inode, struct file *file)
{
- struct reiserfs_seq_private *priv;
- int ret = seq_open_private(file, &r_ops,
- sizeof(struct reiserfs_seq_private));
-
- if (!ret) {
- struct seq_file *m = file->private_data;
- priv = m->private;
- priv->sb = proc_get_parent_data(inode);
- priv->show = PDE_DATA(inode);
- }
- return ret;
+ return single_open(file, PDE_DATA(inode),
+ proc_get_parent_data(inode));
}
static const struct file_operations r_file_operations = {
.open = r_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
- .owner = THIS_MODULE,
+ .release = single_release,
};
static struct proc_dir_entry *proc_info_root = NULL;
static const char proc_info_root_name[] = "fs/reiserfs";
static void add_file(struct super_block *sb, char *name,
- int (*func) (struct seq_file *, struct super_block *))
+ int (*func) (struct seq_file *, void *))
{
proc_create_data(name, 0, REISERFS_SB(sb)->procdir,
&r_file_operations, func);
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 3df5ce6c724d..f8adaee537c2 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -630,8 +630,8 @@ static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
*/
void reiserfs_write_lock(struct super_block *s);
void reiserfs_write_unlock(struct super_block *s);
-int reiserfs_write_lock_once(struct super_block *s);
-void reiserfs_write_unlock_once(struct super_block *s, int lock_depth);
+int __must_check reiserfs_write_unlock_nested(struct super_block *s);
+void reiserfs_write_lock_nested(struct super_block *s, int depth);
#ifdef CONFIG_REISERFS_CHECK
void reiserfs_lock_check_recursive(struct super_block *s);
@@ -667,31 +667,33 @@ static inline void reiserfs_lock_check_recursive(struct super_block *s) { }
* - The inode mutex
*/
static inline void reiserfs_mutex_lock_safe(struct mutex *m,
- struct super_block *s)
+ struct super_block *s)
{
- reiserfs_lock_check_recursive(s);
- reiserfs_write_unlock(s);
+ int depth;
+
+ depth = reiserfs_write_unlock_nested(s);
mutex_lock(m);
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(s, depth);
}
static inline void
reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass,
- struct super_block *s)
+ struct super_block *s)
{
- reiserfs_lock_check_recursive(s);
- reiserfs_write_unlock(s);
+ int depth;
+
+ depth = reiserfs_write_unlock_nested(s);
mutex_lock_nested(m, subclass);
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(s, depth);
}
static inline void
reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s)
{
- reiserfs_lock_check_recursive(s);
- reiserfs_write_unlock(s);
- down_read(sem);
- reiserfs_write_lock(s);
+ int depth;
+ depth = reiserfs_write_unlock_nested(s);
+ down_read(sem);
+ reiserfs_write_lock_nested(s, depth);
}
/*
@@ -701,9 +703,11 @@ reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s)
static inline void reiserfs_cond_resched(struct super_block *s)
{
if (need_resched()) {
- reiserfs_write_unlock(s);
+ int depth;
+
+ depth = reiserfs_write_unlock_nested(s);
schedule();
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(s, depth);
}
}
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 3ce02cff5e90..a4ef5cd606eb 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -34,6 +34,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
unsigned long int block_count, free_blocks;
int i;
int copy_size;
+ int depth;
sb = SB_DISK_SUPER_BLOCK(s);
@@ -43,7 +44,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
}
/* check the device size */
+ depth = reiserfs_write_unlock_nested(s);
bh = sb_bread(s, block_count_new - 1);
+ reiserfs_write_lock_nested(s, depth);
if (!bh) {
printk("reiserfs_resize: can\'t read last block\n");
return -EINVAL;
@@ -125,9 +128,12 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
* transaction begins, and the new bitmaps don't matter if the
* transaction fails. */
for (i = bmap_nr; i < bmap_nr_new; i++) {
+ int depth;
/* don't use read_bitmap_block since it will cache
* the uninitialized bitmap */
+ depth = reiserfs_write_unlock_nested(s);
bh = sb_bread(s, i * s->s_blocksize * 8);
+ reiserfs_write_lock_nested(s, depth);
if (!bh) {
vfree(bitmap);
return -EIO;
@@ -138,9 +144,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
- reiserfs_write_unlock(s);
+ depth = reiserfs_write_unlock_nested(s);
sync_dirty_buffer(bh);
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(s, depth);
// update bitmap_info stuff
bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
brelse(bh);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 2f40a4c70a4d..b14706a05d52 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -524,14 +524,14 @@ static int is_tree_node(struct buffer_head *bh, int level)
* the caller (search_by_key) will perform other schedule-unsafe
* operations just after calling this function.
*
- * @return true if we have unlocked
+ * @return depth of lock to be restored after read completes
*/
-static bool search_by_key_reada(struct super_block *s,
+static int search_by_key_reada(struct super_block *s,
struct buffer_head **bh,
b_blocknr_t *b, int num)
{
int i, j;
- bool unlocked = false;
+ int depth = -1;
for (i = 0; i < num; i++) {
bh[i] = sb_getblk(s, b[i]);
@@ -549,15 +549,13 @@ static bool search_by_key_reada(struct super_block *s,
* you have to make sure the prepared bit isn't set on this buffer
*/
if (!buffer_uptodate(bh[j])) {
- if (!unlocked) {
- reiserfs_write_unlock(s);
- unlocked = true;
- }
+ if (depth == -1)
+ depth = reiserfs_write_unlock_nested(s);
ll_rw_block(READA, 1, bh + j);
}
brelse(bh[j]);
}
- return unlocked;
+ return depth;
}
/**************************************************************************
@@ -645,26 +643,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
have a pointer to it. */
if ((bh = last_element->pe_buffer =
sb_getblk(sb, block_number))) {
- bool unlocked = false;
- if (!buffer_uptodate(bh) && reada_count > 1)
- /* may unlock the write lock */
- unlocked = search_by_key_reada(sb, reada_bh,
- reada_blocks, reada_count);
/*
- * If we haven't already unlocked the write lock,
- * then we need to do that here before reading
- * the current block
+ * We'll need to drop the lock if we encounter any
+ * buffers that need to be read. If all of them are
+ * already up to date, we don't need to drop the lock.
*/
- if (!buffer_uptodate(bh) && !unlocked) {
- reiserfs_write_unlock(sb);
- unlocked = true;
- }
+ int depth = -1;
+
+ if (!buffer_uptodate(bh) && reada_count > 1)
+ depth = search_by_key_reada(sb, reada_bh,
+ reada_blocks, reada_count);
+
+ if (!buffer_uptodate(bh) && depth == -1)
+ depth = reiserfs_write_unlock_nested(sb);
+
ll_rw_block(READ, 1, &bh);
wait_on_buffer(bh);
- if (unlocked)
- reiserfs_write_lock(sb);
+ if (depth != -1)
+ reiserfs_write_lock_nested(sb, depth);
if (!buffer_uptodate(bh))
goto io_error;
} else {
@@ -1059,9 +1057,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
reiserfs_free_block(th, inode, block, 1);
}
- reiserfs_write_unlock(sb);
- cond_resched();
- reiserfs_write_lock(sb);
+ reiserfs_cond_resched(sb);
if (item_moved (&s_ih, path)) {
need_re_search = 1;
@@ -1190,6 +1186,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
struct item_head *q_ih;
int quota_cut_bytes;
int ret_value, del_size, removed;
+ int depth;
#ifdef CONFIG_REISERFS_CHECK
char mode;
@@ -1299,7 +1296,9 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
"reiserquota delete_item(): freeing %u, id=%u type=%c",
quota_cut_bytes, inode->i_uid, head2type(&s_ih));
#endif
+ depth = reiserfs_write_unlock_nested(inode->i_sb);
dquot_free_space_nodirty(inode, quota_cut_bytes);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
/* Return deleted body length */
return ret_value;
@@ -1325,6 +1324,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
struct inode *inode, struct reiserfs_key *key)
{
+ struct super_block *sb = th->t_super;
struct tree_balance tb;
INITIALIZE_PATH(path);
int item_len = 0;
@@ -1377,14 +1377,17 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
if (retval == CARRY_ON) {
do_balance(&tb, NULL, NULL, M_DELETE);
if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */
+ int depth;
#ifdef REISERQUOTA_DEBUG
reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
"reiserquota delete_solid_item(): freeing %u id=%u type=%c",
quota_cut_bytes, inode->i_uid,
key2type(key));
#endif
+ depth = reiserfs_write_unlock_nested(sb);
dquot_free_space_nodirty(inode,
quota_cut_bytes);
+ reiserfs_write_lock_nested(sb, depth);
}
break;
}
@@ -1561,6 +1564,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
int retval2 = -1;
int quota_cut_bytes;
loff_t tail_pos = 0;
+ int depth;
BUG_ON(!th->t_trans_id);
@@ -1733,7 +1737,9 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
"reiserquota cut_from_item(): freeing %u id=%u type=%c",
quota_cut_bytes, inode->i_uid, '?');
#endif
+ depth = reiserfs_write_unlock_nested(sb);
dquot_free_space_nodirty(inode, quota_cut_bytes);
+ reiserfs_write_lock_nested(sb, depth);
return ret_value;
}
@@ -1953,9 +1959,11 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
const char *body, /* Pointer to the bytes to paste. */
int pasted_size)
{ /* Size of pasted bytes. */
+ struct super_block *sb = inode->i_sb;
struct tree_balance s_paste_balance;
int retval;
int fs_gen;
+ int depth;
BUG_ON(!th->t_trans_id);
@@ -1968,9 +1976,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
key2type(&(key->on_disk_key)));
#endif
- reiserfs_write_unlock(inode->i_sb);
+ depth = reiserfs_write_unlock_nested(sb);
retval = dquot_alloc_space_nodirty(inode, pasted_size);
- reiserfs_write_lock(inode->i_sb);
+ reiserfs_write_lock_nested(sb, depth);
if (retval) {
pathrelse(search_path);
return retval;
@@ -2027,7 +2035,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
pasted_size, inode->i_uid,
key2type(&(key->on_disk_key)));
#endif
+ depth = reiserfs_write_unlock_nested(sb);
dquot_free_space_nodirty(inode, pasted_size);
+ reiserfs_write_lock_nested(sb, depth);
return retval;
}
@@ -2050,6 +2060,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
BUG_ON(!th->t_trans_id);
if (inode) { /* Do we count quotas for item? */
+ int depth;
fs_gen = get_generation(inode->i_sb);
quota_bytes = ih_item_len(ih);
@@ -2063,11 +2074,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
"reiserquota insert_item(): allocating %u id=%u type=%c",
quota_bytes, inode->i_uid, head2type(ih));
#endif
- reiserfs_write_unlock(inode->i_sb);
/* We can't dirty inode here. It would be immediately written but
* appropriate stat item isn't inserted yet... */
+ depth = reiserfs_write_unlock_nested(inode->i_sb);
retval = dquot_alloc_space_nodirty(inode, quota_bytes);
- reiserfs_write_lock(inode->i_sb);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
if (retval) {
pathrelse(path);
return retval;
@@ -2118,7 +2129,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
"reiserquota insert_item(): freeing %u id=%u type=%c",
quota_bytes, inode->i_uid, head2type(ih));
#endif
- if (inode)
+ if (inode) {
+ int depth = reiserfs_write_unlock_nested(inode->i_sb);
dquot_free_space_nodirty(inode, quota_bytes);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
+ }
return retval;
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f8a23c3078f8..3ead145dadc4 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -243,6 +243,7 @@ static int finish_unfinished(struct super_block *s)
done = 0;
REISERFS_SB(s)->s_is_unlinked_ok = 1;
while (!retval) {
+ int depth;
retval = search_item(s, &max_cpu_key, &path);
if (retval != ITEM_NOT_FOUND) {
reiserfs_error(s, "vs-2140",
@@ -298,9 +299,9 @@ static int finish_unfinished(struct super_block *s)
retval = remove_save_link_only(s, &save_link_key, 0);
continue;
}
- reiserfs_write_unlock(s);
+ depth = reiserfs_write_unlock_nested(inode->i_sb);
dquot_initialize(inode);
- reiserfs_write_lock(s);
+ reiserfs_write_lock_nested(inode->i_sb, depth);
if (truncate && S_ISDIR(inode->i_mode)) {
/* We got a truncate request for a dir which is impossible.
@@ -356,10 +357,12 @@ static int finish_unfinished(struct super_block *s)
#ifdef CONFIG_QUOTA
/* Turn quotas off */
+ reiserfs_write_unlock(s);
for (i = 0; i < MAXQUOTAS; i++) {
if (sb_dqopt(s)->files[i] && quota_enabled[i])
dquot_quota_off(s, i);
}
+ reiserfs_write_lock(s);
if (ms_active_set)
/* Restore the flag back */
s->s_flags &= ~MS_ACTIVE;
@@ -499,6 +502,7 @@ int remove_save_link(struct inode *inode, int truncate)
static void reiserfs_kill_sb(struct super_block *s)
{
if (REISERFS_SB(s)) {
+ reiserfs_proc_info_done(s);
/*
* Force any pending inode evictions to occur now. Any
* inodes to be removed that have extended attributes
@@ -554,8 +558,6 @@ static void reiserfs_put_super(struct super_block *s)
REISERFS_SB(s)->reserved_blocks);
}
- reiserfs_proc_info_done(s);
-
reiserfs_write_unlock(s);
mutex_destroy(&REISERFS_SB(s)->lock);
kfree(s->s_fs_info);
@@ -624,7 +626,6 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags)
struct reiserfs_transaction_handle th;
int err = 0;
- int lock_depth;
if (inode->i_sb->s_flags & MS_RDONLY) {
reiserfs_warning(inode->i_sb, "clm-6006",
@@ -632,7 +633,7 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags)
inode->i_ino);
return;
}
- lock_depth = reiserfs_write_lock_once(inode->i_sb);
+ reiserfs_write_lock(inode->i_sb);
/* this is really only used for atime updates, so they don't have
** to be included in O_SYNC or fsync
@@ -645,7 +646,7 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags)
journal_end(&th, inode->i_sb, 1);
out:
- reiserfs_write_unlock_once(inode->i_sb, lock_depth);
+ reiserfs_write_unlock(inode->i_sb);
}
static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
@@ -1335,7 +1336,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
kfree(qf_names[i]);
#endif
err = -EINVAL;
- goto out_unlock;
+ goto out_err_unlock;
}
#ifdef CONFIG_QUOTA
handle_quota_files(s, qf_names, &qfmt);
@@ -1379,35 +1380,32 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
if (blocks) {
err = reiserfs_resize(s, blocks);
if (err != 0)
- goto out_unlock;
+ goto out_err_unlock;
}
if (*mount_flags & MS_RDONLY) {
+ reiserfs_write_unlock(s);
reiserfs_xattr_init(s, *mount_flags);
/* remount read-only */
if (s->s_flags & MS_RDONLY)
/* it is read-only already */
- goto out_ok;
+ goto out_ok_unlocked;
- /*
- * Drop write lock. Quota will retake it when needed and lock
- * ordering requires calling dquot_suspend() without it.
- */
- reiserfs_write_unlock(s);
err = dquot_suspend(s, -1);
if (err < 0)
goto out_err;
- reiserfs_write_lock(s);
/* try to remount file system with read-only permissions */
if (sb_umount_state(rs) == REISERFS_VALID_FS
|| REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
- goto out_ok;
+ goto out_ok_unlocked;
}
+ reiserfs_write_lock(s);
+
err = journal_begin(&th, s, 10);
if (err)
- goto out_unlock;
+ goto out_err_unlock;
/* Mounting a rw partition read-only. */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1416,13 +1414,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
} else {
/* remount read-write */
if (!(s->s_flags & MS_RDONLY)) {
+ reiserfs_write_unlock(s);
reiserfs_xattr_init(s, *mount_flags);
- goto out_ok; /* We are read-write already */
+ goto out_ok_unlocked; /* We are read-write already */
}
if (reiserfs_is_journal_aborted(journal)) {
err = journal->j_errno;
- goto out_unlock;
+ goto out_err_unlock;
}
handle_data_mode(s, mount_options);
@@ -1431,7 +1430,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */
err = journal_begin(&th, s, 10);
if (err)
- goto out_unlock;
+ goto out_err_unlock;
/* Mount a partition which is read-only, read-write */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1448,26 +1447,22 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
SB_JOURNAL(s)->j_must_wait = 1;
err = journal_end(&th, s, 10);
if (err)
- goto out_unlock;
+ goto out_err_unlock;
+ reiserfs_write_unlock(s);
if (!(*mount_flags & MS_RDONLY)) {
- /*
- * Drop write lock. Quota will retake it when needed and lock
- * ordering requires calling dquot_resume() without it.
- */
- reiserfs_write_unlock(s);
dquot_resume(s, -1);
reiserfs_write_lock(s);
finish_unfinished(s);
+ reiserfs_write_unlock(s);
reiserfs_xattr_init(s, *mount_flags);
}
-out_ok:
+out_ok_unlocked:
replace_mount_options(s, new_opts);
- reiserfs_write_unlock(s);
return 0;
-out_unlock:
+out_err_unlock:
reiserfs_write_unlock(s);
out_err:
kfree(new_opts);
@@ -2014,12 +2009,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
goto error;
}
+ reiserfs_write_unlock(s);
if ((errval = reiserfs_lookup_privroot(s)) ||
(errval = reiserfs_xattr_init(s, s->s_flags))) {
dput(s->s_root);
s->s_root = NULL;
- goto error;
+ goto error_unlocked;
}
+ reiserfs_write_lock(s);
/* look for files which were to be removed in previous session */
finish_unfinished(s);
@@ -2028,12 +2025,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
reiserfs_info(s, "using 3.5.x disk format\n");
}
+ reiserfs_write_unlock(s);
if ((errval = reiserfs_lookup_privroot(s)) ||
(errval = reiserfs_xattr_init(s, s->s_flags))) {
dput(s->s_root);
s->s_root = NULL;
- goto error;
+ goto error_unlocked;
}
+ reiserfs_write_lock(s);
}
// mark hash in super block: it could be unset. overwrite should be ok
set_sb_hash_function_code(rs, function2code(sbi->s_hash_function));
@@ -2101,6 +2100,7 @@ static int reiserfs_write_dquot(struct dquot *dquot)
{
struct reiserfs_transaction_handle th;
int ret, err;
+ int depth;
reiserfs_write_lock(dquot->dq_sb);
ret =
@@ -2108,9 +2108,9 @@ static int reiserfs_write_dquot(struct dquot *dquot)
REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
if (ret)
goto out;
- reiserfs_write_unlock(dquot->dq_sb);
+ depth = reiserfs_write_unlock_nested(dquot->dq_sb);
ret = dquot_commit(dquot);
- reiserfs_write_lock(dquot->dq_sb);
+ reiserfs_write_lock_nested(dquot->dq_sb, depth);
err =
journal_end(&th, dquot->dq_sb,
REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
@@ -2125,6 +2125,7 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
{
struct reiserfs_transaction_handle th;
int ret, err;
+ int depth;
reiserfs_write_lock(dquot->dq_sb);
ret =
@@ -2132,9 +2133,9 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
if (ret)
goto out;
- reiserfs_write_unlock(dquot->dq_sb);
+ depth = reiserfs_write_unlock_nested(dquot->dq_sb);
ret = dquot_acquire(dquot);
- reiserfs_write_lock(dquot->dq_sb);
+ reiserfs_write_lock_nested(dquot->dq_sb, depth);
err =
journal_end(&th, dquot->dq_sb,
REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
@@ -2187,15 +2188,16 @@ static int reiserfs_write_info(struct super_block *sb, int type)
{
struct reiserfs_transaction_handle th;
int ret, err;
+ int depth;
/* Data block + inode block */
reiserfs_write_lock(sb);
ret = journal_begin(&th, sb, 2);
if (ret)
goto out;
- reiserfs_write_unlock(sb);
+ depth = reiserfs_write_unlock_nested(sb);
ret = dquot_commit_info(sb, type);
- reiserfs_write_lock(sb);
+ reiserfs_write_lock_nested(sb, depth);
err = journal_end(&th, sb, 2);
if (!ret && err)
ret = err;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index c69cdd749f09..8a9e2dcfe004 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -81,8 +81,7 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
int error;
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
- I_MUTEX_CHILD, dir->i_sb);
+ mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
error = dir->i_op->unlink(dir, dentry);
mutex_unlock(&dentry->d_inode->i_mutex);
@@ -96,8 +95,7 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
int error;
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
- I_MUTEX_CHILD, dir->i_sb);
+ mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
error = dir->i_op->rmdir(dir, dentry);
if (!error)
dentry->d_inode->i_flags |= S_DEAD;
@@ -232,22 +230,17 @@ static int reiserfs_for_each_xattr(struct inode *inode,
if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
return 0;
- reiserfs_write_unlock(inode->i_sb);
dir = open_xa_dir(inode, XATTR_REPLACE);
if (IS_ERR(dir)) {
err = PTR_ERR(dir);
- reiserfs_write_lock(inode->i_sb);
goto out;
} else if (!dir->d_inode) {
err = 0;
- reiserfs_write_lock(inode->i_sb);
goto out_dir;
}
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
- reiserfs_write_lock(inode->i_sb);
-
buf.xadir = dir;
while (1) {
err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
@@ -281,14 +274,17 @@ static int reiserfs_for_each_xattr(struct inode *inode,
int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
struct reiserfs_transaction_handle th;
+ reiserfs_write_lock(inode->i_sb);
err = journal_begin(&th, inode->i_sb, blocks);
+ reiserfs_write_unlock(inode->i_sb);
if (!err) {
int jerror;
- reiserfs_mutex_lock_nested_safe(
- &dir->d_parent->d_inode->i_mutex,
- I_MUTEX_XATTR, inode->i_sb);
+ mutex_lock_nested(&dir->d_parent->d_inode->i_mutex,
+ I_MUTEX_XATTR);
err = action(dir, data);
+ reiserfs_write_lock(inode->i_sb);
jerror = journal_end(&th, inode->i_sb, blocks);
+ reiserfs_write_unlock(inode->i_sb);
mutex_unlock(&dir->d_parent->d_inode->i_mutex);
err = jerror ?: err;
}
@@ -455,9 +451,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
}
if (dentry->d_inode) {
- reiserfs_write_lock(inode->i_sb);
err = xattr_unlink(xadir->d_inode, dentry);
- reiserfs_write_unlock(inode->i_sb);
update_ctime(inode);
}
@@ -491,24 +485,17 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
if (get_inode_sd_version(inode) == STAT_DATA_V1)
return -EOPNOTSUPP;
- reiserfs_write_unlock(inode->i_sb);
-
if (!buffer) {
err = lookup_and_delete_xattr(inode, name);
- reiserfs_write_lock(inode->i_sb);
return err;
}
dentry = xattr_lookup(inode, name, flags);
- if (IS_ERR(dentry)) {
- reiserfs_write_lock(inode->i_sb);
+ if (IS_ERR(dentry))
return PTR_ERR(dentry);
- }
down_write(&REISERFS_I(inode)->i_xattr_sem);
- reiserfs_write_lock(inode->i_sb);
-
xahash = xattr_hash(buffer, buffer_size);
while (buffer_pos < buffer_size || buffer_pos == 0) {
size_t chunk;
@@ -538,6 +525,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
rxh->h_hash = cpu_to_le32(xahash);
}
+ reiserfs_write_lock(inode->i_sb);
err = __reiserfs_write_begin(page, page_offset, chunk + skip);
if (!err) {
if (buffer)
@@ -546,6 +534,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
page_offset + chunk +
skip);
}
+ reiserfs_write_unlock(inode->i_sb);
unlock_page(page);
reiserfs_put_page(page);
buffer_pos += chunk;
@@ -563,10 +552,8 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
.ia_valid = ATTR_SIZE | ATTR_CTIME,
};
- reiserfs_write_unlock(inode->i_sb);
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
inode_dio_wait(dentry->d_inode);
- reiserfs_write_lock(inode->i_sb);
err = reiserfs_setattr(dentry, &newattrs);
mutex_unlock(&dentry->d_inode->i_mutex);
@@ -592,18 +579,19 @@ int reiserfs_xattr_set(struct inode *inode, const char *name,
reiserfs_write_lock(inode->i_sb);
error = journal_begin(&th, inode->i_sb, jbegin_count);
+ reiserfs_write_unlock(inode->i_sb);
if (error) {
- reiserfs_write_unlock(inode->i_sb);
return error;
}
error = reiserfs_xattr_set_handle(&th, inode, name,
buffer, buffer_size, flags);
+ reiserfs_write_lock(inode->i_sb);
error2 = journal_end(&th, inode->i_sb, jbegin_count);
+ reiserfs_write_unlock(inode->i_sb);
if (error == 0)
error = error2;
- reiserfs_write_unlock(inode->i_sb);
return error;
}
@@ -968,7 +956,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
int err = 0;
/* If we don't have the privroot located yet - go find it */
- reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
+ mutex_lock(&s->s_root->d_inode->i_mutex);
dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
strlen(PRIVROOT_NAME));
if (!IS_ERR(dentry)) {
@@ -996,14 +984,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
goto error;
if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
- reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
+ mutex_lock(&s->s_root->d_inode->i_mutex);
err = create_privroot(REISERFS_SB(s)->priv_root);
mutex_unlock(&s->s_root->d_inode->i_mutex);
}
if (privroot->d_inode) {
s->s_xattr = reiserfs_xattr_handlers;
- reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s);
+ mutex_lock(&privroot->d_inode->i_mutex);
if (!REISERFS_SB(s)->xattr_root) {
struct dentry *dentry;
dentry = lookup_one_len(XAROOT_NAME, privroot,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 6c8767fdfc6a..06c04f73da65 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -49,13 +49,15 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
reiserfs_write_lock(inode->i_sb);
error = journal_begin(&th, inode->i_sb, jcreate_blocks);
+ reiserfs_write_unlock(inode->i_sb);
if (error == 0) {
error = reiserfs_set_acl(&th, inode, type, acl);
+ reiserfs_write_lock(inode->i_sb);
error2 = journal_end(&th, inode->i_sb, jcreate_blocks);
+ reiserfs_write_unlock(inode->i_sb);
if (error2)
error = error2;
}
- reiserfs_write_unlock(inode->i_sb);
release_and_out:
posix_acl_release(acl);
@@ -435,12 +437,14 @@ int reiserfs_cache_default_acl(struct inode *inode)
return nblocks;
}
+/*
+ * Called under i_mutex
+ */
int reiserfs_acl_chmod(struct inode *inode)
{
struct reiserfs_transaction_handle th;
struct posix_acl *acl;
size_t size;
- int depth;
int error;
if (IS_PRIVATE(inode))
@@ -454,9 +458,7 @@ int reiserfs_acl_chmod(struct inode *inode)
return 0;
}
- reiserfs_write_unlock(inode->i_sb);
acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
- reiserfs_write_lock(inode->i_sb);
if (!acl)
return 0;
if (IS_ERR(acl))
@@ -466,16 +468,18 @@ int reiserfs_acl_chmod(struct inode *inode)
return error;
size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count));
- depth = reiserfs_write_lock_once(inode->i_sb);
+ reiserfs_write_lock(inode->i_sb);
error = journal_begin(&th, inode->i_sb, size * 2);
+ reiserfs_write_unlock(inode->i_sb);
if (!error) {
int error2;
error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl);
+ reiserfs_write_lock(inode->i_sb);
error2 = journal_end(&th, inode->i_sb, size * 2);
+ reiserfs_write_unlock(inode->i_sb);
if (error2)
error = error2;
}
- reiserfs_write_unlock_once(inode->i_sb, depth);
posix_acl_release(acl);
return error;
}
diff --git a/fs/stat.c b/fs/stat.c
index 04ce1ac20d20..d0ea7ef75e26 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -447,9 +447,8 @@ void inode_add_bytes(struct inode *inode, loff_t bytes)
EXPORT_SYMBOL(inode_add_bytes);
-void inode_sub_bytes(struct inode *inode, loff_t bytes)
+void __inode_sub_bytes(struct inode *inode, loff_t bytes)
{
- spin_lock(&inode->i_lock);
inode->i_blocks -= bytes >> 9;
bytes &= 511;
if (inode->i_bytes < bytes) {
@@ -457,6 +456,14 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes)
inode->i_bytes += 512;
}
inode->i_bytes -= bytes;
+}
+
+EXPORT_SYMBOL(__inode_sub_bytes);
+
+void inode_sub_bytes(struct inode *inode, loff_t bytes)
+{
+ spin_lock(&inode->i_lock);
+ __inode_sub_bytes(inode, bytes);
spin_unlock(&inode->i_lock);
}
diff --git a/fs/super.c b/fs/super.c
index 7465d4364208..5536a95186e2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -152,15 +152,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
static const struct super_operations default_op;
if (s) {
- if (security_sb_alloc(s)) {
- /*
- * We cannot call security_sb_free() without
- * security_sb_alloc() succeeding. So bail out manually
- */
- kfree(s);
- s = NULL;
- goto out;
- }
+ if (security_sb_alloc(s))
+ goto out_free_sb;
+
#ifdef CONFIG_SMP
s->s_files = alloc_percpu(struct list_head);
if (!s->s_files)
@@ -228,6 +222,7 @@ err_out:
free_percpu(s->s_files);
#endif
destroy_sb_writers(s);
+out_free_sb:
kfree(s);
s = NULL;
goto out;
@@ -336,19 +331,19 @@ EXPORT_SYMBOL(deactivate_super);
* and want to turn it into a full-blown active reference. grab_super()
* is called with sb_lock held and drops it. Returns 1 in case of
* success, 0 if we had failed (superblock contents was already dead or
- * dying when grab_super() had been called).
+ * dying when grab_super() had been called). Note that this is only
+ * called for superblocks not in rundown mode (== ones still on ->fs_supers
+ * of their type), so increment of ->s_count is OK here.
*/
static int grab_super(struct super_block *s) __releases(sb_lock)
{
- if (atomic_inc_not_zero(&s->s_active)) {
- spin_unlock(&sb_lock);
- return 1;
- }
- /* it's going away */
s->s_count++;
spin_unlock(&sb_lock);
- /* wait for it to die */
down_write(&s->s_umount);
+ if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) {
+ put_super(s);
+ return 1;
+ }
up_write(&s->s_umount);
put_super(s);
return 0;
@@ -414,6 +409,11 @@ void generic_shutdown_super(struct super_block *sb)
evict_inodes(sb);
+ if (sb->s_dio_done_wq) {
+ destroy_workqueue(sb->s_dio_done_wq);
+ sb->s_dio_done_wq = NULL;
+ }
+
if (sop->put_super)
sop->put_super(sb);
@@ -463,11 +463,6 @@ retry:
destroy_super(s);
s = NULL;
}
- down_write(&old->s_umount);
- if (unlikely(!(old->s_flags & MS_BORN))) {
- deactivate_locked_super(old);
- goto retry;
- }
return old;
}
}
@@ -660,10 +655,10 @@ restart:
if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_bdev == bdev) {
- if (grab_super(sb)) /* drops sb_lock */
- return sb;
- else
+ if (!grab_super(sb))
goto restart;
+ up_write(&sb->s_umount);
+ return sb;
}
}
spin_unlock(&sb_lock);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 15c68f9489ae..c590cabd57bb 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -22,8 +22,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/mm.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "sysfs.h"
@@ -391,7 +390,7 @@ out_unlock:
return rc;
}
-static int open(struct inode * inode, struct file * file)
+static int open(struct inode *inode, struct file *file)
{
struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
@@ -435,7 +434,7 @@ static int open(struct inode * inode, struct file * file)
return error;
}
-static int release(struct inode * inode, struct file * file)
+static int release(struct inode *inode, struct file *file)
{
struct bin_buffer *bb = file->private_data;
@@ -481,7 +480,6 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd)
* @kobj: object.
* @attr: attribute descriptor.
*/
-
int sysfs_create_bin_file(struct kobject *kobj,
const struct bin_attribute *attr)
{
@@ -489,19 +487,16 @@ int sysfs_create_bin_file(struct kobject *kobj,
return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
}
-
+EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
/**
* sysfs_remove_bin_file - remove binary file for object.
* @kobj: object.
* @attr: attribute descriptor.
*/
-
void sysfs_remove_bin_file(struct kobject *kobj,
const struct bin_attribute *attr)
{
sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
}
-
-EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e068e744dbdd..4d83cedb9fcb 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -46,7 +46,7 @@ static unsigned int sysfs_name_hash(const void *ns, const char *name)
unsigned int len = strlen(name);
while (len--)
hash = partial_name_hash(*name++, hash);
- hash = ( end_name_hash(hash) ^ hash_ptr( (void *)ns, 31 ) );
+ hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
hash &= 0x7fffffffU;
/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
if (hash < 1)
@@ -258,7 +258,7 @@ static void sysfs_free_ino(unsigned int ino)
spin_unlock(&sysfs_ino_lock);
}
-void release_sysfs_dirent(struct sysfs_dirent * sd)
+void release_sysfs_dirent(struct sysfs_dirent *sd)
{
struct sysfs_dirent *parent_sd;
@@ -297,7 +297,6 @@ static int sysfs_dentry_delete(const struct dentry *dentry)
static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
{
struct sysfs_dirent *sd;
- int is_dir;
int type;
if (flags & LOOKUP_RCU)
@@ -341,18 +340,15 @@ out_bad:
* is performed at its new name the dentry will be readded
* to the dcache hashes.
*/
- is_dir = (sysfs_type(sd) == SYSFS_DIR);
mutex_unlock(&sysfs_mutex);
- if (is_dir) {
- /* If we have submounts we must allow the vfs caches
- * to lie about the state of the filesystem to prevent
- * leaks and other nasty things.
- */
- if (have_submounts(dentry))
- goto out_valid;
- shrink_dcache_parent(dentry);
- }
- d_drop(dentry);
+
+ /* If we have submounts we must allow the vfs caches
+ * to lie about the state of the filesystem to prevent
+ * leaks and other nasty things.
+ */
+ if (check_submounts_and_drop(dentry) != 0)
+ goto out_valid;
+
return 0;
}
@@ -451,7 +447,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
- sysfs_ns_type(acxt->parent_sd)? "required": "invalid",
+ sysfs_ns_type(acxt->parent_sd) ? "required" : "invalid",
acxt->parent_sd->s_name, sd->s_name);
return -EINVAL;
}
@@ -619,7 +615,7 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
if (!!sysfs_ns_type(parent_sd) != !!ns) {
WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
- sysfs_ns_type(parent_sd)? "required": "invalid",
+ sysfs_ns_type(parent_sd) ? "required" : "invalid",
parent_sd->s_name, name);
return NULL;
}
@@ -674,7 +670,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
enum kobj_ns_type type, const void *ns, const char *name,
struct sysfs_dirent **p_sd)
{
- umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+ umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
struct sysfs_addrm_cxt acxt;
struct sysfs_dirent *sd;
int rc;
@@ -735,9 +731,9 @@ static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
/**
* sysfs_create_dir - create a directory for an object.
- * @kobj: object we're creating directory for.
+ * @kobj: object we're creating directory for.
*/
-int sysfs_create_dir(struct kobject * kobj)
+int sysfs_create_dir(struct kobject *kobj)
{
enum kobj_ns_type type;
struct sysfs_dirent *parent_sd, *sd;
@@ -764,8 +760,8 @@ int sysfs_create_dir(struct kobject * kobj)
return error;
}
-static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
+static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
{
struct dentry *ret = NULL;
struct dentry *parent = dentry->d_parent;
@@ -857,7 +853,7 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
* what used to be sysfs_rmdir() below, instead of calling separately.
*/
-void sysfs_remove_dir(struct kobject * kobj)
+void sysfs_remove_dir(struct kobject *kobj)
{
struct sysfs_dirent *sd = kobj->sd;
@@ -896,7 +892,9 @@ int sysfs_rename(struct sysfs_dirent *sd,
sd->s_name = new_name;
}
- /* Move to the appropriate place in the appropriate directories rbtree. */
+ /*
+ * Move to the appropriate place in the appropriate directories rbtree.
+ */
sysfs_unlink_sibling(sd);
sysfs_get(new_parent_sd);
sysfs_put(sd->s_parent);
@@ -988,20 +986,21 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
{
pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
- if (pos) do {
- struct rb_node *node = rb_next(&pos->s_rb);
- if (!node)
- pos = NULL;
- else
- pos = to_sysfs_dirent(node);
- } while (pos && pos->s_ns != ns);
+ if (pos)
+ do {
+ struct rb_node *node = rb_next(&pos->s_rb);
+ if (!node)
+ pos = NULL;
+ else
+ pos = to_sysfs_dirent(node);
+ } while (pos && pos->s_ns != ns);
return pos;
}
static int sysfs_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
- struct sysfs_dirent * parent_sd = dentry->d_fsdata;
+ struct sysfs_dirent *parent_sd = dentry->d_fsdata;
struct sysfs_dirent *pos = file->private_data;
enum kobj_ns_type type;
const void *ns;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index d2bb7ed8fa74..15ef5eb13663 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -20,7 +20,7 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/limits.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "sysfs.h"
@@ -45,8 +45,8 @@ struct sysfs_open_dirent {
struct sysfs_buffer {
size_t count;
loff_t pos;
- char * page;
- const struct sysfs_ops * ops;
+ char *page;
+ const struct sysfs_ops *ops;
struct mutex mutex;
int needs_read_fill;
int event;
@@ -59,16 +59,16 @@ struct sysfs_buffer {
* @buffer: data buffer for file.
*
* Allocate @buffer->page, if it hasn't been already, then call the
- * kobject's show() method to fill the buffer with this attribute's
- * data.
+ * kobject's show() method to fill the buffer with this attribute's
+ * data.
* This is called only once, on the file's first read unless an error
* is returned.
*/
-static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
+static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
{
struct sysfs_dirent *attr_sd = dentry->d_fsdata;
struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
- const struct sysfs_ops * ops = buffer->ops;
+ const struct sysfs_ops *ops = buffer->ops;
int ret = 0;
ssize_t count;
@@ -106,7 +106,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
}
/**
- * sysfs_read_file - read an attribute.
+ * sysfs_read_file - read an attribute.
* @file: file pointer.
* @buf: buffer to fill.
* @count: number of bytes to read.
@@ -127,12 +127,12 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
static ssize_t
sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
- struct sysfs_buffer * buffer = file->private_data;
+ struct sysfs_buffer *buffer = file->private_data;
ssize_t retval = 0;
mutex_lock(&buffer->mutex);
if (buffer->needs_read_fill || *ppos == 0) {
- retval = fill_read_buffer(file->f_path.dentry,buffer);
+ retval = fill_read_buffer(file->f_path.dentry, buffer);
if (retval)
goto out;
}
@@ -154,9 +154,8 @@ out:
* Allocate @buffer->page if it hasn't been already, then
* copy the user-supplied buffer into it.
*/
-
-static int
-fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t count)
+static int fill_write_buffer(struct sysfs_buffer *buffer,
+ const char __user *buf, size_t count)
{
int error;
@@ -167,7 +166,7 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t
if (count >= PAGE_SIZE)
count = PAGE_SIZE - 1;
- error = copy_from_user(buffer->page,buf,count);
+ error = copy_from_user(buffer->page, buf, count);
buffer->needs_read_fill = 1;
/* if buf is assumed to contain a string, terminate it by \0,
so e.g. sscanf() can scan the string easily */
@@ -183,16 +182,15 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t
* @count: number of bytes
*
* Get the correct pointers for the kobject and the attribute we're
- * dealing with, then call the store() method for the attribute,
+ * dealing with, then call the store() method for the attribute,
* passing the buffer that we acquired in fill_write_buffer().
*/
-
-static int
-flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count)
+static int flush_write_buffer(struct dentry *dentry,
+ struct sysfs_buffer *buffer, size_t count)
{
struct sysfs_dirent *attr_sd = dentry->d_fsdata;
struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
- const struct sysfs_ops * ops = buffer->ops;
+ const struct sysfs_ops *ops = buffer->ops;
int rc;
/* need attr_sd for attr and ops, its parent for kobj */
@@ -219,15 +217,14 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
* then push it to the kobject in flush_write_buffer().
* There is no easy way for us to know if userspace is only doing a partial
* write, so we don't support them. We expect the entire buffer to come
- * on the first write.
+ * on the first write.
* Hint: if you're writing a value, first read the file, modify only the
- * the value you're changing, then write entire buffer back.
+ * the value you're changing, then write entire buffer back.
*/
-
-static ssize_t
-sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+static ssize_t sysfs_write_file(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
{
- struct sysfs_buffer * buffer = file->private_data;
+ struct sysfs_buffer *buffer = file->private_data;
ssize_t len;
mutex_lock(&buffer->mutex);
@@ -339,13 +336,14 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
if (kobj->ktype && kobj->ktype->sysfs_ops)
ops = kobj->ktype->sysfs_ops;
else {
- WARN(1, KERN_ERR "missing sysfs attribute operations for "
- "kobject: %s\n", kobject_name(kobj));
+ WARN(1, KERN_ERR
+ "missing sysfs attribute operations for kobject: %s\n",
+ kobject_name(kobj));
goto err_out;
}
/* File needs write support.
- * The inode's perms must say it's ok,
+ * The inode's perms must say it's ok,
* and we must have a store method.
*/
if (file->f_mode & FMODE_WRITE) {
@@ -420,7 +418,7 @@ static int sysfs_release(struct inode *inode, struct file *filp)
*/
static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
{
- struct sysfs_buffer * buffer = filp->private_data;
+ struct sysfs_buffer *buffer = filp->private_data;
struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
struct sysfs_open_dirent *od = attr_sd->s_attr.open;
@@ -518,8 +516,9 @@ static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
ns = ops->namespace(kobj, attr);
out:
if (err) {
- WARN(1, KERN_ERR "missing sysfs namespace attribute operation for "
- "kobject: %s\n", kobject_name(kobj));
+ WARN(1, KERN_ERR
+ "missing sysfs namespace attribute operation for kobject: %s\n",
+ kobject_name(kobj));
}
*pns = ns;
return err;
@@ -566,17 +565,17 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
/**
* sysfs_create_file - create an attribute file for an object.
- * @kobj: object we're creating for.
+ * @kobj: object we're creating for.
* @attr: attribute descriptor.
*/
-
-int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
+int sysfs_create_file(struct kobject *kobj, const struct attribute *attr)
{
BUG_ON(!kobj || !kobj->sd || !attr);
return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR);
}
+EXPORT_SYMBOL_GPL(sysfs_create_file);
int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
{
@@ -590,6 +589,7 @@ int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
sysfs_remove_file(kobj, ptr[i]);
return err;
}
+EXPORT_SYMBOL_GPL(sysfs_create_files);
/**
* sysfs_add_file_to_group - add an attribute file to a pre-existing group.
@@ -654,7 +654,6 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
}
EXPORT_SYMBOL_GPL(sysfs_chmod_file);
-
/**
* sysfs_remove_file - remove an object attribute.
* @kobj: object we're acting for.
@@ -662,8 +661,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
*
* Hash the attribute name and kill the victim.
*/
-
-void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
+void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr)
{
const void *ns;
@@ -672,13 +670,15 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
sysfs_hash_and_remove(kobj->sd, ns, attr->name);
}
+EXPORT_SYMBOL_GPL(sysfs_remove_file);
-void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
+void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
{
int i;
for (i = 0; ptr[i]; i++)
sysfs_remove_file(kobj, ptr[i]);
}
+EXPORT_SYMBOL_GPL(sysfs_remove_files);
/**
* sysfs_remove_file_from_group - remove an attribute file from a group.
@@ -793,9 +793,3 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
return 0;
}
EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
-
-
-EXPORT_SYMBOL_GPL(sysfs_create_file);
-EXPORT_SYMBOL_GPL(sysfs_remove_file);
-EXPORT_SYMBOL_GPL(sysfs_remove_files);
-EXPORT_SYMBOL_GPL(sysfs_create_files);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index aec3d5c98c94..5f92cd2f61c1 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -3,8 +3,10 @@
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2013 Greg Kroah-Hartman
+ * Copyright (c) 2013 The Linux Foundation
*
- * This file is released undert the GPL v2.
+ * This file is released undert the GPL v2.
*
*/
@@ -19,39 +21,65 @@
static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
const struct attribute_group *grp)
{
- struct attribute *const* attr;
- int i;
+ struct attribute *const *attr;
+ struct bin_attribute *const *bin_attr;
- for (i = 0, attr = grp->attrs; *attr; i++, attr++)
- sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+ if (grp->attrs)
+ for (attr = grp->attrs; *attr; attr++)
+ sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+ if (grp->bin_attrs)
+ for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
+ sysfs_remove_bin_file(kobj, *bin_attr);
}
static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
const struct attribute_group *grp, int update)
{
- struct attribute *const* attr;
+ struct attribute *const *attr;
+ struct bin_attribute *const *bin_attr;
int error = 0, i;
- for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
- umode_t mode = 0;
+ if (grp->attrs) {
+ for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
+ umode_t mode = 0;
+
+ /*
+ * In update mode, we're changing the permissions or
+ * visibility. Do this by first removing then
+ * re-adding (if required) the file.
+ */
+ if (update)
+ sysfs_hash_and_remove(dir_sd, NULL,
+ (*attr)->name);
+ if (grp->is_visible) {
+ mode = grp->is_visible(kobj, *attr, i);
+ if (!mode)
+ continue;
+ }
+ error = sysfs_add_file_mode(dir_sd, *attr,
+ SYSFS_KOBJ_ATTR,
+ (*attr)->mode | mode);
+ if (unlikely(error))
+ break;
+ }
+ if (error) {
+ remove_files(dir_sd, kobj, grp);
+ goto exit;
+ }
+ }
- /* in update mode, we're changing the permissions or
- * visibility. Do this by first removing then
- * re-adding (if required) the file */
- if (update)
- sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
- if (grp->is_visible) {
- mode = grp->is_visible(kobj, *attr, i);
- if (!mode)
- continue;
+ if (grp->bin_attrs) {
+ for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+ if (update)
+ sysfs_remove_bin_file(kobj, *bin_attr);
+ error = sysfs_create_bin_file(kobj, *bin_attr);
+ if (error)
+ break;
}
- error = sysfs_add_file_mode(dir_sd, *attr, SYSFS_KOBJ_ATTR,
- (*attr)->mode | mode);
- if (unlikely(error))
- break;
+ if (error)
+ remove_files(dir_sd, kobj, grp);
}
- if (error)
- remove_files(dir_sd, kobj, grp);
+exit:
return error;
}
@@ -67,8 +95,8 @@ static int internal_create_group(struct kobject *kobj, int update,
/* Updates may happen before the object has been instantiated */
if (unlikely(update && !kobj->sd))
return -EINVAL;
- if (!grp->attrs) {
- WARN(1, "sysfs: attrs not set by subsystem for group: %s/%s\n",
+ if (!grp->attrs && !grp->bin_attrs) {
+ WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
kobj->name, grp->name ? "" : grp->name);
return -EINVAL;
}
@@ -103,6 +131,41 @@ int sysfs_create_group(struct kobject *kobj,
{
return internal_create_group(kobj, 0, grp);
}
+EXPORT_SYMBOL_GPL(sysfs_create_group);
+
+/**
+ * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups
+ * @kobj: The kobject to create the group on
+ * @groups: The attribute groups to create, NULL terminated
+ *
+ * This function creates a bunch of attribute groups. If an error occurs when
+ * creating a group, all previously created groups will be removed, unwinding
+ * everything back to the original state when this function was called.
+ * It will explicitly warn and error if any of the attribute files being
+ * created already exist.
+ *
+ * Returns 0 on success or error code from sysfs_create_group on error.
+ */
+int sysfs_create_groups(struct kobject *kobj,
+ const struct attribute_group **groups)
+{
+ int error = 0;
+ int i;
+
+ if (!groups)
+ return 0;
+
+ for (i = 0; groups[i]; i++) {
+ error = sysfs_create_group(kobj, groups[i]);
+ if (error) {
+ while (--i >= 0)
+ sysfs_remove_group(kobj, groups[i]);
+ break;
+ }
+ }
+ return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_create_groups);
/**
* sysfs_update_group - given a directory kobject, update an attribute group
@@ -126,11 +189,18 @@ int sysfs_update_group(struct kobject *kobj,
{
return internal_create_group(kobj, 1, grp);
}
+EXPORT_SYMBOL_GPL(sysfs_update_group);
-
-
-void sysfs_remove_group(struct kobject * kobj,
- const struct attribute_group * grp)
+/**
+ * sysfs_remove_group: remove a group from a kobject
+ * @kobj: kobject to remove the group from
+ * @grp: group to remove
+ *
+ * This function removes a group of attributes from a kobject. The attributes
+ * previously have to have been created for this group, otherwise it will fail.
+ */
+void sysfs_remove_group(struct kobject *kobj,
+ const struct attribute_group *grp)
{
struct sysfs_dirent *dir_sd = kobj->sd;
struct sysfs_dirent *sd;
@@ -138,8 +208,9 @@ void sysfs_remove_group(struct kobject * kobj,
if (grp->name) {
sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
if (!sd) {
- WARN(!sd, KERN_WARNING "sysfs group %p not found for "
- "kobject '%s'\n", grp, kobject_name(kobj));
+ WARN(!sd, KERN_WARNING
+ "sysfs group %p not found for kobject '%s'\n",
+ grp, kobject_name(kobj));
return;
}
} else
@@ -151,6 +222,27 @@ void sysfs_remove_group(struct kobject * kobj,
sysfs_put(sd);
}
+EXPORT_SYMBOL_GPL(sysfs_remove_group);
+
+/**
+ * sysfs_remove_groups - remove a list of groups
+ *
+ * @kobj: The kobject for the groups to be removed from
+ * @groups: NULL terminated list of groups to be removed
+ *
+ * If groups is not NULL, remove the specified groups from the kobject.
+ */
+void sysfs_remove_groups(struct kobject *kobj,
+ const struct attribute_group **groups)
+{
+ int i;
+
+ if (!groups)
+ return;
+ for (i = 0; groups[i]; i++)
+ sysfs_remove_group(kobj, groups[i]);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_groups);
/**
* sysfs_merge_group - merge files into a pre-existing attribute group.
@@ -247,7 +339,3 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
}
}
EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
-
-EXPORT_SYMBOL_GPL(sysfs_create_group);
-EXPORT_SYMBOL_GPL(sysfs_update_group);
-EXPORT_SYMBOL_GPL(sysfs_remove_group);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 3e2837a633ed..963f910c8034 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -10,7 +10,7 @@
* Please see Documentation/filesystems/sysfs.txt for more information.
*/
-#undef DEBUG
+#undef DEBUG
#include <linux/pagemap.h>
#include <linux/namei.h>
@@ -36,7 +36,7 @@ static struct backing_dev_info sysfs_backing_dev_info = {
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
-static const struct inode_operations sysfs_inode_operations ={
+static const struct inode_operations sysfs_inode_operations = {
.permission = sysfs_permission,
.setattr = sysfs_setattr,
.getattr = sysfs_getattr,
@@ -67,7 +67,7 @@ static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
return attrs;
}
-int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr)
+int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr)
{
struct sysfs_inode_attrs *sd_attrs;
struct iattr *iattrs;
@@ -128,7 +128,8 @@ out:
return error;
}
-static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len)
+static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata,
+ u32 *secdata_len)
{
struct sysfs_inode_attrs *iattrs;
void *old_secdata;
@@ -186,13 +187,13 @@ out:
return error;
}
-static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
+static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
}
-static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
+static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
{
inode->i_uid = iattr->ia_uid;
inode->i_gid = iattr->ia_gid;
@@ -220,7 +221,8 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
set_nlink(inode, sd->s_dir.subdirs + 2);
}
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
{
struct sysfs_dirent *sd = dentry->d_fsdata;
struct inode *inode = dentry->d_inode;
@@ -285,7 +287,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
* RETURNS:
* Pointer to allocated inode on success, NULL on failure.
*/
-struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
+struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
{
struct inode *inode;
@@ -312,7 +314,8 @@ void sysfs_evict_inode(struct inode *inode)
sysfs_put(sd);
}
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns,
+ const char *name)
{
struct sysfs_addrm_cxt acxt;
struct sysfs_dirent *sd;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index afd83273e6ce..834ec2cdb7a3 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -64,7 +64,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
/* instantiate and link root dentry */
root = d_make_root(inode);
if (!root) {
- pr_debug("%s: could not get root dentry!\n",__func__);
+ pr_debug("%s: could not get root dentry!\n", __func__);
return -ENOMEM;
}
root->d_fsdata = &sysfs_root;
@@ -112,8 +112,15 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
struct super_block *sb;
int error;
- if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs)
- return ERR_PTR(-EPERM);
+ if (!(flags & MS_KERNMOUNT)) {
+ if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
+ return ERR_PTR(-EPERM);
+
+ for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
+ if (!kobj_ns_current_may_mount(type))
+ return ERR_PTR(-EPERM);
+ }
+ }
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 8c940df97a52..2dd4507d9edd 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -125,6 +125,7 @@ int sysfs_create_link(struct kobject *kobj, struct kobject *target,
{
return sysfs_do_create_link(kobj, target, name, 1);
}
+EXPORT_SYMBOL_GPL(sysfs_create_link);
/**
* sysfs_create_link_nowarn - create symlink between two objects.
@@ -166,8 +167,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
* @kobj: object we're acting for.
* @name: name of the symlink to remove.
*/
-
-void sysfs_remove_link(struct kobject * kobj, const char * name)
+void sysfs_remove_link(struct kobject *kobj, const char *name)
{
struct sysfs_dirent *parent_sd = NULL;
@@ -178,6 +178,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
sysfs_hash_and_remove(parent_sd, NULL, name);
}
+EXPORT_SYMBOL_GPL(sysfs_remove_link);
/**
* sysfs_rename_link - rename symlink in object's directory.
@@ -223,6 +224,7 @@ out:
sysfs_put(sd);
return result;
}
+EXPORT_SYMBOL_GPL(sysfs_rename_link);
static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
struct sysfs_dirent *target_sd, char *path)
@@ -276,7 +278,7 @@ static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
return 0;
}
-static int sysfs_getlink(struct dentry *dentry, char * path)
+static int sysfs_getlink(struct dentry *dentry, char *path)
{
struct sysfs_dirent *sd = dentry->d_fsdata;
struct sysfs_dirent *parent_sd = sd->s_parent;
@@ -295,7 +297,7 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
int error = -ENOMEM;
unsigned long page = get_zeroed_page(GFP_KERNEL);
if (page) {
- error = sysfs_getlink(dentry, (char *) page);
+ error = sysfs_getlink(dentry, (char *) page);
if (error < 0)
free_page((unsigned long)page);
}
@@ -303,7 +305,8 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
return NULL;
}
-static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd,
+ void *cookie)
{
char *page = nd_get_link(nd);
if (!IS_ERR(page))
@@ -319,8 +322,3 @@ const struct inode_operations sysfs_symlink_inode_operations = {
.getattr = sysfs_getattr,
.permission = sysfs_permission,
};
-
-
-EXPORT_SYMBOL_GPL(sysfs_create_link);
-EXPORT_SYMBOL_GPL(sysfs_remove_link);
-EXPORT_SYMBOL_GPL(sysfs_rename_link);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d1e4043eb0c3..b6deca3e301d 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -78,7 +78,7 @@ struct sysfs_dirent {
};
unsigned short s_flags;
- umode_t s_mode;
+ umode_t s_mode;
unsigned int s_ino;
struct sysfs_inode_attrs *s_iattr;
};
@@ -123,9 +123,9 @@ do { \
key = &attr->skey; \
\
lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
-} while(0)
+} while (0)
#else
-#define sysfs_dirent_init_lockdep(sd) do {} while(0)
+#define sysfs_dirent_init_lockdep(sd) do {} while (0)
#endif
/*
@@ -186,8 +186,8 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
struct sysfs_dirent **p_sd);
void sysfs_remove_subdir(struct sysfs_dirent *sd);
-int sysfs_rename(struct sysfs_dirent *sd,
- struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
+int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
+ const void *ns, const char *new_name);
static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
{
@@ -214,10 +214,12 @@ void sysfs_evict_inode(struct inode *inode);
int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
int sysfs_permission(struct inode *inode, int mask);
int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
+int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
+ size_t size, int flags);
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns,
+ const char *name);
int sysfs_inode_init(void);
/*
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 9ac4057a86c9..839a2bad7f45 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -630,6 +630,12 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
struct udf_sb_info *sbi = UDF_SB(sb);
int error = 0;
+ if (sbi->s_lvid_bh) {
+ int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
+ if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
+ return -EACCES;
+ }
+
uopt.flags = sbi->s_flags;
uopt.uid = sbi->s_uid;
uopt.gid = sbi->s_gid;
@@ -649,12 +655,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
sbi->s_dmode = uopt.dmode;
write_unlock(&sbi->s_cred_lock);
- if (sbi->s_lvid_bh) {
- int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
- if (write_rev > UDF_MAX_WRITE_VERSION)
- *flags |= MS_RDONLY;
- }
-
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
goto out_unlock;
@@ -843,27 +843,38 @@ static int udf_find_fileset(struct super_block *sb,
return 1;
}
+/*
+ * Load primary Volume Descriptor Sequence
+ *
+ * Return <0 on error, 0 on success. -EAGAIN is special meaning next sequence
+ * should be tried.
+ */
static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
{
struct primaryVolDesc *pvoldesc;
struct ustr *instr, *outstr;
struct buffer_head *bh;
uint16_t ident;
- int ret = 1;
+ int ret = -ENOMEM;
instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
if (!instr)
- return 1;
+ return -ENOMEM;
outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
if (!outstr)
goto out1;
bh = udf_read_tagged(sb, block, block, &ident);
- if (!bh)
+ if (!bh) {
+ ret = -EAGAIN;
goto out2;
+ }
- BUG_ON(ident != TAG_IDENT_PVD);
+ if (ident != TAG_IDENT_PVD) {
+ ret = -EIO;
+ goto out_bh;
+ }
pvoldesc = (struct primaryVolDesc *)bh->b_data;
@@ -889,8 +900,9 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
if (udf_CS0toUTF8(outstr, instr))
udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
- brelse(bh);
ret = 0;
+out_bh:
+ brelse(bh);
out2:
kfree(outstr);
out1:
@@ -947,7 +959,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
if (mdata->s_mirror_fe == NULL) {
udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
- goto error_exit;
+ return -EIO;
}
}
@@ -964,23 +976,18 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
addr.logicalBlockNum, addr.partitionReferenceNum);
mdata->s_bitmap_fe = udf_iget(sb, &addr);
-
if (mdata->s_bitmap_fe == NULL) {
if (sb->s_flags & MS_RDONLY)
udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
else {
udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
- goto error_exit;
+ return -EIO;
}
}
}
udf_debug("udf_load_metadata_files Ok\n");
-
return 0;
-
-error_exit:
- return 1;
}
static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
@@ -1069,7 +1076,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (!map->s_uspace.s_table) {
udf_debug("cannot load unallocSpaceTable (part %d)\n",
p_index);
- return 1;
+ return -EIO;
}
map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
udf_debug("unallocSpaceTable (part %d) @ %ld\n",
@@ -1079,7 +1086,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (phd->unallocSpaceBitmap.extLength) {
struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
if (!bitmap)
- return 1;
+ return -ENOMEM;
map->s_uspace.s_bitmap = bitmap;
bitmap->s_extPosition = le32_to_cpu(
phd->unallocSpaceBitmap.extPosition);
@@ -1102,7 +1109,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (!map->s_fspace.s_table) {
udf_debug("cannot load freedSpaceTable (part %d)\n",
p_index);
- return 1;
+ return -EIO;
}
map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
@@ -1113,7 +1120,7 @@ static int udf_fill_partdesc_info(struct super_block *sb,
if (phd->freedSpaceBitmap.extLength) {
struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
if (!bitmap)
- return 1;
+ return -ENOMEM;
map->s_fspace.s_bitmap = bitmap;
bitmap->s_extPosition = le32_to_cpu(
phd->freedSpaceBitmap.extPosition);
@@ -1165,7 +1172,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
}
if (!sbi->s_vat_inode)
- return 1;
+ return -EIO;
if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
map->s_type_specific.s_virtual.s_start_offset = 0;
@@ -1177,7 +1184,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
pos = udf_block_map(sbi->s_vat_inode, 0);
bh = sb_bread(sb, pos);
if (!bh)
- return 1;
+ return -EIO;
vat20 = (struct virtualAllocationTable20 *)bh->b_data;
} else {
vat20 = (struct virtualAllocationTable20 *)
@@ -1195,6 +1202,12 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
return 0;
}
+/*
+ * Load partition descriptor block
+ *
+ * Returns <0 on error, 0 on success, -EAGAIN is special - try next descriptor
+ * sequence.
+ */
static int udf_load_partdesc(struct super_block *sb, sector_t block)
{
struct buffer_head *bh;
@@ -1204,13 +1217,15 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
int i, type1_idx;
uint16_t partitionNumber;
uint16_t ident;
- int ret = 0;
+ int ret;
bh = udf_read_tagged(sb, block, block, &ident);
if (!bh)
- return 1;
- if (ident != TAG_IDENT_PD)
+ return -EAGAIN;
+ if (ident != TAG_IDENT_PD) {
+ ret = 0;
goto out_bh;
+ }
p = (struct partitionDesc *)bh->b_data;
partitionNumber = le16_to_cpu(p->partitionNumber);
@@ -1229,10 +1244,13 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
if (i >= sbi->s_partitions) {
udf_debug("Partition (%d) not found in partition map\n",
partitionNumber);
+ ret = 0;
goto out_bh;
}
ret = udf_fill_partdesc_info(sb, p, i);
+ if (ret < 0)
+ goto out_bh;
/*
* Now rescan for VIRTUAL or METADATA partitions when SPARABLE and
@@ -1249,32 +1267,37 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
break;
}
- if (i >= sbi->s_partitions)
+ if (i >= sbi->s_partitions) {
+ ret = 0;
goto out_bh;
+ }
ret = udf_fill_partdesc_info(sb, p, i);
- if (ret)
+ if (ret < 0)
goto out_bh;
if (map->s_partition_type == UDF_METADATA_MAP25) {
ret = udf_load_metadata_files(sb, i);
- if (ret) {
+ if (ret < 0) {
udf_err(sb, "error loading MetaData partition map %d\n",
i);
goto out_bh;
}
} else {
- ret = udf_load_vat(sb, i, type1_idx);
- if (ret)
- goto out_bh;
/*
- * Mark filesystem read-only if we have a partition with
- * virtual map since we don't handle writing to it (we
- * overwrite blocks instead of relocating them).
+ * If we have a partition with virtual map, we don't handle
+ * writing to it (we overwrite blocks instead of relocating
+ * them).
*/
- sb->s_flags |= MS_RDONLY;
- pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n");
+ if (!(sb->s_flags & MS_RDONLY)) {
+ ret = -EACCES;
+ goto out_bh;
+ }
+ ret = udf_load_vat(sb, i, type1_idx);
+ if (ret < 0)
+ goto out_bh;
}
+ ret = 0;
out_bh:
/* In case loading failed, we handle cleanup in udf_fill_super */
brelse(bh);
@@ -1340,11 +1363,11 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
uint16_t ident;
struct buffer_head *bh;
unsigned int table_len;
- int ret = 0;
+ int ret;
bh = udf_read_tagged(sb, block, block, &ident);
if (!bh)
- return 1;
+ return -EAGAIN;
BUG_ON(ident != TAG_IDENT_LVD);
lvd = (struct logicalVolDesc *)bh->b_data;
table_len = le32_to_cpu(lvd->mapTableLength);
@@ -1352,7 +1375,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
udf_err(sb, "error loading logical volume descriptor: "
"Partition table too long (%u > %lu)\n", table_len,
sb->s_blocksize - sizeof(*lvd));
- ret = 1;
+ ret = -EIO;
goto out_bh;
}
@@ -1396,11 +1419,10 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
} else if (!strncmp(upm2->partIdent.ident,
UDF_ID_SPARABLE,
strlen(UDF_ID_SPARABLE))) {
- if (udf_load_sparable_map(sb, map,
- (struct sparablePartitionMap *)gpm) < 0) {
- ret = 1;
+ ret = udf_load_sparable_map(sb, map,
+ (struct sparablePartitionMap *)gpm);
+ if (ret < 0)
goto out_bh;
- }
} else if (!strncmp(upm2->partIdent.ident,
UDF_ID_METADATA,
strlen(UDF_ID_METADATA))) {
@@ -1465,7 +1487,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
}
if (lvd->integritySeqExt.extLength)
udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
-
+ ret = 0;
out_bh:
brelse(bh);
return ret;
@@ -1503,22 +1525,18 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
}
/*
- * udf_process_sequence
- *
- * PURPOSE
- * Process a main/reserve volume descriptor sequence.
- *
- * PRE-CONDITIONS
- * sb Pointer to _locked_ superblock.
- * block First block of first extent of the sequence.
- * lastblock Lastblock of first extent of the sequence.
+ * Process a main/reserve volume descriptor sequence.
+ * @block First block of first extent of the sequence.
+ * @lastblock Lastblock of first extent of the sequence.
+ * @fileset There we store extent containing root fileset
*
- * HISTORY
- * July 1, 1997 - Andrew E. Mileski
- * Written, tested, and released.
+ * Returns <0 on error, 0 on success. -EAGAIN is special - try next descriptor
+ * sequence
*/
-static noinline int udf_process_sequence(struct super_block *sb, long block,
- long lastblock, struct kernel_lb_addr *fileset)
+static noinline int udf_process_sequence(
+ struct super_block *sb,
+ sector_t block, sector_t lastblock,
+ struct kernel_lb_addr *fileset)
{
struct buffer_head *bh = NULL;
struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1529,6 +1547,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
uint32_t vdsn;
uint16_t ident;
long next_s = 0, next_e = 0;
+ int ret;
memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
@@ -1543,7 +1562,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
udf_err(sb,
"Block %llu of volume descriptor sequence is corrupted or we could not read it\n",
(unsigned long long)block);
- return 1;
+ return -EAGAIN;
}
/* Process each descriptor (ISO 13346 3/8.3-8.4) */
@@ -1616,14 +1635,19 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
*/
if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
udf_err(sb, "Primary Volume Descriptor not found!\n");
- return 1;
+ return -EAGAIN;
+ }
+ ret = udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block);
+ if (ret < 0)
+ return ret;
+
+ if (vds[VDS_POS_LOGICAL_VOL_DESC].block) {
+ ret = udf_load_logicalvol(sb,
+ vds[VDS_POS_LOGICAL_VOL_DESC].block,
+ fileset);
+ if (ret < 0)
+ return ret;
}
- if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block))
- return 1;
-
- if (vds[VDS_POS_LOGICAL_VOL_DESC].block && udf_load_logicalvol(sb,
- vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset))
- return 1;
if (vds[VDS_POS_PARTITION_DESC].block) {
/*
@@ -1632,19 +1656,27 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
*/
for (block = vds[VDS_POS_PARTITION_DESC].block;
block < vds[VDS_POS_TERMINATING_DESC].block;
- block++)
- if (udf_load_partdesc(sb, block))
- return 1;
+ block++) {
+ ret = udf_load_partdesc(sb, block);
+ if (ret < 0)
+ return ret;
+ }
}
return 0;
}
+/*
+ * Load Volume Descriptor Sequence described by anchor in bh
+ *
+ * Returns <0 on error, 0 on success
+ */
static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
struct kernel_lb_addr *fileset)
{
struct anchorVolDescPtr *anchor;
- long main_s, main_e, reserve_s, reserve_e;
+ sector_t main_s, main_e, reserve_s, reserve_e;
+ int ret;
anchor = (struct anchorVolDescPtr *)bh->b_data;
@@ -1662,18 +1694,26 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
/* Process the main & reserve sequences */
/* responsible for finding the PartitionDesc(s) */
- if (!udf_process_sequence(sb, main_s, main_e, fileset))
- return 1;
- udf_sb_free_partitions(sb);
- if (!udf_process_sequence(sb, reserve_s, reserve_e, fileset))
- return 1;
+ ret = udf_process_sequence(sb, main_s, main_e, fileset);
+ if (ret != -EAGAIN)
+ return ret;
udf_sb_free_partitions(sb);
- return 0;
+ ret = udf_process_sequence(sb, reserve_s, reserve_e, fileset);
+ if (ret < 0) {
+ udf_sb_free_partitions(sb);
+ /* No sequence was OK, return -EIO */
+ if (ret == -EAGAIN)
+ ret = -EIO;
+ }
+ return ret;
}
/*
* Check whether there is an anchor block in the given block and
* load Volume Descriptor Sequence if so.
+ *
+ * Returns <0 on error, 0 on success, -EAGAIN is special - try next anchor
+ * block
*/
static int udf_check_anchor_block(struct super_block *sb, sector_t block,
struct kernel_lb_addr *fileset)
@@ -1685,33 +1725,40 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
udf_fixed_to_variable(block) >=
sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
- return 0;
+ return -EAGAIN;
bh = udf_read_tagged(sb, block, block, &ident);
if (!bh)
- return 0;
+ return -EAGAIN;
if (ident != TAG_IDENT_AVDP) {
brelse(bh);
- return 0;
+ return -EAGAIN;
}
ret = udf_load_sequence(sb, bh, fileset);
brelse(bh);
return ret;
}
-/* Search for an anchor volume descriptor pointer */
-static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
- struct kernel_lb_addr *fileset)
+/*
+ * Search for an anchor volume descriptor pointer.
+ *
+ * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set
+ * of anchors.
+ */
+static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock,
+ struct kernel_lb_addr *fileset)
{
sector_t last[6];
int i;
struct udf_sb_info *sbi = UDF_SB(sb);
int last_count = 0;
+ int ret;
/* First try user provided anchor */
if (sbi->s_anchor) {
- if (udf_check_anchor_block(sb, sbi->s_anchor, fileset))
- return lastblock;
+ ret = udf_check_anchor_block(sb, sbi->s_anchor, fileset);
+ if (ret != -EAGAIN)
+ return ret;
}
/*
* according to spec, anchor is in either:
@@ -1720,39 +1767,46 @@ static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
* lastblock
* however, if the disc isn't closed, it could be 512.
*/
- if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset))
- return lastblock;
+ ret = udf_check_anchor_block(sb, sbi->s_session + 256, fileset);
+ if (ret != -EAGAIN)
+ return ret;
/*
* The trouble is which block is the last one. Drives often misreport
* this so we try various possibilities.
*/
- last[last_count++] = lastblock;
- if (lastblock >= 1)
- last[last_count++] = lastblock - 1;
- last[last_count++] = lastblock + 1;
- if (lastblock >= 2)
- last[last_count++] = lastblock - 2;
- if (lastblock >= 150)
- last[last_count++] = lastblock - 150;
- if (lastblock >= 152)
- last[last_count++] = lastblock - 152;
+ last[last_count++] = *lastblock;
+ if (*lastblock >= 1)
+ last[last_count++] = *lastblock - 1;
+ last[last_count++] = *lastblock + 1;
+ if (*lastblock >= 2)
+ last[last_count++] = *lastblock - 2;
+ if (*lastblock >= 150)
+ last[last_count++] = *lastblock - 150;
+ if (*lastblock >= 152)
+ last[last_count++] = *lastblock - 152;
for (i = 0; i < last_count; i++) {
if (last[i] >= sb->s_bdev->bd_inode->i_size >>
sb->s_blocksize_bits)
continue;
- if (udf_check_anchor_block(sb, last[i], fileset))
- return last[i];
+ ret = udf_check_anchor_block(sb, last[i], fileset);
+ if (ret != -EAGAIN) {
+ if (!ret)
+ *lastblock = last[i];
+ return ret;
+ }
if (last[i] < 256)
continue;
- if (udf_check_anchor_block(sb, last[i] - 256, fileset))
- return last[i];
+ ret = udf_check_anchor_block(sb, last[i] - 256, fileset);
+ if (ret != -EAGAIN) {
+ if (!ret)
+ *lastblock = last[i];
+ return ret;
+ }
}
/* Finally try block 512 in case media is open */
- if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset))
- return last[0];
- return 0;
+ return udf_check_anchor_block(sb, sbi->s_session + 512, fileset);
}
/*
@@ -1760,54 +1814,59 @@ static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
* area specified by it. The function expects sbi->s_lastblock to be the last
* block on the media.
*
- * Return 1 if ok, 0 if not found.
- *
+ * Return <0 on error, 0 if anchor found. -EAGAIN is special meaning anchor
+ * was not found.
*/
static int udf_find_anchor(struct super_block *sb,
struct kernel_lb_addr *fileset)
{
- sector_t lastblock;
struct udf_sb_info *sbi = UDF_SB(sb);
+ sector_t lastblock = sbi->s_last_block;
+ int ret;
- lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
- if (lastblock)
+ ret = udf_scan_anchors(sb, &lastblock, fileset);
+ if (ret != -EAGAIN)
goto out;
/* No anchor found? Try VARCONV conversion of block numbers */
UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+ lastblock = udf_variable_to_fixed(sbi->s_last_block);
/* Firstly, we try to not convert number of the last block */
- lastblock = udf_scan_anchors(sb,
- udf_variable_to_fixed(sbi->s_last_block),
- fileset);
- if (lastblock)
+ ret = udf_scan_anchors(sb, &lastblock, fileset);
+ if (ret != -EAGAIN)
goto out;
+ lastblock = sbi->s_last_block;
/* Secondly, we try with converted number of the last block */
- lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
- if (!lastblock) {
+ ret = udf_scan_anchors(sb, &lastblock, fileset);
+ if (ret < 0) {
/* VARCONV didn't help. Clear it. */
UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
- return 0;
}
out:
- sbi->s_last_block = lastblock;
- return 1;
+ if (ret == 0)
+ sbi->s_last_block = lastblock;
+ return ret;
}
/*
* Check Volume Structure Descriptor, find Anchor block and load Volume
- * Descriptor Sequence
+ * Descriptor Sequence.
+ *
+ * Returns < 0 on error, 0 on success. -EAGAIN is special meaning anchor
+ * block was not found.
*/
static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
int silent, struct kernel_lb_addr *fileset)
{
struct udf_sb_info *sbi = UDF_SB(sb);
loff_t nsr_off;
+ int ret;
if (!sb_set_blocksize(sb, uopt->blocksize)) {
if (!silent)
udf_warn(sb, "Bad block size\n");
- return 0;
+ return -EINVAL;
}
sbi->s_last_block = uopt->lastblock;
if (!uopt->novrs) {
@@ -1828,12 +1887,13 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
/* Look for anchor block and load Volume Descriptor Sequence */
sbi->s_anchor = uopt->anchor;
- if (!udf_find_anchor(sb, fileset)) {
- if (!silent)
+ ret = udf_find_anchor(sb, fileset);
+ if (ret < 0) {
+ if (!silent && ret == -EAGAIN)
udf_warn(sb, "No anchor found\n");
- return 0;
+ return ret;
}
- return 1;
+ return 0;
}
static void udf_open_lvid(struct super_block *sb)
@@ -1939,7 +1999,7 @@ u64 lvid_get_unique_id(struct super_block *sb)
static int udf_fill_super(struct super_block *sb, void *options, int silent)
{
- int ret;
+ int ret = -EINVAL;
struct inode *inode = NULL;
struct udf_options uopt;
struct kernel_lb_addr rootdir, fileset;
@@ -2011,7 +2071,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
} else {
uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
ret = udf_load_vrs(sb, &uopt, silent, &fileset);
- if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
+ if (ret == -EAGAIN && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
if (!silent)
pr_notice("Rescanning with blocksize %d\n",
UDF_DEFAULT_BLOCKSIZE);
@@ -2021,8 +2081,11 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
ret = udf_load_vrs(sb, &uopt, silent, &fileset);
}
}
- if (!ret) {
- udf_warn(sb, "No partition found (1)\n");
+ if (ret < 0) {
+ if (ret == -EAGAIN) {
+ udf_warn(sb, "No partition found (1)\n");
+ ret = -EINVAL;
+ }
goto error_out;
}
@@ -2040,9 +2103,13 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
udf_err(sb, "minUDFReadRev=%x (max is %x)\n",
le16_to_cpu(lvidiu->minUDFReadRev),
UDF_MAX_READ_VERSION);
+ ret = -EINVAL;
+ goto error_out;
+ } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION &&
+ !(sb->s_flags & MS_RDONLY)) {
+ ret = -EACCES;
goto error_out;
- } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION)
- sb->s_flags |= MS_RDONLY;
+ }
sbi->s_udfrev = minUDFWriteRev;
@@ -2054,17 +2121,20 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
if (!sbi->s_partitions) {
udf_warn(sb, "No partition found (2)\n");
+ ret = -EINVAL;
goto error_out;
}
if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
- UDF_PART_FLAG_READ_ONLY) {
- pr_notice("Partition marked readonly; forcing readonly mount\n");
- sb->s_flags |= MS_RDONLY;
+ UDF_PART_FLAG_READ_ONLY &&
+ !(sb->s_flags & MS_RDONLY)) {
+ ret = -EACCES;
+ goto error_out;
}
if (udf_find_fileset(sb, &fileset, &rootdir)) {
udf_warn(sb, "No fileset found\n");
+ ret = -EINVAL;
goto error_out;
}
@@ -2086,6 +2156,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
if (!inode) {
udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
+ ret = -EIO;
goto error_out;
}
@@ -2093,6 +2164,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
sb->s_root = d_make_root(inode);
if (!sb->s_root) {
udf_err(sb, "Couldn't allocate root dentry\n");
+ ret = -ENOMEM;
goto error_out;
}
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -2113,7 +2185,7 @@ error_out:
kfree(sbi);
sb->s_fs_info = NULL;
- return -EINVAL;
+ return ret;
}
void _udf_err(struct super_block *sb, const char *function,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 2f9fb421915a..977da0ec6604 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -86,14 +86,6 @@ xfs_destroy_ioend(
bh->b_end_io(bh, !ioend->io_error);
}
- if (ioend->io_iocb) {
- inode_dio_done(ioend->io_inode);
- if (ioend->io_isasync) {
- aio_complete(ioend->io_iocb, ioend->io_error ?
- ioend->io_error : ioend->io_result, 0);
- }
- }
-
mempool_free(ioend, xfs_ioend_pool);
}
@@ -281,7 +273,6 @@ xfs_alloc_ioend(
* all the I/O from calling the completion routine too early.
*/
atomic_set(&ioend->io_remaining, 1);
- ioend->io_isasync = 0;
ioend->io_isdirect = 0;
ioend->io_error = 0;
ioend->io_list = NULL;
@@ -291,8 +282,6 @@ xfs_alloc_ioend(
ioend->io_buffer_tail = NULL;
ioend->io_offset = 0;
ioend->io_size = 0;
- ioend->io_iocb = NULL;
- ioend->io_result = 0;
ioend->io_append_trans = NULL;
INIT_WORK(&ioend->io_work, xfs_end_io);
@@ -1292,8 +1281,10 @@ __xfs_get_blocks(
if (create || !ISUNWRITTEN(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
if (create && ISUNWRITTEN(&imap)) {
- if (direct)
+ if (direct) {
bh_result->b_private = inode;
+ set_buffer_defer_completion(bh_result);
+ }
set_buffer_unwritten(bh_result);
}
}
@@ -1390,9 +1381,7 @@ xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
ssize_t size,
- void *private,
- int ret,
- bool is_async)
+ void *private)
{
struct xfs_ioend *ioend = iocb->private;
@@ -1414,17 +1403,10 @@ xfs_end_io_direct_write(
ioend->io_offset = offset;
ioend->io_size = size;
- ioend->io_iocb = iocb;
- ioend->io_result = ret;
if (private && size > 0)
ioend->io_type = XFS_IO_UNWRITTEN;
- if (is_async) {
- ioend->io_isasync = 1;
- xfs_finish_ioend(ioend);
- } else {
- xfs_finish_ioend_sync(ioend);
- }
+ xfs_finish_ioend_sync(ioend);
}
STATIC ssize_t
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index c325abb8d61a..f94dd459dff9 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -45,7 +45,6 @@ typedef struct xfs_ioend {
unsigned int io_type; /* delalloc / unwritten */
int io_error; /* I/O error code */
atomic_t io_remaining; /* hold count */
- unsigned int io_isasync : 1; /* needs aio_complete */
unsigned int io_isdirect : 1;/* direct I/O */
struct inode *io_inode; /* file being written to */
struct buffer_head *io_buffer_head;/* buffer linked list head */
@@ -54,8 +53,6 @@ typedef struct xfs_ioend {
xfs_off_t io_offset; /* offset in the file */
struct work_struct io_work; /* xfsdatad work queue */
struct xfs_trans *io_append_trans;/* xact. for size update */
- struct kiocb *io_iocb;
- int io_result;
} xfs_ioend_t;
extern const struct address_space_operations xfs_address_space_operations;