aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c2
-rw-r--r--fs/9p/vfs_file.c3
-rw-r--r--fs/9p/vfs_super.c4
-rw-r--r--fs/afs/dynroot.c7
-rw-r--r--fs/afs/inode.c2
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/binfmt_elf.c16
-rw-r--r--fs/btrfs/extent_io.c13
-rw-r--r--fs/btrfs/file.c13
-rw-r--r--fs/btrfs/inode.c3
-rw-r--r--fs/btrfs/qgroup.c38
-rw-r--r--fs/btrfs/ref-verify.c2
-rw-r--r--fs/btrfs/relocation.c9
-rw-r--r--fs/btrfs/send.c2
-rw-r--r--fs/btrfs/tests/btrfs-tests.c8
-rw-r--r--fs/btrfs/tree-log.c36
-rw-r--r--fs/btrfs/volumes.c14
-rw-r--r--fs/ceph/mds_client.c21
-rw-r--r--fs/cifs/cifs_ioctl.h9
-rw-r--r--fs/cifs/cifsacl.h81
-rw-r--r--fs/cifs/cifsfs.c24
-rw-r--r--fs/cifs/cifsglob.h6
-rw-r--r--fs/cifs/cifsproto.h3
-rw-r--r--fs/cifs/cifssmb.c3
-rw-r--r--fs/cifs/connect.c4
-rw-r--r--fs/cifs/dir.c8
-rw-r--r--fs/cifs/file.c33
-rw-r--r--fs/cifs/inode.c7
-rw-r--r--fs/cifs/ioctl.c29
-rw-r--r--fs/cifs/netmisc.c4
-rw-r--r--fs/cifs/sess.c3
-rw-r--r--fs/cifs/smb2inode.c34
-rw-r--r--fs/cifs/smb2ops.c10
-rw-r--r--fs/cifs/smb2pdu.c23
-rw-r--r--fs/cifs/smb2proto.h7
-rw-r--r--fs/cifs/smbfsctl.h11
-rw-r--r--fs/cifs/xattr.c2
-rw-r--r--fs/debugfs/file.c30
-rw-r--r--fs/debugfs/inode.c32
-rw-r--r--fs/direct-io.c3
-rw-r--r--fs/erofs/data.c10
-rw-r--r--fs/erofs/super.c4
-rw-r--r--fs/erofs/zdata.c12
-rw-r--r--fs/exec.c2
-rw-r--r--fs/ext4/inode.c3
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file_table.c1
-rw-r--r--fs/fs-writeback.c11
-rw-r--r--fs/fuse/Kconfig11
-rw-r--r--fs/fuse/Makefile1
-rw-r--r--fs/fuse/fuse_i.h9
-rw-r--r--fs/fuse/inode.c4
-rw-r--r--fs/fuse/virtio_fs.c1195
-rw-r--r--fs/io_uring.c68
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/libfs.c140
-rw-r--r--fs/locks.c62
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/nfs/dir.c41
-rw-r--r--fs/nfs/direct.c106
-rw-r--r--fs/nfs/filelayout/filelayout.c1
-rw-r--r--fs/nfs/internal.h8
-rw-r--r--fs/nfs/nfs3proc.c45
-rw-r--r--fs/nfs/nfs4_fs.h11
-rw-r--r--fs/nfs/nfs4proc.c316
-rw-r--r--fs/nfs/nfs4state.c22
-rw-r--r--fs/nfs/nfs4xdr.c2
-rw-r--r--fs/nfs/pnfs.c71
-rw-r--r--fs/nfs/pnfs.h17
-rw-r--r--fs/nfs/super.c9
-rw-r--r--fs/nfs/write.c5
-rw-r--r--fs/nfsd/Kconfig3
-rw-r--r--fs/nfsd/Makefile3
-rw-r--r--fs/nfsd/acl.h8
-rw-r--r--fs/nfsd/blocklayout.c3
-rw-r--r--fs/nfsd/export.c13
-rw-r--r--fs/nfsd/filecache.c934
-rw-r--r--fs/nfsd/filecache.h61
-rw-r--r--fs/nfsd/netns.h4
-rw-r--r--fs/nfsd/nfs3proc.c9
-rw-r--r--fs/nfsd/nfs3xdr.c13
-rw-r--r--fs/nfsd/nfs4callback.c35
-rw-r--r--fs/nfsd/nfs4layouts.c12
-rw-r--r--fs/nfsd/nfs4proc.c97
-rw-r--r--fs/nfsd/nfs4recover.c388
-rw-r--r--fs/nfsd/nfs4state.c239
-rw-r--r--fs/nfsd/nfs4xdr.c56
-rw-r--r--fs/nfsd/nfsctl.c1
-rw-r--r--fs/nfsd/nfsproc.c4
-rw-r--r--fs/nfsd/nfssvc.c48
-rw-r--r--fs/nfsd/state.h13
-rw-r--r--fs/nfsd/trace.h140
-rw-r--r--fs/nfsd/vfs.c351
-rw-r--r--fs/nfsd/vfs.h37
-rw-r--r--fs/nfsd/xdr3.h2
-rw-r--r--fs/nfsd/xdr4.h19
-rw-r--r--fs/notify/fsnotify.h2
-rw-r--r--fs/notify/group.c2
-rw-r--r--fs/notify/mark.c6
-rw-r--r--fs/ntfs/mft.c12
-rw-r--r--fs/ntfs/namei.c2
-rw-r--r--fs/ntfs/runlist.c2
-rw-r--r--fs/ntfs/super.c2
-rw-r--r--fs/ocfs2/aops.c25
-rw-r--r--fs/ocfs2/ioctl.c2
-rw-r--r--fs/ocfs2/xattr.c56
-rw-r--r--fs/open.c2
-rw-r--r--fs/proc/kcore.c6
-rw-r--r--fs/readdir.c128
-rw-r--r--fs/reiserfs/do_balan.c15
-rw-r--r--fs/reiserfs/fix_node.c6
-rw-r--r--fs/reiserfs/journal.c22
-rw-r--r--fs/reiserfs/lbalance.c3
-rw-r--r--fs/reiserfs/objectid.c3
-rw-r--r--fs/reiserfs/prints.c3
-rw-r--r--fs/reiserfs/stree.c4
-rw-r--r--fs/statfs.c17
-rw-r--r--fs/super.c5
-rw-r--r--fs/tracefs/inode.c4
-rw-r--r--fs/userfaultfd.c22
-rw-r--r--fs/xfs/libxfs/xfs_ag.c5
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h7
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c21
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c19
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c2
-rw-r--r--fs/xfs/libxfs/xfs_fs.h8
-rw-r--r--fs/xfs/libxfs/xfs_sb.c2
-rw-r--r--fs/xfs/scrub/alloc.c3
-rw-r--r--fs/xfs/scrub/refcount.c3
-rw-r--r--fs/xfs/xfs_bmap_util.c4
-rw-r--r--fs/xfs/xfs_buf.c16
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_sysfs.c13
136 files changed, 4470 insertions, 1241 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 995e332eee5c..eb2151fb6049 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -51,6 +51,8 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
if (!v9ses->cachetag) {
if (v9fs_random_cachetag(v9ses) < 0) {
v9ses->fscache = NULL;
+ kfree(v9ses->cachetag);
+ v9ses->cachetag = NULL;
return;
}
}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 4cc966a31cb3..fe7f0bd2048e 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -513,6 +513,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
v9inode = V9FS_I(inode);
mutex_lock(&v9inode->v_mutex);
if (!v9inode->writeback_fid &&
+ (vma->vm_flags & VM_SHARED) &&
(vma->vm_flags & VM_WRITE)) {
/*
* clone a fid and add it to writeback_fid
@@ -614,6 +615,8 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
(vma->vm_end - vma->vm_start - 1),
};
+ if (!(vma->vm_flags & VM_SHARED))
+ return;
p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index ca243e658d71..74df32be4c6a 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -58,7 +58,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
static int
v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
- int flags, void *data)
+ int flags)
{
int ret;
@@ -132,7 +132,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
retval = PTR_ERR(sb);
goto clunk_fid;
}
- retval = v9fs_fill_super(sb, v9ses, flags, data);
+ retval = v9fs_fill_super(sb, v9ses, flags);
if (retval)
goto release_sb;
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index bcd1bafb0278..4150280509ff 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -10,13 +10,6 @@
#include <linux/dns_resolver.h>
#include "internal.h"
-const struct file_operations afs_dynroot_file_operations = {
- .open = dcache_dir_open,
- .release = dcache_dir_close,
- .iterate_shared = dcache_readdir,
- .llseek = dcache_dir_lseek,
-};
-
/*
* Probe to see if a cell may exist. This prevents positive dentries from
* being created unnecessarily.
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 7b1c18c32f48..46d2d7cb461d 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -443,7 +443,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
inode->i_op = &afs_dynroot_inode_operations;
- inode->i_fop = &afs_dynroot_file_operations;
+ inode->i_fop = &simple_dir_operations;
} else {
inode->i_op = &afs_autocell_inode_operations;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 9cdfabaeaa0b..759e0578012c 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -910,7 +910,6 @@ extern int afs_silly_iput(struct dentry *, struct inode *);
/*
* dynroot.c
*/
-extern const struct file_operations afs_dynroot_file_operations;
extern const struct inode_operations afs_dynroot_inode_operations;
extern const struct dentry_operations afs_dynroot_dentry_operations;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index cec3b4146440..c5642bcb6b46 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -879,7 +879,7 @@ out_free_interp:
the correct location in memory. */
for(i = 0, elf_ppnt = elf_phdata;
i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
- int elf_prot, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE;
+ int elf_prot, elf_flags;
unsigned long k, vaddr;
unsigned long total_size = 0;
@@ -911,13 +911,6 @@ out_free_interp:
*/
}
}
-
- /*
- * Some binaries have overlapping elf segments and then
- * we have to forcefully map over an existing mapping
- * e.g. over this newly established brk mapping.
- */
- elf_fixed = MAP_FIXED;
}
elf_prot = make_prot(elf_ppnt->p_flags);
@@ -930,7 +923,7 @@ out_free_interp:
* the ET_DYN load_addr calculations, proceed normally.
*/
if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
- elf_flags |= elf_fixed;
+ elf_flags |= MAP_FIXED;
} else if (loc->elf_ex.e_type == ET_DYN) {
/*
* This logic is run once for the first LOAD Program
@@ -966,7 +959,7 @@ out_free_interp:
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
- elf_flags |= elf_fixed;
+ elf_flags |= MAP_FIXED;
} else
load_bias = 0;
@@ -1121,7 +1114,8 @@ out_free_interp:
* (since it grows up, and may collide early with the stack
* growing down), and into the unused ELF_ET_DYN_BASE region.
*/
- if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && !interpreter)
+ if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
+ loc->elf_ex.e_type == ET_DYN && !interpreter)
current->mm->brk = current->mm->start_brk =
ELF_ET_DYN_BASE;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7b32b6af322d..cceaf05aada2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3745,12 +3745,21 @@ err_unlock:
static void set_btree_ioerr(struct page *page)
{
struct extent_buffer *eb = (struct extent_buffer *)page->private;
+ struct btrfs_fs_info *fs_info;
SetPageError(page);
if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
/*
+ * If we error out, we should add back the dirty_metadata_bytes
+ * to make it consistent.
+ */
+ fs_info = eb->fs_info;
+ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+ eb->len, fs_info->dirty_metadata_batch);
+
+ /*
* If writeback for a btree extent that doesn't belong to a log tree
* failed, increment the counter transaction->eb_write_errors.
* We do this because while the transaction is running and before it's
@@ -3986,6 +3995,10 @@ retry:
if (!ret) {
free_extent_buffer(eb);
continue;
+ } else if (ret < 0) {
+ done = 1;
+ free_extent_buffer(eb);
+ break;
}
ret = write_one_eb(eb, wbc, &epd);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8fe4eb7e5045..27e5b269e729 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1591,7 +1591,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
- struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
u64 lockstart;
@@ -1611,6 +1610,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
return -ENOMEM;
while (iov_iter_count(i) > 0) {
+ struct extent_state *cached_state = NULL;
size_t offset = offset_in_page(pos);
size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
@@ -1758,9 +1758,20 @@ again:
if (copied > 0)
ret = btrfs_dirty_pages(inode, pages, dirty_pages,
pos, copied, &cached_state);
+
+ /*
+ * If we have not locked the extent range, because the range's
+ * start offset is >= i_size, we might still have a non-NULL
+ * cached extent state, acquired while marking the extent range
+ * as delalloc through btrfs_dirty_pages(). Therefore free any
+ * possible cached extent state to avoid a memory leak.
+ */
if (extents_locked)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state);
+ else
+ free_extent_state(cached_state);
+
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
true);
if (ret) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0546401bc0a..0f2754eaa05b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6305,13 +6305,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
u32 sizes[2];
int nitems = name ? 2 : 1;
unsigned long ptr;
+ unsigned int nofs_flag;
int ret;
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
+ nofs_flag = memalloc_nofs_save();
inode = new_inode(fs_info->sb);
+ memalloc_nofs_restore(nofs_flag);
if (!inode) {
btrfs_free_path(path);
return ERR_PTR(-ENOMEM);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 8d3bd799ac7d..c4bb69941c77 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3166,9 +3166,6 @@ out:
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (!btrfs_fs_closing(fs_info))
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
-
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -3184,16 +3181,30 @@ out:
trans = btrfs_start_transaction(fs_info->quota_root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
+ trans = NULL;
btrfs_err(fs_info,
"fail to start transaction for status update: %d",
err);
- goto done;
}
- ret = update_qgroup_status_item(trans);
- if (ret < 0) {
- err = ret;
- btrfs_err(fs_info, "fail to update qgroup status: %d", err);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (!btrfs_fs_closing(fs_info))
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ if (trans) {
+ ret = update_qgroup_status_item(trans);
+ if (ret < 0) {
+ err = ret;
+ btrfs_err(fs_info, "fail to update qgroup status: %d",
+ err);
+ }
}
+ fs_info->qgroup_rescan_running = false;
+ complete_all(&fs_info->qgroup_rescan_completion);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (!trans)
+ return;
+
btrfs_end_transaction(trans);
if (btrfs_fs_closing(fs_info)) {
@@ -3204,12 +3215,6 @@ out:
} else {
btrfs_err(fs_info, "qgroup scan failed with %d", err);
}
-
-done:
- mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_rescan_running = false;
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- complete_all(&fs_info->qgroup_rescan_completion);
}
/*
@@ -3437,6 +3442,9 @@ cleanup:
while ((unode = ulist_next(&reserved->range_changed, &uiter)))
clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
+ /* Also free data bytes of already reserved one */
+ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
+ orig_reserved, BTRFS_QGROUP_RSV_DATA);
extent_changeset_release(reserved);
return ret;
}
@@ -3481,7 +3489,7 @@ static int qgroup_free_reserved_data(struct inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
free_start, free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index e87cbdad02a3..b57f3618e58e 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -500,7 +500,7 @@ static int process_leaf(struct btrfs_root *root,
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
u32 count;
- int i = 0, tree_block_level = 0, ret;
+ int i = 0, tree_block_level = 0, ret = 0;
struct btrfs_key key;
int nritems = btrfs_header_nritems(leaf);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 2f0e25afa486..00504657b602 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1435,6 +1435,13 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
int clear_rsv = 0;
int ret;
+ /*
+ * The subvolume has reloc tree but the swap is finished, no need to
+ * create/update the dead reloc tree
+ */
+ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
+ return 0;
+
if (root->reloc_root) {
reloc_root = root->reloc_root;
reloc_root->last_trans = trans->transid;
@@ -2187,7 +2194,6 @@ static int clean_dirty_subvols(struct reloc_control *rc)
/* Merged subvolume, cleanup its reloc root */
struct btrfs_root *reloc_root = root->reloc_root;
- clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
list_del_init(&root->reloc_dirty_list);
root->reloc_root = NULL;
if (reloc_root) {
@@ -2196,6 +2202,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
if (ret2 < 0 && !ret)
ret = ret2;
}
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
btrfs_put_fs_root(root);
} else {
/* Orphan reloc tree, just clean it up */
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f3215028235c..123ac54af071 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5085,7 +5085,7 @@ static int clone_range(struct send_ctx *sctx,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
- u64 clone_src_i_size;
+ u64 clone_src_i_size = 0;
/*
* Prevent cloning from a zero offset with a length matching the sector
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index b5e80563efaa..99fe9bf3fdac 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -52,7 +52,13 @@ static struct file_system_type test_type = {
struct inode *btrfs_new_test_inode(void)
{
- return new_inode(test_mnt->mnt_sb);
+ struct inode *inode;
+
+ inode = new_inode(test_mnt->mnt_sb);
+ if (inode)
+ inode_init_owner(inode, NULL, S_IFREG);
+
+ return inode;
}
static int btrfs_init_test_fs(void)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 29b82a795522..8a6cc600bf18 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2932,7 +2932,8 @@ out:
* in the tree of log roots
*/
static int update_log_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *log)
+ struct btrfs_root *log,
+ struct btrfs_root_item *root_item)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret;
@@ -2940,10 +2941,10 @@ static int update_log_root(struct btrfs_trans_handle *trans,
if (log->log_transid == 1) {
/* insert root item on the first sync */
ret = btrfs_insert_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
} else {
ret = btrfs_update_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
}
return ret;
}
@@ -3041,6 +3042,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ struct btrfs_root_item new_root_item;
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
@@ -3104,18 +3106,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * We _must_ update under the root->log_mutex in order to make sure we
+ * have a consistent view of the log root we are trying to commit at
+ * this moment.
+ *
+ * We _must_ copy this into a local copy, because we are not holding the
+ * log_root_tree->log_mutex yet. This is important because when we
+ * commit the log_root_tree we must have a consistent view of the
+ * log_root_tree when we update the super block to point at the
+ * log_root_tree bytenr. If we update the log_root_tree here we'll race
+ * with the commit and possibly point at the new block which we may not
+ * have written out.
+ */
btrfs_set_root_node(&log->root_item, log->node);
+ memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
- * Update or create log root item under the root's log_mutex to prevent
- * races with concurrent log syncs that can lead to failure to update
- * log root item because it was not created yet.
- */
- ret = update_log_root(trans, log);
- /*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
* new positions. so it's safe to allow log writers to go in.
@@ -3135,6 +3145,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
mutex_lock(&log_root_tree->log_mutex);
+
+ /*
+ * Now we are safe to update the log_root_tree because we're under the
+ * log_mutex, and we're a current writer so we're holding the commit
+ * open until we drop the log_mutex.
+ */
+ ret = update_log_root(trans, log, &new_root_item);
+
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
/* atomic_dec_and_test implies a barrier */
cond_wake_up_nomb(&log_root_tree->log_writer_wait);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a324480bc88b..bdfe4493e43a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3845,7 +3845,11 @@ static int alloc_profile_is_valid(u64 flags, int extended)
return !extended; /* "0" is valid for usual profiles */
/* true if exactly one bit set */
- return is_power_of_2(flags);
+ /*
+ * Don't use is_power_of_2(unsigned long) because it won't work
+ * for the single profile (1ULL << 48) on 32-bit CPUs.
+ */
+ return flags != 0 && (flags & (flags - 1)) == 0;
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
@@ -4063,7 +4067,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
}
num_devices = btrfs_num_devices(fs_info);
- allowed = 0;
+
+ /*
+ * SINGLE profile on-disk has no profile bit, but in-memory we have a
+ * special bit for it, to make it easier to distinguish. Thus we need
+ * to set it manually, or balance would refuse the profile.
+ */
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
if (num_devices >= btrfs_raid_array[i].devs_min)
allowed |= btrfs_raid_array[i].bg_flag;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a8a8f84f3bbf..a5163296d9d9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -384,8 +384,8 @@ static int parse_reply_info_readdir(void **p, void *end,
}
done:
- if (*p != end)
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
bad:
@@ -406,12 +406,10 @@ static int parse_reply_info_filelock(void **p, void *end,
goto bad;
info->filelock_reply = *p;
- *p += sizeof(*info->filelock_reply);
- if (unlikely(*p != end))
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
-
bad:
return -EIO;
}
@@ -425,18 +423,21 @@ static int parse_reply_info_create(void **p, void *end,
{
if (features == (u64)-1 ||
(features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
+ /* Malformed reply? */
if (*p == end) {
info->has_create_ino = false;
} else {
info->has_create_ino = true;
- info->ino = ceph_decode_64(p);
+ ceph_decode_64_safe(p, end, info->ino, bad);
}
+ } else {
+ if (*p != end)
+ goto bad;
}
- if (unlikely(*p != end))
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
-
bad:
return -EIO;
}
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index 6c3bd07868d7..0f0dc1c1fe41 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -57,9 +57,18 @@ struct smb_query_info {
/* char buffer[]; */
} __packed;
+struct smb3_key_debug_info {
+ __u64 Suid;
+ __u16 cipher_type;
+ __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */
+ __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
+ __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
+} __packed;
+
#define CIFS_IOCTL_MAGIC 0xCF
#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
#define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info)
#define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array)
#define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info)
+#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info)
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index eb428349f29a..439b99cefeb0 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -90,8 +90,39 @@ struct cifs_acl {
__le32 num_aces;
} __attribute__((packed));
+/* ACE types - see MS-DTYP 2.4.4.1 */
+#define ACCESS_ALLOWED_ACE_TYPE 0x00
+#define ACCESS_DENIED_ACE_TYPE 0x01
+#define SYSTEM_AUDIT_ACE_TYPE 0x02
+#define SYSTEM_ALARM_ACE_TYPE 0x03
+#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04
+#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05
+#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06
+#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07
+#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08
+#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09
+#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A
+#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B
+#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C
+#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D
+#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */
+#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F
+#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */
+#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11
+#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12
+#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13
+
+/* ACE flags */
+#define OBJECT_INHERIT_ACE 0x01
+#define CONTAINER_INHERIT_ACE 0x02
+#define NO_PROPAGATE_INHERIT_ACE 0x04
+#define INHERIT_ONLY_ACE 0x08
+#define INHERITED_ACE 0x10
+#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40
+#define FAILED_ACCESS_ACE_FLAG 0x80
+
struct cifs_ace {
- __u8 type;
+ __u8 type; /* see above and MS-DTYP 2.4.4.1 */
__u8 flags;
__le16 size;
__le32 access_req;
@@ -99,6 +130,54 @@ struct cifs_ace {
} __attribute__((packed));
/*
+ * The current SMB3 form of security descriptor is similar to what was used for
+ * cifs (see above) but some fields are split, and fields in the struct below
+ * matches names of fields to the the spec, MS-DTYP (see sections 2.4.5 and
+ * 2.4.6). Note that "CamelCase" fields are used in this struct in order to
+ * match the MS-DTYP and MS-SMB2 specs which define the wire format.
+ */
+struct smb3_sd {
+ __u8 Revision; /* revision level, MUST be one */
+ __u8 Sbz1; /* only meaningful if 'RM' flag set below */
+ __le16 Control;
+ __le32 OffsetOwner;
+ __le32 OffsetGroup;
+ __le32 OffsetSacl;
+ __le32 OffsetDacl;
+} __packed;
+
+/* Meaning of 'Control' field flags */
+#define ACL_CONTROL_SR 0x0001 /* Self relative */
+#define ACL_CONTROL_RM 0x0002 /* Resource manager control bits */
+#define ACL_CONTROL_PS 0x0004 /* SACL protected from inherits */
+#define ACL_CONTROL_PD 0x0008 /* DACL protected from inherits */
+#define ACL_CONTROL_SI 0x0010 /* SACL Auto-Inherited */
+#define ACL_CONTROL_DI 0x0020 /* DACL Auto-Inherited */
+#define ACL_CONTROL_SC 0x0040 /* SACL computed through inheritance */
+#define ACL_CONTROL_DC 0x0080 /* DACL computed through inheritence */
+#define ACL_CONTROL_SS 0x0100 /* Create server ACL */
+#define ACL_CONTROL_DT 0x0200 /* DACL provided by trusteed source */
+#define ACL_CONTROL_SD 0x0400 /* SACL defaulted */
+#define ACL_CONTROL_SP 0x0800 /* SACL is present on object */
+#define ACL_CONTROL_DD 0x1000 /* DACL defaulted */
+#define ACL_CONTROL_DP 0x2000 /* DACL is present on object */
+#define ACL_CONTROL_GD 0x4000 /* Group was defaulted */
+#define ACL_CONTROL_OD 0x8000 /* User was defaulted */
+
+/* Meaning of AclRevision flags */
+#define ACL_REVISION 0x02 /* See section 2.4.4.1 of MS-DTYP */
+#define ACL_REVISION_DS 0x04 /* Additional AceTypes allowed */
+
+struct smb3_acl {
+ u8 AclRevision; /* revision level */
+ u8 Sbz1; /* MBZ */
+ __le16 AclSize;
+ __le16 AceCount;
+ __le16 Sbz2; /* MBZ */
+} __packed;
+
+
+/*
* Minimum security identifier can be one for system defined Users
* and Groups such as NULL SID and World or Built-in accounts such
* as Administrator and Guest and consists of
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2e9c7f493f99..c049c7b3aa87 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -169,18 +169,26 @@ cifs_read_super(struct super_block *sb)
else
sb->s_maxbytes = MAX_NON_LFS;
- /* BB FIXME fix time_gran to be larger for LANMAN sessions */
- sb->s_time_gran = 100;
-
- if (tcon->unix_ext) {
- ts = cifs_NTtimeToUnix(0);
+ /* Some very old servers like DOS and OS/2 used 2 second granularity */
+ if ((tcon->ses->server->vals->protocol_id == SMB10_PROT_ID) &&
+ ((tcon->ses->capabilities &
+ tcon->ses->server->vals->cap_nt_find) == 0) &&
+ !tcon->unix_ext) {
+ sb->s_time_gran = 1000000000; /* 1 second is max allowed gran */
+ ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0);
sb->s_time_min = ts.tv_sec;
- ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX));
+ ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX),
+ cpu_to_le16(SMB_TIME_MAX), 0);
sb->s_time_max = ts.tv_sec;
} else {
- ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0);
+ /*
+ * Almost every server, including all SMB2+, uses DCE TIME
+ * ie 100 nanosecond units, since 1601. See MS-DTYP and MS-FSCC
+ */
+ sb->s_time_gran = 100;
+ ts = cifs_NTtimeToUnix(0);
sb->s_time_min = ts.tv_sec;
- ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX), cpu_to_le16(SMB_TIME_MAX), 0);
+ ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX));
sb->s_time_max = ts.tv_sec;
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 54e204589cb9..50dfd9049370 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -331,8 +331,9 @@ struct smb_version_operations {
umode_t mode, struct cifs_tcon *tcon,
const char *full_path,
struct cifs_sb_info *cifs_sb);
- int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *,
- struct cifs_sb_info *);
+ int (*mkdir)(const unsigned int xid, struct inode *inode, umode_t mode,
+ struct cifs_tcon *tcon, const char *name,
+ struct cifs_sb_info *sb);
/* set info on created directory */
void (*mkdir_setinfo)(struct inode *, const char *,
struct cifs_sb_info *, struct cifs_tcon *,
@@ -1209,6 +1210,7 @@ struct cifs_search_info {
bool smallBuf:1; /* so we know which buf_release function to call */
};
+#define ACL_NO_MODE ((umode_t)(-1))
struct cifs_open_parms {
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 99b1b1ef558c..e53e9f62b87b 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -372,7 +372,8 @@ extern int CIFSSMBUnixSetPathInfo(const unsigned int xid,
const struct nls_table *nls_codepage,
int remap);
-extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon,
+extern int CIFSSMBMkDir(const unsigned int xid, struct inode *inode,
+ umode_t mode, struct cifs_tcon *tcon,
const char *name, struct cifs_sb_info *cifs_sb);
extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon,
const char *name, struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index dbee2132e419..4f554f019a98 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1078,7 +1078,8 @@ RmDirRetry:
}
int
-CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+CIFSSMBMkDir(const unsigned int xid, struct inode *inode, umode_t mode,
+ struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
int rc = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2850c3ce4391..a64dfa95a925 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -4264,7 +4264,7 @@ static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
server->ops->qfs_tcon(*xid, tcon);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) {
if (tcon->fsDevInfo.DeviceCharacteristics &
- FILE_READ_ONLY_DEVICE)
+ cpu_to_le32(FILE_READ_ONLY_DEVICE))
cifs_dbg(VFS, "mounted to read only share\n");
else if ((cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_RW_CACHE) == 0)
@@ -4445,7 +4445,7 @@ static int setup_dfs_tgt_conn(const char *path,
int rc;
struct dfs_info3_param ref = {0};
char *mdata = NULL, *fake_devname = NULL;
- struct smb_vol fake_vol = {0};
+ struct smb_vol fake_vol = {NULL};
cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index dd5ac841aefa..7ce689d31aa2 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -738,10 +738,16 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
static int
cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
+ struct inode *inode;
+
if (flags & LOOKUP_RCU)
return -ECHILD;
if (d_really_is_positive(direntry)) {
+ inode = d_inode(direntry);
+ if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode)))
+ CIFS_I(inode)->time = 0; /* force reval */
+
if (cifs_revalidate_dentry(direntry))
return 0;
else {
@@ -752,7 +758,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
* attributes will have been updated by
* cifs_revalidate_dentry().
*/
- if (IS_AUTOMOUNT(d_inode(direntry)) &&
+ if (IS_AUTOMOUNT(inode) &&
!(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) {
spin_lock(&direntry->d_lock);
direntry->d_flags |= DCACHE_NEED_AUTOMOUNT;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4b95700c507c..5ad15de2bb4f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -253,6 +253,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
xid, fid);
+ if (rc) {
+ server->ops->close(xid, tcon, fid);
+ if (rc == -ESTALE)
+ rc = -EOPENSTALE;
+ }
+
out:
kfree(buf);
return rc;
@@ -1840,13 +1846,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
{
struct cifsFileInfo *open_file = NULL;
struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
fsuid_only = false;
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
/* we could simply get the first_list_entry since write-only entries
are always at the end of the list but since the first entry might
have a close pending, we go through the whole list */
@@ -1858,7 +1863,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
/* found a good file */
/* lock it so it will not be closed on us */
cifsFileInfo_get(open_file);
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return open_file;
} /* else might as well continue, and look for
another, or simply have the caller reopen it
@@ -1866,7 +1871,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
} else /* write only file */
break; /* write only files are last so must be done */
}
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return NULL;
}
@@ -1877,7 +1882,6 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
{
struct cifsFileInfo *open_file, *inv_file = NULL;
struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *tcon;
bool any_available = false;
int rc = -EBADF;
unsigned int refind = 0;
@@ -1897,16 +1901,15 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
}
cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
- tcon = cifs_sb_master_tcon(cifs_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
fsuid_only = false;
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
refind_writable:
if (refind > MAX_REOPEN_ATT) {
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return rc;
}
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
@@ -1918,7 +1921,7 @@ refind_writable:
if (!open_file->invalidHandle) {
/* found a good writable file */
cifsFileInfo_get(open_file);
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
*ret_file = open_file;
return 0;
} else {
@@ -1938,7 +1941,7 @@ refind_writable:
cifsFileInfo_get(inv_file);
}
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
if (inv_file) {
rc = cifs_reopen_file(inv_file, false);
@@ -1953,7 +1956,7 @@ refind_writable:
cifsFileInfo_put(inv_file);
++refind;
inv_file = NULL;
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
goto refind_writable;
}
@@ -4461,17 +4464,15 @@ static int cifs_readpage(struct file *file, struct page *page)
static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
{
struct cifsFileInfo *open_file;
- struct cifs_tcon *tcon =
- cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb));
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return 1;
}
}
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return 0;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 26cdfbf1e164..5dcc95b38310 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -414,6 +414,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
/* if uniqueid is different, return error */
if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) {
+ CIFS_I(*pinode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgiiu_exit;
}
@@ -421,6 +422,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
/* if filetype is different, return error */
if (unlikely(((*pinode)->i_mode & S_IFMT) !=
(fattr.cf_mode & S_IFMT))) {
+ CIFS_I(*pinode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgiiu_exit;
}
@@ -933,6 +935,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
/* if uniqueid is different, return error */
if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
+ CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgii_exit;
}
@@ -940,6 +943,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
/* if filetype is different, return error */
if (unlikely(((*inode)->i_mode & S_IFMT) !=
(fattr.cf_mode & S_IFMT))) {
+ CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgii_exit;
}
@@ -1622,13 +1626,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
}
/* BB add setting the equivalent of mode via CreateX w/ACLs */
- rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb);
+ rc = server->ops->mkdir(xid, inode, mode, tcon, full_path, cifs_sb);
if (rc) {
cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc);
d_drop(direntry);
goto mkdir_out;
}
+ /* TODO: skip this for smb2/smb3 */
rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon,
xid);
mkdir_out:
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 76ddd98b6298..1a01e108d75e 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -164,6 +164,7 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
{
struct inode *inode = file_inode(filep);
+ struct smb3_key_debug_info pkey_inf;
int rc = -ENOTTY; /* strange error - but the precedent */
unsigned int xid;
struct cifsFileInfo *pSMBFile = filep->private_data;
@@ -270,6 +271,34 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
else
rc = -EOPNOTSUPP;
break;
+ case CIFS_DUMP_KEY:
+ if (pSMBFile == NULL)
+ break;
+ if (!capable(CAP_SYS_ADMIN)) {
+ rc = -EACCES;
+ break;
+ }
+
+ tcon = tlink_tcon(pSMBFile->tlink);
+ if (!smb3_encryption_required(tcon)) {
+ rc = -EOPNOTSUPP;
+ break;
+ }
+ pkey_inf.cipher_type =
+ le16_to_cpu(tcon->ses->server->cipher_type);
+ pkey_inf.Suid = tcon->ses->Suid;
+ memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response,
+ 16 /* SMB2_NTLMV2_SESSKEY_SIZE */);
+ memcpy(pkey_inf.smb3decryptionkey,
+ tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+ memcpy(pkey_inf.smb3encryptionkey,
+ tcon->ses->smb3encryptionkey, SMB3_SIGN_KEY_SIZE);
+ if (copy_to_user((void __user *)arg, &pkey_inf,
+ sizeof(struct smb3_key_debug_info)))
+ rc = -EFAULT;
+ else
+ rc = 0;
+ break;
default:
cifs_dbg(FYI, "unsupported ioctl\n");
break;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 49c17ee18254..9b41436fb8db 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -117,10 +117,6 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = {
{0, 0}
};
-static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
- {0, 0}
-};
-
/*
* Convert a string containing text IPv4 or IPv6 address to binary form.
*
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 4c764ff7edd2..85bd644f9773 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -698,7 +698,6 @@ sess_auth_lanman(struct sess_data *sess_data)
char *bcc_ptr;
struct cifs_ses *ses = sess_data->ses;
char lnm_session_key[CIFS_AUTH_RESP_SIZE];
- __u32 capabilities;
__u16 bytes_remaining;
/* lanman 2 style sessionsetup */
@@ -709,7 +708,7 @@ sess_auth_lanman(struct sess_data *sess_data)
pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
bcc_ptr = sess_data->iov[2].iov_base;
- capabilities = cifs_ssetup_hdr(ses, pSMB);
+ (void)cifs_ssetup_hdr(ses, pSMB);
pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index d2a3fb7e5c8d..4121ac1163ca 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -51,7 +51,7 @@ static int
smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
__u32 desired_access, __u32 create_disposition,
- __u32 create_options, void *ptr, int command,
+ __u32 create_options, umode_t mode, void *ptr, int command,
struct cifsFileInfo *cfile)
{
int rc;
@@ -103,6 +103,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
oparms.fid = &fid;
oparms.reconnect = false;
+ oparms.mode = mode;
memset(&open_iov, 0, sizeof(open_iov));
rqst[num_rqst].rq_iov = open_iov;
@@ -478,7 +479,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
- smb2_data, SMB2_OP_QUERY_INFO, cfile);
+ ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile);
if (rc == -EOPNOTSUPP) {
*symlink = true;
create_options |= OPEN_REPARSE_POINT;
@@ -486,8 +487,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
/* Failed on a symbolic link - query a reparse point info */
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, smb2_data,
- SMB2_OP_QUERY_INFO, NULL);
+ create_options, ACL_NO_MODE,
+ smb2_data, SMB2_OP_QUERY_INFO, NULL);
}
if (rc)
goto out;
@@ -499,12 +500,14 @@ out:
}
int
-smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
+ struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
return smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR, NULL);
+ CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR,
+ NULL);
}
void
@@ -525,8 +528,8 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
cifs_get_writable_path(tcon, name, &cfile);
tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO,
- cfile);
+ CREATE_NOT_FILE, ACL_NO_MODE,
+ &data, SMB2_OP_SET_INFO, cfile);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -536,7 +539,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- CREATE_NOT_FILE,
+ CREATE_NOT_FILE, ACL_NO_MODE,
NULL, SMB2_OP_RMDIR, NULL);
}
@@ -546,7 +549,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
{
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- NULL, SMB2_OP_DELETE, NULL);
+ ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL);
}
static int
@@ -564,7 +567,8 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
goto smb2_rename_path;
}
rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
- FILE_OPEN, 0, smb2_to_name, command, cfile);
+ FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name,
+ command, cfile);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -601,8 +605,8 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
__le64 eof = cpu_to_le64(size);
return smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
- SMB2_OP_SET_EOF, NULL);
+ FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE,
+ &eof, SMB2_OP_SET_EOF, NULL);
}
int
@@ -623,8 +627,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
return PTR_ERR(tlink);
rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path,
- FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
- SMB2_OP_SET_INFO, NULL);
+ FILE_WRITE_ATTRIBUTES, FILE_OPEN,
+ 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, NULL);
cifs_put_tlink(tlink);
return rc;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index eaed18061314..4c0922596467 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -751,6 +751,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
goto oshr_exit;
}
+ atomic_inc(&tcon->num_remote_opens);
+
o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
oparms.fid->persistent_fid = o_rsp->PersistentFileId;
oparms.fid->volatile_fid = o_rsp->VolatileFileId;
@@ -1176,6 +1178,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
rc = compound_send_recv(xid, ses, flags, 3, rqst,
resp_buftype, rsp_iov);
+ /* no need to bump num_remote_opens because handle immediately closed */
sea_exit:
kfree(ea);
@@ -1518,6 +1521,8 @@ smb2_ioctl_query_info(const unsigned int xid,
resp_buftype, rsp_iov);
if (rc)
goto iqinf_exit;
+
+ /* No need to bump num_remote_opens since handle immediately closed */
if (qi.flags & PASSTHRU_FSCTL) {
pqi = (struct smb_query_info __user *)arg;
io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base;
@@ -3328,6 +3333,11 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
return;
+ /* Check if the server granted an oplock rather than a lease */
+ if (oplock & SMB2_OPLOCK_LEVEL_EXCLUSIVE)
+ return smb2_set_oplock_level(cinode, oplock, epoch,
+ purge_cache);
+
if (oplock & SMB2_LEASE_READ_CACHING_HE) {
new_oplock |= CIFS_CACHE_READ_FLG;
strcat(message, "R");
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 87066f1af12c..05149862aea4 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -751,6 +751,8 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
unsigned int num = *num_iovec;
iov[num].iov_base = create_posix_buf(mode);
+ if (mode == ACL_NO_MODE)
+ cifs_dbg(FYI, "illegal mode\n");
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_posix);
@@ -2352,6 +2354,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
rqst.rq_iov = iov;
rqst.rq_nvec = n_iov;
+ /* no need to inc num_remote_opens because we close it just below */
trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE,
FILE_WRITE_ATTRIBUTES);
/* resource #4: response buffer */
@@ -2416,6 +2419,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock,
/* File attributes ignored on open (used in create though) */
req->FileAttributes = cpu_to_le32(file_attributes);
req->ShareAccess = FILE_SHARE_ALL_LE;
+
req->CreateDisposition = cpu_to_le32(oparms->disposition);
req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK);
req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req));
@@ -2517,6 +2521,20 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock,
return rc;
}
+ if ((oparms->disposition == FILE_CREATE) &&
+ (oparms->mode != ACL_NO_MODE)) {
+ if (n_iov > 2) {
+ struct create_context *ccontext =
+ (struct create_context *)iov[n_iov-1].iov_base;
+ ccontext->Next =
+ cpu_to_le32(iov[n_iov-1].iov_len);
+ }
+
+ /* rc = add_sd_context(iov, &n_iov, oparms->mode); */
+ if (rc)
+ return rc;
+ }
+
if (n_iov > 2) {
struct create_context *ccontext =
(struct create_context *)iov[n_iov-1].iov_base;
@@ -3180,7 +3198,7 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
* See MS-SMB2 2.2.35 and 2.2.36
*/
-int
+static int
SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid,
u32 completion_filter, bool watch_tree)
@@ -3196,7 +3214,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
- req->OutputBufferLength = SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE;
+ req->OutputBufferLength =
+ cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE);
req->CompletionFilter = cpu_to_le32(completion_filter);
if (watch_tree)
req->Flags = cpu_to_le16(SMB2_WATCH_TREE);
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 67a91b11fd59..71b2930b8e0b 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -84,7 +84,8 @@ extern int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
umode_t mode, struct cifs_tcon *tcon,
const char *full_path,
struct cifs_sb_info *cifs_sb);
-extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon,
+extern int smb2_mkdir(const unsigned int xid, struct inode *inode,
+ umode_t mode, struct cifs_tcon *tcon,
const char *name, struct cifs_sb_info *cifs_sb);
extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path,
struct cifs_sb_info *cifs_sb,
@@ -149,6 +150,10 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
bool is_fsctl, char *in_data, u32 indatalen,
__u32 max_response_size);
extern void SMB2_ioctl_free(struct smb_rqst *rqst);
+extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, bool watch_tree,
+ u32 completion_filter);
+
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 08628e6a42ac..1ff28529cf4b 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -144,6 +144,17 @@
#define IO_REPARSE_APPXSTREAM 0xC0000014
/* NFS symlinks, Win 8/SMB3 and later */
#define IO_REPARSE_TAG_NFS 0x80000014
+/*
+ * AzureFileSync - see
+ * https://docs.microsoft.com/en-us/azure/storage/files/storage-sync-cloud-tiering
+ */
+#define IO_REPARSE_TAG_AZ_FILE_SYNC 0x8000001e
+/* WSL reparse tags */
+#define IO_REPARSE_TAG_LX_SYMLINK 0xA000001D
+#define IO_REPARSE_TAG_AF_UNIX 0x80000023
+#define IO_REPARSE_TAG_LX_FIFO 0x80000024
+#define IO_REPARSE_TAG_LX_CHR 0x80000025
+#define IO_REPARSE_TAG_LX_BLK 0x80000026
/* fsctl flags */
/* If Flags is set to this value, the request is an FSCTL not ioctl request */
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 9076150758d8..db4ba8f6077e 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -31,7 +31,7 @@
#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
-#define MAX_EA_VALUE_SIZE 65535
+#define MAX_EA_VALUE_SIZE CIFSMaxBufSize
#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
#define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */
#define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 93e4ca6b2ad7..87846aad594b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -19,6 +19,7 @@
#include <linux/atomic.h>
#include <linux/device.h>
#include <linux/poll.h>
+#include <linux/security.h>
#include "internal.h"
@@ -136,6 +137,25 @@ void debugfs_file_put(struct dentry *dentry)
}
EXPORT_SYMBOL_GPL(debugfs_file_put);
+/*
+ * Only permit access to world-readable files when the kernel is locked down.
+ * We also need to exclude any file that has ways to write or alter it as root
+ * can bypass the permissions check.
+ */
+static bool debugfs_is_locked_down(struct inode *inode,
+ struct file *filp,
+ const struct file_operations *real_fops)
+{
+ if ((inode->i_mode & 07777) == 0444 &&
+ !(filp->f_mode & FMODE_WRITE) &&
+ !real_fops->unlocked_ioctl &&
+ !real_fops->compat_ioctl &&
+ !real_fops->mmap)
+ return false;
+
+ return security_locked_down(LOCKDOWN_DEBUGFS);
+}
+
static int open_proxy_open(struct inode *inode, struct file *filp)
{
struct dentry *dentry = F_DENTRY(filp);
@@ -147,6 +167,11 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
return r == -EIO ? -ENOENT : r;
real_fops = debugfs_real_fops(filp);
+
+ r = debugfs_is_locked_down(inode, filp, real_fops);
+ if (r)
+ goto out;
+
real_fops = fops_get(real_fops);
if (!real_fops) {
/* Huh? Module did not clean up after itself at exit? */
@@ -272,6 +297,11 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
return r == -EIO ? -ENOENT : r;
real_fops = debugfs_real_fops(filp);
+
+ r = debugfs_is_locked_down(inode, filp, real_fops);
+ if (r)
+ goto out;
+
real_fops = fops_get(real_fops);
if (!real_fops) {
/* Huh? Module did not cleanup after itself at exit? */
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 042b688ed124..7b975dbb2bb4 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -26,6 +26,7 @@
#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#include <linux/security.h>
#include "internal.h"
@@ -35,6 +36,32 @@ static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
+/*
+ * Don't allow access attributes to be changed whilst the kernel is locked down
+ * so that we can use the file mode as part of a heuristic to determine whether
+ * to lock down individual files.
+ */
+static int debugfs_setattr(struct dentry *dentry, struct iattr *ia)
+{
+ int ret = security_locked_down(LOCKDOWN_DEBUGFS);
+
+ if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
+ return ret;
+ return simple_setattr(dentry, ia);
+}
+
+static const struct inode_operations debugfs_file_inode_operations = {
+ .setattr = debugfs_setattr,
+};
+static const struct inode_operations debugfs_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .setattr = debugfs_setattr,
+};
+static const struct inode_operations debugfs_symlink_inode_operations = {
+ .get_link = simple_get_link,
+ .setattr = debugfs_setattr,
+};
+
static struct inode *debugfs_get_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -369,6 +396,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
inode->i_mode = mode;
inode->i_private = data;
+ inode->i_op = &debugfs_file_inode_operations;
inode->i_fop = proxy_fops;
dentry->d_fsdata = (void *)((unsigned long)real_fops |
DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
@@ -532,7 +560,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
}
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
- inode->i_op = &simple_dir_inode_operations;
+ inode->i_op = &debugfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -632,7 +660,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
return failed_creating(dentry);
}
inode->i_mode = S_IFLNK | S_IRWXUGO;
- inode->i_op = &simple_symlink_inode_operations;
+ inode->i_op = &debugfs_symlink_inode_operations;
inode->i_link = link;
d_instantiate(dentry, inode);
return end_creating(dentry);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ae196784f487..9329ced91f1d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -241,9 +241,8 @@ void dio_warn_stale_pagecache(struct file *filp)
}
}
-/**
+/*
* dio_complete() - called when all DIO BIO I/O has been completed
- * @offset: the byte offset in the file of the completed operation
*
* This drops i_dio_count, lets interested parties know that a DIO operation
* has completed, and calculates the resulting return code for the operation.
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 8a9fcbd0e8ac..fc3a8d8064f8 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -34,11 +34,15 @@ static void erofs_readendio(struct bio *bio)
struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr)
{
- struct inode *const bd_inode = sb->s_bdev->bd_inode;
- struct address_space *const mapping = bd_inode->i_mapping;
+ struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct page *page;
- return read_cache_page_gfp(mapping, blkaddr,
+ page = read_cache_page_gfp(mapping, blkaddr,
mapping_gfp_constraint(mapping, ~__GFP_FS));
+ /* should already be PageUptodate */
+ if (!IS_ERR(page))
+ lock_page(page);
+ return page;
}
static int erofs_map_blocks_flatmode(struct inode *inode,
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index caf9a95173b0..0e369494f2f2 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -105,9 +105,9 @@ static int erofs_read_superblock(struct super_block *sb)
int ret;
page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL);
- if (!page) {
+ if (IS_ERR(page)) {
erofs_err(sb, "cannot read erofs superblock");
- return -EIO;
+ return PTR_ERR(page);
}
sbi = EROFS_SB(sb);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 96e34c90f814..fad80c97d247 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -575,7 +575,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct erofs_map_blocks *const map = &fe->map;
struct z_erofs_collector *const clt = &fe->clt;
const loff_t offset = page_offset(page);
- bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED);
+ bool tight = true;
enum z_erofs_cache_alloctype cache_strategy;
enum z_erofs_page_type page_type;
@@ -628,8 +628,16 @@ restart_now:
preload_compressed_pages(clt, MNGD_MAPPING(sbi),
cache_strategy, pagepool);
- tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED);
hitted:
+ /*
+ * Ensure the current partial page belongs to this submit chain rather
+ * than other concurrent submit chains or the noio(bypass) chain since
+ * those chains are handled asynchronously thus the page cannot be used
+ * for inplace I/O or pagevec (should be processed in strict order.)
+ */
+ tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
+ clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
+
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
zero_user_segment(page, cur, end);
diff --git a/fs/exec.c b/fs/exec.c
index f7f6a140856a..555e93c7dec8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm)
}
task_lock(tsk);
active_mm = tsk->active_mm;
+ membarrier_exec_mmap(mm);
tsk->mm = mm;
tsk->active_mm = mm;
activate_mm(active_mm, mm);
@@ -1825,7 +1826,6 @@ static int __do_execve_file(int fd, struct filename *filename,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
- membarrier_execve(current);
rseq_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 123e3dee7733..516faa280ced 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4551,6 +4551,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
struct buffer_head *bh;
struct super_block *sb = inode->i_sb;
ext4_fsblk_t block;
+ struct blk_plug plug;
int inodes_per_block, inode_offset;
iloc->bh = NULL;
@@ -4639,6 +4640,7 @@ make_io:
* If we need to do any I/O, try to pre-readahead extra
* blocks from the inode table.
*/
+ blk_start_plug(&plug);
if (EXT4_SB(sb)->s_inode_readahead_blks) {
ext4_fsblk_t b, end, table;
unsigned num;
@@ -4669,6 +4671,7 @@ make_io:
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+ blk_finish_plug(&plug);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
EXT4_ERROR_INODE_BLOCK(inode, block,
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 814ad2c2ba80..054acd9fd033 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -88,9 +88,7 @@ static int fat__get_entry(struct inode *dir, loff_t *pos,
int err, offset;
next:
- if (*bh)
- brelse(*bh);
-
+ brelse(*bh);
*bh = NULL;
iblock = *pos >> sb->s_blocksize_bits;
err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 0ee727485615..01263ffbc4c0 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -246,7 +246,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
* sys_open_by_handle_at: Open the file handle
* @mountdirfd: directory file descriptor
* @handle: file handle to be opened
- * @flag: open flags.
+ * @flags: open flags.
*
* @mountdirfd indicate the directory file descriptor
* of the mount point. file handle is decoded relative
diff --git a/fs/file_table.c b/fs/file_table.c
index b07b53f24ff5..30d55c9a1744 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -327,6 +327,7 @@ void flush_delayed_fput(void)
{
delayed_fput(NULL);
}
+EXPORT_SYMBOL_GPL(flush_delayed_fput);
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8aaa7eec7b74..8461a6322039 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -164,8 +164,13 @@ static void finish_writeback_work(struct bdi_writeback *wb,
if (work->auto_free)
kfree(work);
- if (done && atomic_dec_and_test(&done->cnt))
- wake_up_all(done->waitq);
+ if (done) {
+ wait_queue_head_t *waitq = done->waitq;
+
+ /* @done can't be accessed after the following dec */
+ if (atomic_dec_and_test(&done->cnt))
+ wake_up_all(waitq);
+ }
}
static void wb_queue_work(struct bdi_writeback *wb,
@@ -900,7 +905,7 @@ restart:
* cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
* @bdi_id: target bdi id
* @memcg_id: target memcg css id
- * @nr_pages: number of pages to write, 0 for best-effort dirty flushing
+ * @nr: number of pages to write, 0 for best-effort dirty flushing
* @reason: reason why some writeback work initiated
* @done: target wb_completion
*
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 24fc5a5c1b97..0635cba19971 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -27,3 +27,14 @@ config CUSE
If you want to develop or use a userspace character device
based on CUSE, answer Y or M.
+
+config VIRTIO_FS
+ tristate "Virtio Filesystem"
+ depends on FUSE_FS
+ select VIRTIO
+ help
+ The Virtio Filesystem allows guests to mount file systems from the
+ host.
+
+ If you want to share files between guests or with the host, answer Y
+ or M.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 9485019c2a14..6419a2b3510d 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -5,5 +5,6 @@
obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
+obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o
fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fc89cb40e874..956aeaf961ae 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -353,6 +353,10 @@ struct fuse_req {
/** Used to wake up the task waiting for completion of request*/
wait_queue_head_t waitq;
+#if IS_ENABLED(CONFIG_VIRTIO_FS)
+ /** virtio-fs's physically contiguous buffer for in and out args */
+ void *argbuf;
+#endif
};
struct fuse_iqueue;
@@ -383,6 +387,11 @@ struct fuse_iqueue_ops {
*/
void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
__releases(fiq->lock);
+
+ /**
+ * Clean up when fuse_iqueue is destroyed
+ */
+ void (*release)(struct fuse_iqueue *fiq);
};
/** /dev/fuse input queue operations */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 51cb471f4dc3..e040e2a2b621 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -630,6 +630,10 @@ EXPORT_SYMBOL_GPL(fuse_conn_init);
void fuse_conn_put(struct fuse_conn *fc)
{
if (refcount_dec_and_test(&fc->count)) {
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ if (fiq->ops->release)
+ fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
put_user_ns(fc->user_ns);
fc->release(fc);
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
new file mode 100644
index 000000000000..6af3f131e468
--- /dev/null
+++ b/fs/fuse/virtio_fs.c
@@ -0,0 +1,1195 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio-fs: Virtio Filesystem
+ * Copyright (C) 2018 Red Hat, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/virtio.h>
+#include <linux/virtio_fs.h>
+#include <linux/delay.h>
+#include <linux/fs_context.h>
+#include <linux/highmem.h>
+#include "fuse_i.h"
+
+/* List of virtio-fs device instances and a lock for the list. Also provides
+ * mutual exclusion in device removal and mounting path
+ */
+static DEFINE_MUTEX(virtio_fs_mutex);
+static LIST_HEAD(virtio_fs_instances);
+
+enum {
+ VQ_HIPRIO,
+ VQ_REQUEST
+};
+
+/* Per-virtqueue state */
+struct virtio_fs_vq {
+ spinlock_t lock;
+ struct virtqueue *vq; /* protected by ->lock */
+ struct work_struct done_work;
+ struct list_head queued_reqs;
+ struct delayed_work dispatch_work;
+ struct fuse_dev *fud;
+ bool connected;
+ long in_flight;
+ char name[24];
+} ____cacheline_aligned_in_smp;
+
+/* A virtio-fs device instance */
+struct virtio_fs {
+ struct kref refcount;
+ struct list_head list; /* on virtio_fs_instances */
+ char *tag;
+ struct virtio_fs_vq *vqs;
+ unsigned int nvqs; /* number of virtqueues */
+ unsigned int num_request_queues; /* number of request queues */
+};
+
+struct virtio_fs_forget {
+ struct fuse_in_header ih;
+ struct fuse_forget_in arg;
+ /* This request can be temporarily queued on virt queue */
+ struct list_head list;
+};
+
+static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
+{
+ struct virtio_fs *fs = vq->vdev->priv;
+
+ return &fs->vqs[vq->index];
+}
+
+static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq)
+{
+ return &vq_to_fsvq(vq)->fud->pq;
+}
+
+static void release_virtio_fs_obj(struct kref *ref)
+{
+ struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount);
+
+ kfree(vfs->vqs);
+ kfree(vfs);
+}
+
+/* Make sure virtiofs_mutex is held */
+static void virtio_fs_put(struct virtio_fs *fs)
+{
+ kref_put(&fs->refcount, release_virtio_fs_obj);
+}
+
+static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
+{
+ struct virtio_fs *vfs = fiq->priv;
+
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_put(vfs);
+ mutex_unlock(&virtio_fs_mutex);
+}
+
+static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
+{
+ WARN_ON(fsvq->in_flight < 0);
+
+ /* Wait for in flight requests to finish.*/
+ while (1) {
+ spin_lock(&fsvq->lock);
+ if (!fsvq->in_flight) {
+ spin_unlock(&fsvq->lock);
+ break;
+ }
+ spin_unlock(&fsvq->lock);
+ /* TODO use completion instead of timeout */
+ usleep_range(1000, 2000);
+ }
+
+ flush_work(&fsvq->done_work);
+ flush_delayed_work(&fsvq->dispatch_work);
+}
+
+static inline void drain_hiprio_queued_reqs(struct virtio_fs_vq *fsvq)
+{
+ struct virtio_fs_forget *forget;
+
+ spin_lock(&fsvq->lock);
+ while (1) {
+ forget = list_first_entry_or_null(&fsvq->queued_reqs,
+ struct virtio_fs_forget, list);
+ if (!forget)
+ break;
+ list_del(&forget->list);
+ kfree(forget);
+ }
+ spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_drain_all_queues(struct virtio_fs *fs)
+{
+ struct virtio_fs_vq *fsvq;
+ int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ fsvq = &fs->vqs[i];
+ if (i == VQ_HIPRIO)
+ drain_hiprio_queued_reqs(fsvq);
+
+ virtio_fs_drain_queue(fsvq);
+ }
+}
+
+static void virtio_fs_start_all_queues(struct virtio_fs *fs)
+{
+ struct virtio_fs_vq *fsvq;
+ int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ fsvq = &fs->vqs[i];
+ spin_lock(&fsvq->lock);
+ fsvq->connected = true;
+ spin_unlock(&fsvq->lock);
+ }
+}
+
+/* Add a new instance to the list or return -EEXIST if tag name exists*/
+static int virtio_fs_add_instance(struct virtio_fs *fs)
+{
+ struct virtio_fs *fs2;
+ bool duplicate = false;
+
+ mutex_lock(&virtio_fs_mutex);
+
+ list_for_each_entry(fs2, &virtio_fs_instances, list) {
+ if (strcmp(fs->tag, fs2->tag) == 0)
+ duplicate = true;
+ }
+
+ if (!duplicate)
+ list_add_tail(&fs->list, &virtio_fs_instances);
+
+ mutex_unlock(&virtio_fs_mutex);
+
+ if (duplicate)
+ return -EEXIST;
+ return 0;
+}
+
+/* Return the virtio_fs with a given tag, or NULL */
+static struct virtio_fs *virtio_fs_find_instance(const char *tag)
+{
+ struct virtio_fs *fs;
+
+ mutex_lock(&virtio_fs_mutex);
+
+ list_for_each_entry(fs, &virtio_fs_instances, list) {
+ if (strcmp(fs->tag, tag) == 0) {
+ kref_get(&fs->refcount);
+ goto found;
+ }
+ }
+
+ fs = NULL; /* not found */
+
+found:
+ mutex_unlock(&virtio_fs_mutex);
+
+ return fs;
+}
+
+static void virtio_fs_free_devs(struct virtio_fs *fs)
+{
+ unsigned int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+ if (!fsvq->fud)
+ continue;
+
+ fuse_dev_free(fsvq->fud);
+ fsvq->fud = NULL;
+ }
+}
+
+/* Read filesystem name from virtio config into fs->tag (must kfree()). */
+static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+ char tag_buf[sizeof_field(struct virtio_fs_config, tag)];
+ char *end;
+ size_t len;
+
+ virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag),
+ &tag_buf, sizeof(tag_buf));
+ end = memchr(tag_buf, '\0', sizeof(tag_buf));
+ if (end == tag_buf)
+ return -EINVAL; /* empty tag */
+ if (!end)
+ end = &tag_buf[sizeof(tag_buf)];
+
+ len = end - tag_buf;
+ fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL);
+ if (!fs->tag)
+ return -ENOMEM;
+ memcpy(fs->tag, tag_buf, len);
+ fs->tag[len] = '\0';
+ return 0;
+}
+
+/* Work function for hiprio completion */
+static void virtio_fs_hiprio_done_work(struct work_struct *work)
+{
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ done_work);
+ struct virtqueue *vq = fsvq->vq;
+
+ /* Free completed FUSE_FORGET requests */
+ spin_lock(&fsvq->lock);
+ do {
+ unsigned int len;
+ void *req;
+
+ virtqueue_disable_cb(vq);
+
+ while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+ kfree(req);
+ fsvq->in_flight--;
+ }
+ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+ spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_dummy_dispatch_work(struct work_struct *work)
+{
+}
+
+static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
+{
+ struct virtio_fs_forget *forget;
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ dispatch_work.work);
+ struct virtqueue *vq = fsvq->vq;
+ struct scatterlist sg;
+ struct scatterlist *sgs[] = {&sg};
+ bool notify;
+ int ret;
+
+ pr_debug("virtio-fs: worker %s called.\n", __func__);
+ while (1) {
+ spin_lock(&fsvq->lock);
+ forget = list_first_entry_or_null(&fsvq->queued_reqs,
+ struct virtio_fs_forget, list);
+ if (!forget) {
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+
+ list_del(&forget->list);
+ if (!fsvq->connected) {
+ spin_unlock(&fsvq->lock);
+ kfree(forget);
+ continue;
+ }
+
+ sg_init_one(&sg, forget, sizeof(*forget));
+
+ /* Enqueue the request */
+ dev_dbg(&vq->vdev->dev, "%s\n", __func__);
+ ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOSPC) {
+ pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
+ ret);
+ list_add_tail(&forget->list,
+ &fsvq->queued_reqs);
+ schedule_delayed_work(&fsvq->dispatch_work,
+ msecs_to_jiffies(1));
+ } else {
+ pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
+ ret);
+ kfree(forget);
+ }
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+
+ fsvq->in_flight++;
+ notify = virtqueue_kick_prepare(vq);
+ spin_unlock(&fsvq->lock);
+
+ if (notify)
+ virtqueue_notify(vq);
+ pr_debug("virtio-fs: worker %s dispatched one forget request.\n",
+ __func__);
+ }
+}
+
+/* Allocate and copy args into req->argbuf */
+static int copy_args_to_argbuf(struct fuse_req *req)
+{
+ struct fuse_args *args = req->args;
+ unsigned int offset = 0;
+ unsigned int num_in;
+ unsigned int num_out;
+ unsigned int len;
+ unsigned int i;
+
+ num_in = args->in_numargs - args->in_pages;
+ num_out = args->out_numargs - args->out_pages;
+ len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) +
+ fuse_len_args(num_out, args->out_args);
+
+ req->argbuf = kmalloc(len, GFP_ATOMIC);
+ if (!req->argbuf)
+ return -ENOMEM;
+
+ for (i = 0; i < num_in; i++) {
+ memcpy(req->argbuf + offset,
+ args->in_args[i].value,
+ args->in_args[i].size);
+ offset += args->in_args[i].size;
+ }
+
+ return 0;
+}
+
+/* Copy args out of and free req->argbuf */
+static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
+{
+ unsigned int remaining;
+ unsigned int offset;
+ unsigned int num_in;
+ unsigned int num_out;
+ unsigned int i;
+
+ remaining = req->out.h.len - sizeof(req->out.h);
+ num_in = args->in_numargs - args->in_pages;
+ num_out = args->out_numargs - args->out_pages;
+ offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args);
+
+ for (i = 0; i < num_out; i++) {
+ unsigned int argsize = args->out_args[i].size;
+
+ if (args->out_argvar &&
+ i == args->out_numargs - 1 &&
+ argsize > remaining) {
+ argsize = remaining;
+ }
+
+ memcpy(args->out_args[i].value, req->argbuf + offset, argsize);
+ offset += argsize;
+
+ if (i != args->out_numargs - 1)
+ remaining -= argsize;
+ }
+
+ /* Store the actual size of the variable-length arg */
+ if (args->out_argvar)
+ args->out_args[args->out_numargs - 1].size = remaining;
+
+ kfree(req->argbuf);
+ req->argbuf = NULL;
+}
+
+/* Work function for request completion */
+static void virtio_fs_requests_done_work(struct work_struct *work)
+{
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ done_work);
+ struct fuse_pqueue *fpq = &fsvq->fud->pq;
+ struct fuse_conn *fc = fsvq->fud->fc;
+ struct virtqueue *vq = fsvq->vq;
+ struct fuse_req *req;
+ struct fuse_args_pages *ap;
+ struct fuse_req *next;
+ struct fuse_args *args;
+ unsigned int len, i, thislen;
+ struct page *page;
+ LIST_HEAD(reqs);
+
+ /* Collect completed requests off the virtqueue */
+ spin_lock(&fsvq->lock);
+ do {
+ virtqueue_disable_cb(vq);
+
+ while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+ spin_lock(&fpq->lock);
+ list_move_tail(&req->list, &reqs);
+ spin_unlock(&fpq->lock);
+ }
+ } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+ spin_unlock(&fsvq->lock);
+
+ /* End requests */
+ list_for_each_entry_safe(req, next, &reqs, list) {
+ /*
+ * TODO verify that server properly follows FUSE protocol
+ * (oh.uniq, oh.len)
+ */
+ args = req->args;
+ copy_args_from_argbuf(args, req);
+
+ if (args->out_pages && args->page_zeroing) {
+ len = args->out_args[args->out_numargs - 1].size;
+ ap = container_of(args, typeof(*ap), args);
+ for (i = 0; i < ap->num_pages; i++) {
+ thislen = ap->descs[i].length;
+ if (len < thislen) {
+ WARN_ON(ap->descs[i].offset);
+ page = ap->pages[i];
+ zero_user_segment(page, len, thislen);
+ len = 0;
+ } else {
+ len -= thislen;
+ }
+ }
+ }
+
+ spin_lock(&fpq->lock);
+ clear_bit(FR_SENT, &req->flags);
+ list_del_init(&req->list);
+ spin_unlock(&fpq->lock);
+
+ fuse_request_end(fc, req);
+ spin_lock(&fsvq->lock);
+ fsvq->in_flight--;
+ spin_unlock(&fsvq->lock);
+ }
+}
+
+/* Virtqueue interrupt handler */
+static void virtio_fs_vq_done(struct virtqueue *vq)
+{
+ struct virtio_fs_vq *fsvq = vq_to_fsvq(vq);
+
+ dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name);
+
+ schedule_work(&fsvq->done_work);
+}
+
+/* Initialize virtqueues */
+static int virtio_fs_setup_vqs(struct virtio_device *vdev,
+ struct virtio_fs *fs)
+{
+ struct virtqueue **vqs;
+ vq_callback_t **callbacks;
+ const char **names;
+ unsigned int i;
+ int ret = 0;
+
+ virtio_cread(vdev, struct virtio_fs_config, num_request_queues,
+ &fs->num_request_queues);
+ if (fs->num_request_queues == 0)
+ return -EINVAL;
+
+ fs->nvqs = 1 + fs->num_request_queues;
+ fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
+ if (!fs->vqs)
+ return -ENOMEM;
+
+ vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL);
+ callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
+ GFP_KERNEL);
+ names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
+ if (!vqs || !callbacks || !names) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
+ snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
+ "hiprio");
+ names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
+ INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
+ INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
+ INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
+ virtio_fs_hiprio_dispatch_work);
+ spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
+
+ /* Initialize the requests virtqueues */
+ for (i = VQ_REQUEST; i < fs->nvqs; i++) {
+ spin_lock_init(&fs->vqs[i].lock);
+ INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
+ INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
+ virtio_fs_dummy_dispatch_work);
+ INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
+ snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
+ "requests.%u", i - VQ_REQUEST);
+ callbacks[i] = virtio_fs_vq_done;
+ names[i] = fs->vqs[i].name;
+ }
+
+ ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
+ if (ret < 0)
+ goto out;
+
+ for (i = 0; i < fs->nvqs; i++)
+ fs->vqs[i].vq = vqs[i];
+
+ virtio_fs_start_all_queues(fs);
+out:
+ kfree(names);
+ kfree(callbacks);
+ kfree(vqs);
+ if (ret)
+ kfree(fs->vqs);
+ return ret;
+}
+
+/* Free virtqueues (device must already be reset) */
+static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
+ struct virtio_fs *fs)
+{
+ vdev->config->del_vqs(vdev);
+}
+
+static int virtio_fs_probe(struct virtio_device *vdev)
+{
+ struct virtio_fs *fs;
+ int ret;
+
+ fs = kzalloc(sizeof(*fs), GFP_KERNEL);
+ if (!fs)
+ return -ENOMEM;
+ kref_init(&fs->refcount);
+ vdev->priv = fs;
+
+ ret = virtio_fs_read_tag(vdev, fs);
+ if (ret < 0)
+ goto out;
+
+ ret = virtio_fs_setup_vqs(vdev, fs);
+ if (ret < 0)
+ goto out;
+
+ /* TODO vq affinity */
+
+ /* Bring the device online in case the filesystem is mounted and
+ * requests need to be sent before we return.
+ */
+ virtio_device_ready(vdev);
+
+ ret = virtio_fs_add_instance(fs);
+ if (ret < 0)
+ goto out_vqs;
+
+ return 0;
+
+out_vqs:
+ vdev->config->reset(vdev);
+ virtio_fs_cleanup_vqs(vdev, fs);
+
+out:
+ vdev->priv = NULL;
+ kfree(fs);
+ return ret;
+}
+
+static void virtio_fs_stop_all_queues(struct virtio_fs *fs)
+{
+ struct virtio_fs_vq *fsvq;
+ int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ fsvq = &fs->vqs[i];
+ spin_lock(&fsvq->lock);
+ fsvq->connected = false;
+ spin_unlock(&fsvq->lock);
+ }
+}
+
+static void virtio_fs_remove(struct virtio_device *vdev)
+{
+ struct virtio_fs *fs = vdev->priv;
+
+ mutex_lock(&virtio_fs_mutex);
+ /* This device is going away. No one should get new reference */
+ list_del_init(&fs->list);
+ virtio_fs_stop_all_queues(fs);
+ virtio_fs_drain_all_queues(fs);
+ vdev->config->reset(vdev);
+ virtio_fs_cleanup_vqs(vdev, fs);
+
+ vdev->priv = NULL;
+ /* Put device reference on virtio_fs object */
+ virtio_fs_put(fs);
+ mutex_unlock(&virtio_fs_mutex);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int virtio_fs_freeze(struct virtio_device *vdev)
+{
+ /* TODO need to save state here */
+ pr_warn("virtio-fs: suspend/resume not yet supported\n");
+ return -EOPNOTSUPP;
+}
+
+static int virtio_fs_restore(struct virtio_device *vdev)
+{
+ /* TODO need to restore state here */
+ return 0;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+const static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID },
+ {},
+};
+
+const static unsigned int feature_table[] = {};
+
+static struct virtio_driver virtio_fs_driver = {
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .feature_table = feature_table,
+ .feature_table_size = ARRAY_SIZE(feature_table),
+ .probe = virtio_fs_probe,
+ .remove = virtio_fs_remove,
+#ifdef CONFIG_PM_SLEEP
+ .freeze = virtio_fs_freeze,
+ .restore = virtio_fs_restore,
+#endif
+};
+
+static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+ struct fuse_forget_link *link;
+ struct virtio_fs_forget *forget;
+ struct scatterlist sg;
+ struct scatterlist *sgs[] = {&sg};
+ struct virtio_fs *fs;
+ struct virtqueue *vq;
+ struct virtio_fs_vq *fsvq;
+ bool notify;
+ u64 unique;
+ int ret;
+
+ link = fuse_dequeue_forget(fiq, 1, NULL);
+ unique = fuse_get_unique(fiq);
+
+ fs = fiq->priv;
+ fsvq = &fs->vqs[VQ_HIPRIO];
+ spin_unlock(&fiq->lock);
+
+ /* Allocate a buffer for the request */
+ forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL);
+
+ forget->ih = (struct fuse_in_header){
+ .opcode = FUSE_FORGET,
+ .nodeid = link->forget_one.nodeid,
+ .unique = unique,
+ .len = sizeof(*forget),
+ };
+ forget->arg = (struct fuse_forget_in){
+ .nlookup = link->forget_one.nlookup,
+ };
+
+ sg_init_one(&sg, forget, sizeof(*forget));
+
+ /* Enqueue the request */
+ spin_lock(&fsvq->lock);
+
+ if (!fsvq->connected) {
+ kfree(forget);
+ spin_unlock(&fsvq->lock);
+ goto out;
+ }
+
+ vq = fsvq->vq;
+ dev_dbg(&vq->vdev->dev, "%s\n", __func__);
+
+ ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOSPC) {
+ pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later.\n",
+ ret);
+ list_add_tail(&forget->list, &fsvq->queued_reqs);
+ schedule_delayed_work(&fsvq->dispatch_work,
+ msecs_to_jiffies(1));
+ } else {
+ pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
+ ret);
+ kfree(forget);
+ }
+ spin_unlock(&fsvq->lock);
+ goto out;
+ }
+
+ fsvq->in_flight++;
+ notify = virtqueue_kick_prepare(vq);
+
+ spin_unlock(&fsvq->lock);
+
+ if (notify)
+ virtqueue_notify(vq);
+out:
+ kfree(link);
+}
+
+static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+ /*
+ * TODO interrupts.
+ *
+ * Normal fs operations on a local filesystems aren't interruptible.
+ * Exceptions are blocking lock operations; for example fcntl(F_SETLKW)
+ * with shared lock between host and guest.
+ */
+ spin_unlock(&fiq->lock);
+}
+
+/* Return the number of scatter-gather list elements required */
+static unsigned int sg_count_fuse_req(struct fuse_req *req)
+{
+ struct fuse_args *args = req->args;
+ struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
+ unsigned int total_sgs = 1 /* fuse_in_header */;
+
+ if (args->in_numargs - args->in_pages)
+ total_sgs += 1;
+
+ if (args->in_pages)
+ total_sgs += ap->num_pages;
+
+ if (!test_bit(FR_ISREPLY, &req->flags))
+ return total_sgs;
+
+ total_sgs += 1 /* fuse_out_header */;
+
+ if (args->out_numargs - args->out_pages)
+ total_sgs += 1;
+
+ if (args->out_pages)
+ total_sgs += ap->num_pages;
+
+ return total_sgs;
+}
+
+/* Add pages to scatter-gather list and return number of elements used */
+static unsigned int sg_init_fuse_pages(struct scatterlist *sg,
+ struct page **pages,
+ struct fuse_page_desc *page_descs,
+ unsigned int num_pages,
+ unsigned int total_len)
+{
+ unsigned int i;
+ unsigned int this_len;
+
+ for (i = 0; i < num_pages && total_len; i++) {
+ sg_init_table(&sg[i], 1);
+ this_len = min(page_descs[i].length, total_len);
+ sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset);
+ total_len -= this_len;
+ }
+
+ return i;
+}
+
+/* Add args to scatter-gather list and return number of elements used */
+static unsigned int sg_init_fuse_args(struct scatterlist *sg,
+ struct fuse_req *req,
+ struct fuse_arg *args,
+ unsigned int numargs,
+ bool argpages,
+ void *argbuf,
+ unsigned int *len_used)
+{
+ struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args);
+ unsigned int total_sgs = 0;
+ unsigned int len;
+
+ len = fuse_len_args(numargs - argpages, args);
+ if (len)
+ sg_init_one(&sg[total_sgs++], argbuf, len);
+
+ if (argpages)
+ total_sgs += sg_init_fuse_pages(&sg[total_sgs],
+ ap->pages, ap->descs,
+ ap->num_pages,
+ args[numargs - 1].size);
+
+ if (len_used)
+ *len_used = len;
+
+ return total_sgs;
+}
+
+/* Add a request to a virtqueue and kick the device */
+static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
+ struct fuse_req *req)
+{
+ /* requests need at least 4 elements */
+ struct scatterlist *stack_sgs[6];
+ struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)];
+ struct scatterlist **sgs = stack_sgs;
+ struct scatterlist *sg = stack_sg;
+ struct virtqueue *vq;
+ struct fuse_args *args = req->args;
+ unsigned int argbuf_used = 0;
+ unsigned int out_sgs = 0;
+ unsigned int in_sgs = 0;
+ unsigned int total_sgs;
+ unsigned int i;
+ int ret;
+ bool notify;
+
+ /* Does the sglist fit on the stack? */
+ total_sgs = sg_count_fuse_req(req);
+ if (total_sgs > ARRAY_SIZE(stack_sgs)) {
+ sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC);
+ sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC);
+ if (!sgs || !sg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* Use a bounce buffer since stack args cannot be mapped */
+ ret = copy_args_to_argbuf(req);
+ if (ret < 0)
+ goto out;
+
+ /* Request elements */
+ sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h));
+ out_sgs += sg_init_fuse_args(&sg[out_sgs], req,
+ (struct fuse_arg *)args->in_args,
+ args->in_numargs, args->in_pages,
+ req->argbuf, &argbuf_used);
+
+ /* Reply elements */
+ if (test_bit(FR_ISREPLY, &req->flags)) {
+ sg_init_one(&sg[out_sgs + in_sgs++],
+ &req->out.h, sizeof(req->out.h));
+ in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req,
+ args->out_args, args->out_numargs,
+ args->out_pages,
+ req->argbuf + argbuf_used, NULL);
+ }
+
+ WARN_ON(out_sgs + in_sgs != total_sgs);
+
+ for (i = 0; i < total_sgs; i++)
+ sgs[i] = &sg[i];
+
+ spin_lock(&fsvq->lock);
+
+ if (!fsvq->connected) {
+ spin_unlock(&fsvq->lock);
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ vq = fsvq->vq;
+ ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC);
+ if (ret < 0) {
+ spin_unlock(&fsvq->lock);
+ goto out;
+ }
+
+ fsvq->in_flight++;
+ notify = virtqueue_kick_prepare(vq);
+
+ spin_unlock(&fsvq->lock);
+
+ if (notify)
+ virtqueue_notify(vq);
+
+out:
+ if (ret < 0 && req->argbuf) {
+ kfree(req->argbuf);
+ req->argbuf = NULL;
+ }
+ if (sgs != stack_sgs) {
+ kfree(sgs);
+ kfree(sg);
+ }
+
+ return ret;
+}
+
+static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+ unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
+ struct virtio_fs *fs;
+ struct fuse_conn *fc;
+ struct fuse_req *req;
+ struct fuse_pqueue *fpq;
+ int ret;
+
+ WARN_ON(list_empty(&fiq->pending));
+ req = list_last_entry(&fiq->pending, struct fuse_req, list);
+ clear_bit(FR_PENDING, &req->flags);
+ list_del_init(&req->list);
+ WARN_ON(!list_empty(&fiq->pending));
+ spin_unlock(&fiq->lock);
+
+ fs = fiq->priv;
+ fc = fs->vqs[queue_id].fud->fc;
+
+ pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
+ __func__, req->in.h.opcode, req->in.h.unique,
+ req->in.h.nodeid, req->in.h.len,
+ fuse_len_args(req->args->out_numargs, req->args->out_args));
+
+ fpq = &fs->vqs[queue_id].fud->pq;
+ spin_lock(&fpq->lock);
+ if (!fpq->connected) {
+ spin_unlock(&fpq->lock);
+ req->out.h.error = -ENODEV;
+ pr_err("virtio-fs: %s disconnected\n", __func__);
+ fuse_request_end(fc, req);
+ return;
+ }
+ list_add_tail(&req->list, fpq->processing);
+ spin_unlock(&fpq->lock);
+ set_bit(FR_SENT, &req->flags);
+ /* matches barrier in request_wait_answer() */
+ smp_mb__after_atomic();
+
+retry:
+ ret = virtio_fs_enqueue_req(&fs->vqs[queue_id], req);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOSPC) {
+ /* Virtqueue full. Retry submission */
+ /* TODO use completion instead of timeout */
+ usleep_range(20, 30);
+ goto retry;
+ }
+ req->out.h.error = ret;
+ pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret);
+ spin_lock(&fpq->lock);
+ clear_bit(FR_SENT, &req->flags);
+ list_del_init(&req->list);
+ spin_unlock(&fpq->lock);
+ fuse_request_end(fc, req);
+ return;
+ }
+}
+
+const static struct fuse_iqueue_ops virtio_fs_fiq_ops = {
+ .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock,
+ .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock,
+ .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock,
+ .release = virtio_fs_fiq_release,
+};
+
+static int virtio_fs_fill_super(struct super_block *sb)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct virtio_fs *fs = fc->iq.priv;
+ unsigned int i;
+ int err;
+ struct fuse_fs_context ctx = {
+ .rootmode = S_IFDIR,
+ .default_permissions = 1,
+ .allow_other = 1,
+ .max_read = UINT_MAX,
+ .blksize = 512,
+ .destroy = true,
+ .no_control = true,
+ .no_force_umount = true,
+ };
+
+ mutex_lock(&virtio_fs_mutex);
+
+ /* After holding mutex, make sure virtiofs device is still there.
+ * Though we are holding a reference to it, drive ->remove might
+ * still have cleaned up virtual queues. In that case bail out.
+ */
+ err = -EINVAL;
+ if (list_empty(&fs->list)) {
+ pr_info("virtio-fs: tag <%s> not found\n", fs->tag);
+ goto err;
+ }
+
+ err = -ENOMEM;
+ /* Allocate fuse_dev for hiprio and notification queues */
+ for (i = 0; i < VQ_REQUEST; i++) {
+ struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+ fsvq->fud = fuse_dev_alloc();
+ if (!fsvq->fud)
+ goto err_free_fuse_devs;
+ }
+
+ ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
+ err = fuse_fill_super_common(sb, &ctx);
+ if (err < 0)
+ goto err_free_fuse_devs;
+
+ fc = fs->vqs[VQ_REQUEST].fud->fc;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+ if (i == VQ_REQUEST)
+ continue; /* already initialized */
+ fuse_dev_install(fsvq->fud, fc);
+ }
+
+ /* Previous unmount will stop all queues. Start these again */
+ virtio_fs_start_all_queues(fs);
+ fuse_send_init(fc);
+ mutex_unlock(&virtio_fs_mutex);
+ return 0;
+
+err_free_fuse_devs:
+ virtio_fs_free_devs(fs);
+err:
+ mutex_unlock(&virtio_fs_mutex);
+ return err;
+}
+
+static void virtio_kill_sb(struct super_block *sb)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct virtio_fs *vfs;
+ struct virtio_fs_vq *fsvq;
+
+ /* If mount failed, we can still be called without any fc */
+ if (!fc)
+ return fuse_kill_sb_anon(sb);
+
+ vfs = fc->iq.priv;
+ fsvq = &vfs->vqs[VQ_HIPRIO];
+
+ /* Stop forget queue. Soon destroy will be sent */
+ spin_lock(&fsvq->lock);
+ fsvq->connected = false;
+ spin_unlock(&fsvq->lock);
+ virtio_fs_drain_all_queues(vfs);
+
+ fuse_kill_sb_anon(sb);
+
+ /* fuse_kill_sb_anon() must have sent destroy. Stop all queues
+ * and drain one more time and free fuse devices. Freeing fuse
+ * devices will drop their reference on fuse_conn and that in
+ * turn will drop its reference on virtio_fs object.
+ */
+ virtio_fs_stop_all_queues(vfs);
+ virtio_fs_drain_all_queues(vfs);
+ virtio_fs_free_devs(vfs);
+}
+
+static int virtio_fs_test_super(struct super_block *sb,
+ struct fs_context *fsc)
+{
+ struct fuse_conn *fc = fsc->s_fs_info;
+
+ return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv;
+}
+
+static int virtio_fs_set_super(struct super_block *sb,
+ struct fs_context *fsc)
+{
+ int err;
+
+ err = get_anon_bdev(&sb->s_dev);
+ if (!err)
+ fuse_conn_get(fsc->s_fs_info);
+
+ return err;
+}
+
+static int virtio_fs_get_tree(struct fs_context *fsc)
+{
+ struct virtio_fs *fs;
+ struct super_block *sb;
+ struct fuse_conn *fc;
+ int err;
+
+ /* This gets a reference on virtio_fs object. This ptr gets installed
+ * in fc->iq->priv. Once fuse_conn is going away, it calls ->put()
+ * to drop the reference to this object.
+ */
+ fs = virtio_fs_find_instance(fsc->source);
+ if (!fs) {
+ pr_info("virtio-fs: tag <%s> not found\n", fsc->source);
+ return -EINVAL;
+ }
+
+ fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL);
+ if (!fc) {
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_put(fs);
+ mutex_unlock(&virtio_fs_mutex);
+ return -ENOMEM;
+ }
+
+ fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops,
+ fs);
+ fc->release = fuse_free_conn;
+ fc->delete_stale = true;
+
+ fsc->s_fs_info = fc;
+ sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
+ fuse_conn_put(fc);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ if (!sb->s_root) {
+ err = virtio_fs_fill_super(sb);
+ if (err) {
+ deactivate_locked_super(sb);
+ return err;
+ }
+
+ sb->s_flags |= SB_ACTIVE;
+ }
+
+ WARN_ON(fsc->root);
+ fsc->root = dget(sb->s_root);
+ return 0;
+}
+
+static const struct fs_context_operations virtio_fs_context_ops = {
+ .get_tree = virtio_fs_get_tree,
+};
+
+static int virtio_fs_init_fs_context(struct fs_context *fsc)
+{
+ fsc->ops = &virtio_fs_context_ops;
+ return 0;
+}
+
+static struct file_system_type virtio_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "virtiofs",
+ .init_fs_context = virtio_fs_init_fs_context,
+ .kill_sb = virtio_kill_sb,
+};
+
+static int __init virtio_fs_init(void)
+{
+ int ret;
+
+ ret = register_virtio_driver(&virtio_fs_driver);
+ if (ret < 0)
+ return ret;
+
+ ret = register_filesystem(&virtio_fs_type);
+ if (ret < 0) {
+ unregister_virtio_driver(&virtio_fs_driver);
+ return ret;
+ }
+
+ return 0;
+}
+module_init(virtio_fs_init);
+
+static void __exit virtio_fs_exit(void)
+{
+ unregister_filesystem(&virtio_fs_type);
+ unregister_virtio_driver(&virtio_fs_driver);
+}
+module_exit(virtio_fs_exit);
+
+MODULE_AUTHOR("Stefan Hajnoczi <stefanha@redhat.com>");
+MODULE_DESCRIPTION("Virtio Filesystem");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_FS(KBUILD_MODNAME);
+MODULE_DEVICE_TABLE(virtio, id_table);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1d03afd74368..67dbe0201e0d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2799,6 +2799,38 @@ out:
return submit;
}
+struct io_wait_queue {
+ struct wait_queue_entry wq;
+ struct io_ring_ctx *ctx;
+ unsigned to_wait;
+ unsigned nr_timeouts;
+};
+
+static inline bool io_should_wake(struct io_wait_queue *iowq)
+{
+ struct io_ring_ctx *ctx = iowq->ctx;
+
+ /*
+ * Wake up if we have enough events, or if a timeout occured since we
+ * started waiting. For timeouts, we always want to return to userspace,
+ * regardless of event count.
+ */
+ return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+ atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+}
+
+static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+ int wake_flags, void *key)
+{
+ struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
+ wq);
+
+ if (!io_should_wake(iowq))
+ return -1;
+
+ return autoremove_wake_function(curr, mode, wake_flags, key);
+}
+
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
@@ -2806,8 +2838,16 @@ out:
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
const sigset_t __user *sig, size_t sigsz)
{
+ struct io_wait_queue iowq = {
+ .wq = {
+ .private = current,
+ .func = io_wake_function,
+ .entry = LIST_HEAD_INIT(iowq.wq.entry),
+ },
+ .ctx = ctx,
+ .to_wait = min_events,
+ };
struct io_rings *rings = ctx->rings;
- unsigned nr_timeouts;
int ret;
if (io_cqring_events(rings) >= min_events)
@@ -2826,15 +2866,21 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
- nr_timeouts = atomic_read(&ctx->cq_timeouts);
- /*
- * Return if we have enough events, or if a timeout occured since
- * we started waiting. For timeouts, we always want to return to
- * userspace.
- */
- ret = wait_event_interruptible(ctx->wait,
- io_cqring_events(rings) >= min_events ||
- atomic_read(&ctx->cq_timeouts) != nr_timeouts);
+ ret = 0;
+ iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+ do {
+ prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
+ TASK_INTERRUPTIBLE);
+ if (io_should_wake(&iowq))
+ break;
+ schedule();
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ } while (1);
+ finish_wait(&ctx->wait, &iowq.wq);
+
restore_saved_sigmask_unless(ret == -ERESTARTSYS);
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -3490,7 +3536,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
ctx->rings->sq_ring_entries)
mask |= EPOLLOUT | EPOLLWRNORM;
- if (READ_ONCE(ctx->rings->sq.head) != ctx->cached_cq_tail)
+ if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index cbe70637c117..0e6406c4f362 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -163,13 +163,11 @@ static const struct export_operations jffs2_export_ops = {
* Opt_rp_size: size of reserved pool in KiB
*/
enum {
- Opt_source,
Opt_override_compr,
Opt_rp_size,
};
static const struct fs_parameter_spec jffs2_param_specs[] = {
- fsparam_string ("source", Opt_source),
fsparam_enum ("compr", Opt_override_compr),
fsparam_u32 ("rp_size", Opt_rp_size),
{}
diff --git a/fs/libfs.c b/fs/libfs.c
index c9b2850c0f7c..1463b038ffc4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -89,58 +89,45 @@ int dcache_dir_close(struct inode *inode, struct file *file)
EXPORT_SYMBOL(dcache_dir_close);
/* parent is locked at least shared */
-static struct dentry *next_positive(struct dentry *parent,
- struct list_head *from,
- int count)
+/*
+ * Returns an element of siblings' list.
+ * We are looking for <count>th positive after <p>; if
+ * found, dentry is grabbed and returned to caller.
+ * If no such element exists, NULL is returned.
+ */
+static struct dentry *scan_positives(struct dentry *cursor,
+ struct list_head *p,
+ loff_t count,
+ struct dentry *last)
{
- unsigned *seq = &parent->d_inode->i_dir_seq, n;
- struct dentry *res;
- struct list_head *p;
- bool skipped;
- int i;
+ struct dentry *dentry = cursor->d_parent, *found = NULL;
-retry:
- i = count;
- skipped = false;
- n = smp_load_acquire(seq) & ~1;
- res = NULL;
- rcu_read_lock();
- for (p = from->next; p != &parent->d_subdirs; p = p->next) {
+ spin_lock(&dentry->d_lock);
+ while ((p = p->next) != &dentry->d_subdirs) {
struct dentry *d = list_entry(p, struct dentry, d_child);
- if (!simple_positive(d)) {
- skipped = true;
- } else if (!--i) {
- res = d;
- break;
+ // we must at least skip cursors, to avoid livelocks
+ if (d->d_flags & DCACHE_DENTRY_CURSOR)
+ continue;
+ if (simple_positive(d) && !--count) {
+ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+ if (simple_positive(d))
+ found = dget_dlock(d);
+ spin_unlock(&d->d_lock);
+ if (likely(found))
+ break;
+ count = 1;
+ }
+ if (need_resched()) {
+ list_move(&cursor->d_child, p);
+ p = &cursor->d_child;
+ spin_unlock(&dentry->d_lock);
+ cond_resched();
+ spin_lock(&dentry->d_lock);
}
}
- rcu_read_unlock();
- if (skipped) {
- smp_rmb();
- if (unlikely(*seq != n))
- goto retry;
- }
- return res;
-}
-
-static void move_cursor(struct dentry *cursor, struct list_head *after)
-{
- struct dentry *parent = cursor->d_parent;
- unsigned n, *seq = &parent->d_inode->i_dir_seq;
- spin_lock(&parent->d_lock);
- for (;;) {
- n = *seq;
- if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
- break;
- cpu_relax();
- }
- __list_del(cursor->d_child.prev, cursor->d_child.next);
- if (after)
- list_add(&cursor->d_child, after);
- else
- list_add_tail(&cursor->d_child, &parent->d_subdirs);
- smp_store_release(seq, n + 2);
- spin_unlock(&parent->d_lock);
+ spin_unlock(&dentry->d_lock);
+ dput(last);
+ return found;
}
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
@@ -158,17 +145,25 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
if (offset != file->f_pos) {
+ struct dentry *cursor = file->private_data;
+ struct dentry *to = NULL;
+
+ inode_lock_shared(dentry->d_inode);
+
+ if (offset > 2)
+ to = scan_positives(cursor, &dentry->d_subdirs,
+ offset - 2, NULL);
+ spin_lock(&dentry->d_lock);
+ if (to)
+ list_move(&cursor->d_child, &to->d_child);
+ else
+ list_del_init(&cursor->d_child);
+ spin_unlock(&dentry->d_lock);
+ dput(to);
+
file->f_pos = offset;
- if (file->f_pos >= 2) {
- struct dentry *cursor = file->private_data;
- struct dentry *to;
- loff_t n = file->f_pos - 2;
-
- inode_lock_shared(dentry->d_inode);
- to = next_positive(dentry, &dentry->d_subdirs, n);
- move_cursor(cursor, to ? &to->d_child : NULL);
- inode_unlock_shared(dentry->d_inode);
- }
+
+ inode_unlock_shared(dentry->d_inode);
}
return offset;
}
@@ -190,25 +185,35 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct dentry *cursor = file->private_data;
- struct list_head *p = &cursor->d_child;
- struct dentry *next;
- bool moved = false;
+ struct list_head *anchor = &dentry->d_subdirs;
+ struct dentry *next = NULL;
+ struct list_head *p;
if (!dir_emit_dots(file, ctx))
return 0;
if (ctx->pos == 2)
- p = &dentry->d_subdirs;
- while ((next = next_positive(dentry, p, 1)) != NULL) {
+ p = anchor;
+ else if (!list_empty(&cursor->d_child))
+ p = &cursor->d_child;
+ else
+ return 0;
+
+ while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
d_inode(next)->i_ino, dt_type(d_inode(next))))
break;
- moved = true;
- p = &next->d_child;
ctx->pos++;
+ p = &next->d_child;
}
- if (moved)
- move_cursor(cursor, p);
+ spin_lock(&dentry->d_lock);
+ if (next)
+ list_move_tail(&cursor->d_child, &next->d_child);
+ else
+ list_del_init(&cursor->d_child);
+ spin_unlock(&dentry->d_lock);
+ dput(next);
+
return 0;
}
EXPORT_SYMBOL(dcache_readdir);
@@ -468,8 +473,7 @@ EXPORT_SYMBOL(simple_write_begin);
/**
* simple_write_end - .write_end helper for non-block-device FSes
- * @available: See .write_end of address_space_operations
- * @file: "
+ * @file: See .write_end of address_space_operations
* @mapping: "
* @pos: "
* @len: "
diff --git a/fs/locks.c b/fs/locks.c
index a364ebc5cec3..6970f55daf54 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -212,6 +212,7 @@ struct file_lock_list_struct {
static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);
+
/*
* The blocked_hash is used to find POSIX lock loops for deadlock detection.
* It is protected by blocked_lock_lock.
@@ -1991,6 +1992,64 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
}
EXPORT_SYMBOL(generic_setlease);
+#if IS_ENABLED(CONFIG_SRCU)
+/*
+ * Kernel subsystems can register to be notified on any attempt to set
+ * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd
+ * to close files that it may have cached when there is an attempt to set a
+ * conflicting lease.
+ */
+static struct srcu_notifier_head lease_notifier_chain;
+
+static inline void
+lease_notifier_chain_init(void)
+{
+ srcu_init_notifier_head(&lease_notifier_chain);
+}
+
+static inline void
+setlease_notifier(long arg, struct file_lock *lease)
+{
+ if (arg != F_UNLCK)
+ srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
+}
+
+int lease_register_notifier(struct notifier_block *nb)
+{
+ return srcu_notifier_chain_register(&lease_notifier_chain, nb);
+}
+EXPORT_SYMBOL_GPL(lease_register_notifier);
+
+void lease_unregister_notifier(struct notifier_block *nb)
+{
+ srcu_notifier_chain_unregister(&lease_notifier_chain, nb);
+}
+EXPORT_SYMBOL_GPL(lease_unregister_notifier);
+
+#else /* !IS_ENABLED(CONFIG_SRCU) */
+static inline void
+lease_notifier_chain_init(void)
+{
+}
+
+static inline void
+setlease_notifier(long arg, struct file_lock *lease)
+{
+}
+
+int lease_register_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(lease_register_notifier);
+
+void lease_unregister_notifier(struct notifier_block *nb)
+{
+}
+EXPORT_SYMBOL_GPL(lease_unregister_notifier);
+
+#endif /* IS_ENABLED(CONFIG_SRCU) */
+
/**
* vfs_setlease - sets a lease on an open file
* @filp: file pointer
@@ -2011,6 +2070,8 @@ EXPORT_SYMBOL(generic_setlease);
int
vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
{
+ if (lease)
+ setlease_notifier(arg, *lease);
if (filp->f_op->setlease)
return filp->f_op->setlease(filp, arg, lease, priv);
else
@@ -2924,6 +2985,7 @@ static int __init filelock_init(void)
INIT_HLIST_HEAD(&fll->hlist);
}
+ lease_notifier_chain_init();
return 0;
}
core_initcall(filelock_init);
diff --git a/fs/namespace.c b/fs/namespace.c
index b75d458f817d..fe0e9e1410fe 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3026,7 +3026,7 @@ void *copy_mount_options(const void __user * data)
* the remainder of the page.
*/
/* copy_from_user cannot cross TASK_SIZE ! */
- size = TASK_SIZE - (unsigned long)data;
+ size = TASK_SIZE - (unsigned long)untagged_addr(data);
if (size > PAGE_SIZE)
size = PAGE_SIZE;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0adfd8840110..e180033e35cf 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1669,10 +1669,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
#endif /* CONFIG_NFSV4 */
-/*
- * Code common to create, mkdir, and mknod.
- */
-int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+struct dentry *
+nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
struct nfs_fattr *fattr,
struct nfs4_label *label)
{
@@ -1680,13 +1678,10 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
struct inode *dir = d_inode(parent);
struct inode *inode;
struct dentry *d;
- int error = -EACCES;
+ int error;
d_drop(dentry);
- /* We may have been initialized further down */
- if (d_really_is_positive(dentry))
- goto out;
if (fhandle->size == 0) {
error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
if (error)
@@ -1702,18 +1697,32 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
}
inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
d = d_splice_alias(inode, dentry);
- if (IS_ERR(d)) {
- error = PTR_ERR(d);
- goto out_error;
- }
- dput(d);
out:
dput(parent);
- return 0;
+ return d;
out_error:
nfs_mark_for_revalidate(dir);
- dput(parent);
- return error;
+ d = ERR_PTR(error);
+ goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_add_or_obtain);
+
+/*
+ * Code common to create, mkdir, and mknod.
+ */
+int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr,
+ struct nfs4_label *label)
+{
+ struct dentry *d;
+
+ d = nfs_add_or_obtain(dentry, fhandle, fattr, label);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+
+ /* Callers don't care */
+ dput(d);
+ return 0;
}
EXPORT_SYMBOL_GPL(nfs_instantiate);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 222d7115db71..040a50fd9bf3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -64,13 +64,6 @@
static struct kmem_cache *nfs_direct_cachep;
-/*
- * This represents a set of asynchronous requests that we're waiting on
- */
-struct nfs_direct_mirror {
- ssize_t count;
-};
-
struct nfs_direct_req {
struct kref kref; /* release manager */
@@ -84,9 +77,6 @@ struct nfs_direct_req {
atomic_t io_count; /* i/os we're waiting for */
spinlock_t lock; /* protect completion state */
- struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
- int mirror_count;
-
loff_t io_start; /* Start offset for I/O */
ssize_t count, /* bytes actually processed */
max_count, /* max expected count */
@@ -123,32 +113,42 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
}
static void
-nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
+nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
+ const struct nfs_pgio_header *hdr,
+ ssize_t dreq_len)
{
- int i;
- ssize_t count;
+ if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
+ test_bit(NFS_IOHDR_EOF, &hdr->flags)))
+ return;
+ if (dreq->max_count >= dreq_len) {
+ dreq->max_count = dreq_len;
+ if (dreq->count > dreq_len)
+ dreq->count = dreq_len;
+
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
+ dreq->error = hdr->error;
+ else /* Clear outstanding error if this is EOF */
+ dreq->error = 0;
+ }
+}
- WARN_ON_ONCE(dreq->count >= dreq->max_count);
+static void
+nfs_direct_count_bytes(struct nfs_direct_req *dreq,
+ const struct nfs_pgio_header *hdr)
+{
+ loff_t hdr_end = hdr->io_start + hdr->good_bytes;
+ ssize_t dreq_len = 0;
- if (dreq->mirror_count == 1) {
- dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
- dreq->count += hdr->good_bytes;
- } else {
- /* mirrored writes */
- count = dreq->mirrors[hdr->pgio_mirror_idx].count;
- if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
- count = hdr->io_start + hdr->good_bytes - dreq->io_start;
- dreq->mirrors[hdr->pgio_mirror_idx].count = count;
- }
- /* update the dreq->count by finding the minimum agreed count from all
- * mirrors */
- count = dreq->mirrors[0].count;
+ if (hdr_end > dreq->io_start)
+ dreq_len = hdr_end - dreq->io_start;
- for (i = 1; i < dreq->mirror_count; i++)
- count = min(count, dreq->mirrors[i].count);
+ nfs_direct_handle_truncated(dreq, hdr, dreq_len);
- dreq->count = count;
- }
+ if (dreq_len > dreq->max_count)
+ dreq_len = dreq->max_count;
+
+ if (dreq->count < dreq_len)
+ dreq->count = dreq_len;
}
/*
@@ -293,18 +293,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}
-static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
- struct nfs_pageio_descriptor *pgio,
- struct nfs_page *req)
-{
- int mirror_count = 1;
-
- if (pgio->pg_ops->pg_get_mirror_count)
- mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
-
- dreq->mirror_count = mirror_count;
-}
-
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
struct nfs_direct_req *dreq;
@@ -319,7 +307,6 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
INIT_LIST_HEAD(&dreq->mds_cinfo.list);
dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
- dreq->mirror_count = 1;
spin_lock_init(&dreq->lock);
return dreq;
@@ -402,20 +389,12 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
struct nfs_direct_req *dreq = hdr->dreq;
spin_lock(&dreq->lock);
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
- dreq->error = hdr->error;
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
spin_unlock(&dreq->lock);
goto out_put;
}
- if (hdr->good_bytes != 0)
- nfs_direct_good_bytes(dreq, hdr);
-
- if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
- dreq->error = 0;
-
+ nfs_direct_count_bytes(dreq, hdr);
spin_unlock(&dreq->lock);
while (!list_empty(&hdr->pages)) {
@@ -646,29 +625,22 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
LIST_HEAD(reqs);
struct nfs_commit_info cinfo;
LIST_HEAD(failed);
- int i;
nfs_init_cinfo_from_dreq(&cinfo, dreq);
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
dreq->count = 0;
+ dreq->max_count = 0;
+ list_for_each_entry(req, &reqs, wb_list)
+ dreq->max_count += req->wb_bytes;
dreq->verf.committed = NFS_INVALID_STABLE_HOW;
nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
- for (i = 0; i < dreq->mirror_count; i++)
- dreq->mirrors[i].count = 0;
get_dreq(dreq);
nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
- req = nfs_list_entry(reqs.next);
- nfs_direct_setup_mirroring(dreq, &desc, req);
- if (desc.pg_error < 0) {
- list_splice_init(&reqs, &failed);
- goto out_failed;
- }
-
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
/* Bump the transmission count */
req->wb_nio++;
@@ -686,7 +658,6 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
}
nfs_pageio_complete(&desc);
-out_failed:
while (!list_empty(&failed)) {
req = nfs_list_entry(failed.next);
nfs_list_remove_request(req);
@@ -791,17 +762,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
nfs_init_cinfo_from_dreq(&cinfo, dreq);
spin_lock(&dreq->lock);
-
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
- dreq->error = hdr->error;
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
spin_unlock(&dreq->lock);
goto out_put;
}
+ nfs_direct_count_bytes(dreq, hdr);
if (hdr->good_bytes != 0) {
- nfs_direct_good_bytes(dreq, hdr);
if (nfs_write_need_commit(hdr)) {
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
request_commit = true;
@@ -923,7 +890,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
break;
}
- nfs_direct_setup_mirroring(dreq, &desc, req);
if (desc.pg_error < 0) {
nfs_free_request(req);
result = desc.pg_error;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 3cb073c50fa6..c9b605f6c9cb 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES",
.owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTGET_ON_OPEN,
.max_layoutget_response = 4096, /* 1 page or so... */
.alloc_layout_hdr = filelayout_alloc_layout_hdr,
.free_layout_hdr = filelayout_free_layout_hdr,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e64f810223be..447a3c17fa8e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -16,14 +16,6 @@ extern const struct export_operations nfs_export_ops;
struct nfs_string;
-/* Maximum number of readahead requests
- * FIXME: this should really be a sysctl so that users may tune it to suit
- * their needs. People that do NFS over a slow network, might for
- * instance want to reduce it to something closer to 1 for improved
- * interactive response.
- */
-#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
-
static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
{
if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index a3ad2d46fd42..9eb2f1a503ab 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -279,15 +279,17 @@ static struct nfs3_createdata *nfs3_alloc_createdata(void)
return data;
}
-static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
+static struct dentry *
+nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
{
int status;
status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
nfs_post_op_update_inode(dir, data->res.dir_attr);
- if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
- return status;
+ if (status != 0)
+ return ERR_PTR(status);
+
+ return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL);
}
static void nfs3_free_createdata(struct nfs3_createdata *data)
@@ -304,6 +306,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
{
struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
+ struct dentry *d_alias;
int status = -ENOMEM;
dprintk("NFS call create %pd\n", dentry);
@@ -330,7 +333,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
goto out;
for (;;) {
- status = nfs3_do_create(dir, dentry, data);
+ d_alias = nfs3_do_create(dir, dentry, data);
+ status = PTR_ERR_OR_ZERO(d_alias);
if (status != -ENOTSUPP)
break;
@@ -355,6 +359,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
if (status != 0)
goto out_release_acls;
+ if (d_alias)
+ dentry = d_alias;
+
/* When we created the file with exclusive semantics, make
* sure we set the attributes afterwards. */
if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
@@ -372,11 +379,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
nfs_post_op_update_inode(d_inode(dentry), data->res.fattr);
dprintk("NFS reply setattr (post-create): %d\n", status);
if (status != 0)
- goto out_release_acls;
+ goto out_dput;
}
status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+out_dput:
+ dput(d_alias);
out_release_acls:
posix_acl_release(acl);
posix_acl_release(default_acl);
@@ -504,6 +513,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
unsigned int len, struct iattr *sattr)
{
struct nfs3_createdata *data;
+ struct dentry *d_alias;
int status = -ENOMEM;
if (len > NFS3_MAXPATHLEN)
@@ -522,7 +532,11 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
data->arg.symlink.pathlen = len;
data->arg.symlink.sattr = sattr;
- status = nfs3_do_create(dir, dentry, data);
+ d_alias = nfs3_do_create(dir, dentry, data);
+ status = PTR_ERR_OR_ZERO(d_alias);
+
+ if (status == 0)
+ dput(d_alias);
nfs3_free_createdata(data);
out:
@@ -535,6 +549,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
+ struct dentry *d_alias;
int status = -ENOMEM;
dprintk("NFS call mkdir %pd\n", dentry);
@@ -553,12 +568,18 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
data->arg.mkdir.len = dentry->d_name.len;
data->arg.mkdir.sattr = sattr;
- status = nfs3_do_create(dir, dentry, data);
+ d_alias = nfs3_do_create(dir, dentry, data);
+ status = PTR_ERR_OR_ZERO(d_alias);
+
if (status != 0)
goto out_release_acls;
+ if (d_alias)
+ dentry = d_alias;
+
status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+ dput(d_alias);
out_release_acls:
posix_acl_release(acl);
posix_acl_release(default_acl);
@@ -660,6 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
{
struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
+ struct dentry *d_alias;
int status = -ENOMEM;
dprintk("NFS call mknod %pd %u:%u\n", dentry,
@@ -698,12 +720,17 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
goto out;
}
- status = nfs3_do_create(dir, dentry, data);
+ d_alias = nfs3_do_create(dir, dentry, data);
+ status = PTR_ERR_OR_ZERO(d_alias);
if (status != 0)
goto out_release_acls;
+ if (d_alias)
+ dentry = d_alias;
+
status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+ dput(d_alias);
out_release_acls:
posix_acl_release(acl);
posix_acl_release(default_acl);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3564da1ba8a1..16b2e5cc3e94 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -491,8 +491,6 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
const struct nfs_lock_context *, nfs4_stateid *,
const struct cred **);
-extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst,
- struct nfs4_state *state);
extern bool nfs4_copy_open_stateid(nfs4_stateid *dst,
struct nfs4_state *state);
@@ -574,6 +572,15 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat
return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
}
+static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1)
+{
+ u32 seqid = be32_to_cpu(s1->seqid);
+
+ if (++seqid == 0)
+ ++seqid;
+ s1->seqid = cpu_to_be32(seqid);
+}
+
static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
{
return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1406858bae6c..ab8ca20fd579 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1073,14 +1073,26 @@ static const struct rpc_call_ops nfs40_call_sync_ops = {
.rpc_call_done = nfs40_call_sync_done,
};
+static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup)
+{
+ int ret;
+ struct rpc_task *task;
+
+ task = rpc_run_task(task_setup);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ ret = task->tk_status;
+ rpc_put_task(task);
+ return ret;
+}
+
static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
struct nfs_server *server,
struct rpc_message *msg,
struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res)
{
- int ret;
- struct rpc_task *task;
struct nfs_client *clp = server->nfs_client;
struct nfs4_call_sync_data data = {
.seq_server = server,
@@ -1094,14 +1106,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
.callback_data = &data
};
- task = rpc_run_task(&task_setup);
- if (IS_ERR(task))
- ret = PTR_ERR(task);
- else {
- ret = task->tk_status;
- rpc_put_task(task);
- }
- return ret;
+ return nfs4_call_sync_custom(&task_setup);
}
int nfs4_call_sync(struct rpc_clnt *clnt,
@@ -3308,6 +3313,75 @@ nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task)
return pnfs_wait_on_layoutreturn(inode, task);
}
+/*
+ * Update the seqid of an open stateid
+ */
+static void nfs4_sync_open_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state)
+{
+ __be32 seqid_open;
+ u32 dst_seqid;
+ int seq;
+
+ for (;;) {
+ if (!nfs4_valid_open_stateid(state))
+ break;
+ seq = read_seqbegin(&state->seqlock);
+ if (!nfs4_state_match_open_stateid_other(state, dst)) {
+ nfs4_stateid_copy(dst, &state->open_stateid);
+ if (read_seqretry(&state->seqlock, seq))
+ continue;
+ break;
+ }
+ seqid_open = state->open_stateid.seqid;
+ if (read_seqretry(&state->seqlock, seq))
+ continue;
+
+ dst_seqid = be32_to_cpu(dst->seqid);
+ if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0)
+ dst->seqid = seqid_open;
+ break;
+ }
+}
+
+/*
+ * Update the seqid of an open stateid after receiving
+ * NFS4ERR_OLD_STATEID
+ */
+static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
+ struct nfs4_state *state)
+{
+ __be32 seqid_open;
+ u32 dst_seqid;
+ bool ret;
+ int seq;
+
+ for (;;) {
+ ret = false;
+ if (!nfs4_valid_open_stateid(state))
+ break;
+ seq = read_seqbegin(&state->seqlock);
+ if (!nfs4_state_match_open_stateid_other(state, dst)) {
+ if (read_seqretry(&state->seqlock, seq))
+ continue;
+ break;
+ }
+ seqid_open = state->open_stateid.seqid;
+ if (read_seqretry(&state->seqlock, seq))
+ continue;
+
+ dst_seqid = be32_to_cpu(dst->seqid);
+ if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0)
+ dst->seqid = cpu_to_be32(dst_seqid + 1);
+ else
+ dst->seqid = seqid_open;
+ ret = true;
+ break;
+ }
+
+ return ret;
+}
+
struct nfs4_closedata {
struct inode *inode;
struct nfs4_state *state;
@@ -3358,32 +3432,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
/* Handle Layoutreturn errors */
- if (calldata->arg.lr_args && task->tk_status != 0) {
- switch (calldata->res.lr_ret) {
- default:
- calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
- break;
- case 0:
- calldata->arg.lr_args = NULL;
- calldata->res.lr_res = NULL;
- break;
- case -NFS4ERR_OLD_STATEID:
- if (nfs4_layoutreturn_refresh_stateid(&calldata->arg.lr_args->stateid,
- &calldata->arg.lr_args->range,
- calldata->inode))
- goto lr_restart;
- /* Fallthrough */
- case -NFS4ERR_ADMIN_REVOKED:
- case -NFS4ERR_DELEG_REVOKED:
- case -NFS4ERR_EXPIRED:
- case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
- case -NFS4ERR_WRONG_CRED:
- calldata->arg.lr_args = NULL;
- calldata->res.lr_res = NULL;
- goto lr_restart;
- }
- }
+ if (pnfs_roc_done(task, calldata->inode,
+ &calldata->arg.lr_args,
+ &calldata->res.lr_res,
+ &calldata->res.lr_ret) == -EAGAIN)
+ goto out_restart;
/* hmm. we are done with the inode, and in the process of freeing
* the state_owner. we keep this around to process errors
@@ -3403,7 +3456,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
break;
case -NFS4ERR_OLD_STATEID:
/* Did we race with OPEN? */
- if (nfs4_refresh_open_stateid(&calldata->arg.stateid,
+ if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid,
state))
goto out_restart;
goto out_release;
@@ -3415,7 +3468,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
task->tk_msg.rpc_cred);
/* Fallthrough */
case -NFS4ERR_BAD_STATEID:
- break;
+ if (calldata->arg.fmode == 0)
+ break;
+ /* Fallthrough */
default:
task->tk_status = nfs4_async_handle_exception(task,
server, task->tk_status, &exception);
@@ -3430,8 +3485,6 @@ out_release:
nfs_refresh_inode(calldata->inode, &calldata->fattr);
dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
return;
-lr_restart:
- calldata->res.lr_ret = 0;
out_restart:
task->tk_status = 0;
rpc_restart_call_prepare(task);
@@ -3472,8 +3525,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
} else if (is_rdwr)
calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
- if (!nfs4_valid_open_stateid(state) ||
- !nfs4_refresh_open_stateid(&calldata->arg.stateid, state))
+ nfs4_sync_open_stateid(&calldata->arg.stateid, state);
+ if (!nfs4_valid_open_stateid(state))
call_close = 0;
spin_unlock(&state->owner->so_lock);
@@ -6018,7 +6071,6 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
.rpc_resp = res,
.rpc_cred = cred,
};
- struct rpc_task *task;
struct rpc_task_setup task_setup_data = {
.rpc_client = clp->cl_rpcclient,
.rpc_message = &msg,
@@ -6051,17 +6103,13 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
dprintk("NFS call setclientid auth=%s, '%s'\n",
clp->cl_rpcclient->cl_auth->au_ops->au_name,
clp->cl_owner_id);
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task)) {
- status = PTR_ERR(task);
- goto out;
- }
- status = task->tk_status;
+
+ status = nfs4_call_sync_custom(&task_setup_data);
if (setclientid.sc_cred) {
+ kfree(clp->cl_acceptor);
clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
put_rpccred(setclientid.sc_cred);
}
- rpc_put_task(task);
out:
trace_nfs4_setclientid(clp, status);
dprintk("NFS reply setclientid: %d\n", status);
@@ -6129,32 +6177,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
/* Handle Layoutreturn errors */
- if (data->args.lr_args && task->tk_status != 0) {
- switch(data->res.lr_ret) {
- default:
- data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
- break;
- case 0:
- data->args.lr_args = NULL;
- data->res.lr_res = NULL;
- break;
- case -NFS4ERR_OLD_STATEID:
- if (nfs4_layoutreturn_refresh_stateid(&data->args.lr_args->stateid,
- &data->args.lr_args->range,
- data->inode))
- goto lr_restart;
- /* Fallthrough */
- case -NFS4ERR_ADMIN_REVOKED:
- case -NFS4ERR_DELEG_REVOKED:
- case -NFS4ERR_EXPIRED:
- case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
- case -NFS4ERR_WRONG_CRED:
- data->args.lr_args = NULL;
- data->res.lr_res = NULL;
- goto lr_restart;
- }
- }
+ if (pnfs_roc_done(task, data->inode,
+ &data->args.lr_args,
+ &data->res.lr_res,
+ &data->res.lr_ret) == -EAGAIN)
+ goto out_restart;
switch (task->tk_status) {
case 0:
@@ -6192,8 +6219,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
}
data->rpc_status = task->tk_status;
return;
-lr_restart:
- data->res.lr_ret = 0;
out_restart:
task->tk_status = 0;
rpc_restart_call_prepare(task);
@@ -6386,6 +6411,42 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
return err;
}
+/*
+ * Update the seqid of a lock stateid after receiving
+ * NFS4ERR_OLD_STATEID
+ */
+static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst,
+ struct nfs4_lock_state *lsp)
+{
+ struct nfs4_state *state = lsp->ls_state;
+ bool ret = false;
+
+ spin_lock(&state->state_lock);
+ if (!nfs4_stateid_match_other(dst, &lsp->ls_stateid))
+ goto out;
+ if (!nfs4_stateid_is_newer(&lsp->ls_stateid, dst))
+ nfs4_stateid_seqid_inc(dst);
+ else
+ dst->seqid = lsp->ls_stateid.seqid;
+ ret = true;
+out:
+ spin_unlock(&state->state_lock);
+ return ret;
+}
+
+static bool nfs4_sync_lock_stateid(nfs4_stateid *dst,
+ struct nfs4_lock_state *lsp)
+{
+ struct nfs4_state *state = lsp->ls_state;
+ bool ret;
+
+ spin_lock(&state->state_lock);
+ ret = !nfs4_stateid_match_other(dst, &lsp->ls_stateid);
+ nfs4_stateid_copy(dst, &lsp->ls_stateid);
+ spin_unlock(&state->state_lock);
+ return ret;
+}
+
struct nfs4_unlockdata {
struct nfs_locku_args arg;
struct nfs_locku_res res;
@@ -6403,7 +6464,8 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
struct nfs_seqid *seqid)
{
struct nfs4_unlockdata *p;
- struct inode *inode = lsp->ls_state->inode;
+ struct nfs4_state *state = lsp->ls_state;
+ struct inode *inode = state->inode;
p = kzalloc(sizeof(*p), GFP_NOFS);
if (p == NULL)
@@ -6419,6 +6481,9 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
locks_init_lock(&p->fl);
locks_copy_lock(&p->fl, fl);
p->server = NFS_SERVER(inode);
+ spin_lock(&state->state_lock);
+ nfs4_stateid_copy(&p->arg.stateid, &lsp->ls_stateid);
+ spin_unlock(&state->state_lock);
return p;
}
@@ -6457,10 +6522,14 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
task->tk_msg.rpc_cred);
/* Fall through */
case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_STALE_STATEID:
- if (!nfs4_stateid_match(&calldata->arg.stateid,
- &calldata->lsp->ls_stateid))
+ if (nfs4_sync_lock_stateid(&calldata->arg.stateid,
+ calldata->lsp))
+ rpc_restart_call_prepare(task);
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid,
+ calldata->lsp))
rpc_restart_call_prepare(task);
break;
default:
@@ -6483,7 +6552,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
goto out_wait;
- nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
/* Note: exit _without_ running nfs4_locku_done */
goto out_no_action;
@@ -7645,6 +7713,8 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred)
static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity)
{
int status;
+ struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
+ struct nfs_client *clp = NFS_SERVER(dir)->nfs_client;
struct nfs4_secinfo_arg args = {
.dir_fh = NFS_FH(dir),
.name = name,
@@ -7657,26 +7727,37 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
.rpc_argp = &args,
.rpc_resp = &res,
};
- struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
+ struct nfs4_call_sync_data data = {
+ .seq_server = NFS_SERVER(dir),
+ .seq_args = &args.seq_args,
+ .seq_res = &res.seq_res,
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_client = clnt,
+ .rpc_message = &msg,
+ .callback_ops = clp->cl_mvops->call_sync_ops,
+ .callback_data = &data,
+ .flags = RPC_TASK_NO_ROUND_ROBIN,
+ };
const struct cred *cred = NULL;
if (use_integrity) {
- clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient;
- cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client);
+ clnt = clp->cl_rpcclient;
+ task_setup.rpc_client = clnt;
+
+ cred = nfs4_get_clid_cred(clp);
msg.rpc_cred = cred;
}
dprintk("NFS call secinfo %s\n", name->name);
- nfs4_state_protect(NFS_SERVER(dir)->nfs_client,
- NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
+ nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+ status = nfs4_call_sync_custom(&task_setup);
- status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args,
- &res.seq_res, RPC_TASK_NO_ROUND_ROBIN);
dprintk("NFS reply secinfo: %d\n", status);
put_cred(cred);
-
return status;
}
@@ -8344,7 +8425,6 @@ static const struct rpc_call_ops nfs4_get_lease_time_ops = {
int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
{
- struct rpc_task *task;
struct nfs4_get_lease_time_args args;
struct nfs4_get_lease_time_res res = {
.lr_fsinfo = fsinfo,
@@ -8366,17 +8446,9 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
.callback_data = &data,
.flags = RPC_TASK_TIMEOUT,
};
- int status;
nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1);
- task = rpc_run_task(&task_setup);
-
- if (IS_ERR(task))
- return PTR_ERR(task);
-
- status = task->tk_status;
- rpc_put_task(task);
- return status;
+ return nfs4_call_sync_custom(&task_setup);
}
#ifdef CONFIG_NFS_V4_1
@@ -8845,7 +8917,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
const struct cred *cred)
{
struct nfs4_reclaim_complete_data *calldata;
- struct rpc_task *task;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
.rpc_cred = cred,
@@ -8854,7 +8925,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
.rpc_client = clp->cl_rpcclient,
.rpc_message = &msg,
.callback_ops = &nfs4_reclaim_complete_call_ops,
- .flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN,
+ .flags = RPC_TASK_NO_ROUND_ROBIN,
};
int status = -ENOMEM;
@@ -8869,15 +8940,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
msg.rpc_argp = &calldata->arg;
msg.rpc_resp = &calldata->res;
task_setup_data.callback_data = calldata;
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task)) {
- status = PTR_ERR(task);
- goto out;
- }
- status = rpc_wait_for_completion_task(task);
- if (status == 0)
- status = task->tk_status;
- rpc_put_task(task);
+ status = nfs4_call_sync_custom(&task_setup_data);
out:
dprintk("<-- %s status=%d\n", __func__, status);
return status;
@@ -9103,10 +9166,19 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
if (!nfs41_sequence_process(task, &lrp->res.seq_res))
return;
+ /*
+ * Was there an RPC level error? Assume the call succeeded,
+ * and that we need to release the layout
+ */
+ if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) {
+ lrp->res.lrs_present = 0;
+ return;
+ }
+
server = NFS_SERVER(lrp->args.inode);
switch (task->tk_status) {
case -NFS4ERR_OLD_STATEID:
- if (nfs4_layoutreturn_refresh_stateid(&lrp->args.stateid,
+ if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid,
&lrp->args.range,
lrp->args.inode))
goto out_restart;
@@ -9362,18 +9434,32 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_resp = &res,
};
struct rpc_clnt *clnt = server->client;
+ struct nfs4_call_sync_data data = {
+ .seq_server = server,
+ .seq_args = &args.seq_args,
+ .seq_res = &res.seq_res,
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = server->nfs_client->cl_mvops->call_sync_ops,
+ .callback_data = &data,
+ .flags = RPC_TASK_NO_ROUND_ROBIN,
+ };
const struct cred *cred = NULL;
int status;
if (use_integrity) {
clnt = server->nfs_client->cl_rpcclient;
+ task_setup.rpc_client = clnt;
+
cred = nfs4_get_clid_cred(server->nfs_client);
msg.rpc_cred = cred;
}
dprintk("--> %s\n", __func__);
- status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
- &res.seq_res, RPC_TASK_NO_ROUND_ROBIN);
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+ status = nfs4_call_sync_custom(&task_setup);
dprintk("<-- %s status=%d\n", __func__, status);
put_cred(cred);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cad4e064b328..0c6d53dc3672 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1015,22 +1015,6 @@ out:
return ret;
}
-bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
-{
- bool ret;
- int seq;
-
- do {
- ret = false;
- seq = read_seqbegin(&state->seqlock);
- if (nfs4_state_match_open_stateid_other(state, dst)) {
- dst->seqid = state->open_stateid.seqid;
- ret = true;
- }
- } while (read_seqretry(&state->seqlock, seq));
- return ret;
-}
-
bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
{
bool ret;
@@ -2095,8 +2079,10 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
}
status = nfs4_begin_drain_session(clp);
- if (status != 0)
- return status;
+ if (status != 0) {
+ result = status;
+ goto out;
+ }
status = nfs4_replace_transport(server, locations);
if (status != 0) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 46a8d636d151..ab07db0f07cd 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1174,7 +1174,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
} else
*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
- if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+ if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) {
*p++ = cpu_to_be32(label->lfs);
*p++ = cpu_to_be32(label->pi);
*p++ = cpu_to_be32(label->len);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4525d5acae38..bb80034a7661 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -359,9 +359,10 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
}
/*
- * Update the seqid of a layout stateid
+ * Update the seqid of a layout stateid after receiving
+ * NFS4ERR_OLD_STATEID
*/
-bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
+bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
struct pnfs_layout_range *dst_range,
struct inode *inode)
{
@@ -377,7 +378,15 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
spin_lock(&inode->i_lock);
lo = NFS_I(inode)->layout;
- if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
+ if (lo && pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
+ /* Is our call using the most recent seqid? If so, bump it */
+ if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) {
+ nfs4_stateid_seqid_inc(dst);
+ ret = true;
+ goto out;
+ }
+ /* Try to update the seqid to the most recent */
err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
if (err != -EBUSY) {
dst->seqid = lo->plh_stateid.seqid;
@@ -385,6 +394,7 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
ret = true;
}
}
+out:
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
return ret;
@@ -1440,6 +1450,52 @@ out_noroc:
return false;
}
+int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
+ struct nfs4_layoutreturn_args **argpp,
+ struct nfs4_layoutreturn_res **respp,
+ int *ret)
+{
+ struct nfs4_layoutreturn_args *arg = *argpp;
+ int retval = -EAGAIN;
+
+ if (!arg)
+ return 0;
+ /* Handle Layoutreturn errors */
+ switch (*ret) {
+ case 0:
+ retval = 0;
+ break;
+ case -NFS4ERR_NOMATCHING_LAYOUT:
+ /* Was there an RPC level error? If not, retry */
+ if (task->tk_rpc_status == 0)
+ break;
+ /* If the call was not sent, let caller handle it */
+ if (!RPC_WAS_SENT(task))
+ return 0;
+ /*
+ * Otherwise, assume the call succeeded and
+ * that we need to release the layout
+ */
+ *ret = 0;
+ (*respp)->lrs_present = 0;
+ retval = 0;
+ break;
+ case -NFS4ERR_DELAY:
+ /* Let the caller handle the retry */
+ *ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ return 0;
+ case -NFS4ERR_OLD_STATEID:
+ if (!nfs4_layout_refresh_old_stateid(&arg->stateid,
+ &arg->range, inode))
+ break;
+ *ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ return -EAGAIN;
+ }
+ *argpp = NULL;
+ *respp = NULL;
+ return retval;
+}
+
void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
int ret)
@@ -1449,10 +1505,15 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
const nfs4_stateid *res_stateid = NULL;
struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
- if (ret == 0) {
- arg_stateid = &args->stateid;
+ switch (ret) {
+ case -NFS4ERR_NOMATCHING_LAYOUT:
+ break;
+ case 0:
if (res->lrs_present)
res_stateid = &res->stateid;
+ /* Fallthrough */
+ default:
+ arg_stateid = &args->stateid;
}
pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
res_stateid);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f15609c003d8..f8a38065c7e4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -261,7 +261,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
bool is_recall);
int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
bool is_recall);
-bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
+bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
struct pnfs_layout_range *dst_range,
struct inode *inode);
void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -282,6 +282,10 @@ bool pnfs_roc(struct inode *ino,
struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
const struct cred *cred);
+int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
+ struct nfs4_layoutreturn_args **argpp,
+ struct nfs4_layoutreturn_res **respp,
+ int *ret);
void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
int ret);
@@ -701,6 +705,15 @@ pnfs_roc(struct inode *ino,
return false;
}
+static inline int
+pnfs_roc_done(struct rpc_task *task, struct inode *inode,
+ struct nfs4_layoutreturn_args **argpp,
+ struct nfs4_layoutreturn_res **respp,
+ int *ret)
+{
+ return 0;
+}
+
static inline void
pnfs_roc_release(struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
@@ -785,7 +798,7 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
{
}
-static inline bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
+static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
struct pnfs_layout_range *dst_range,
struct inode *inode)
{
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 19a76cfa8b1f..a84df7d63403 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2645,6 +2645,13 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
}
EXPORT_SYMBOL_GPL(nfs_clone_sb_security);
+static void nfs_set_readahead(struct backing_dev_info *bdi,
+ unsigned long iomax_pages)
+{
+ bdi->ra_pages = VM_READAHEAD_PAGES;
+ bdi->io_pages = iomax_pages;
+}
+
struct dentry *nfs_fs_mount_common(struct nfs_server *server,
int flags, const char *dev_name,
struct nfs_mount_info *mount_info,
@@ -2687,7 +2694,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
mntroot = ERR_PTR(error);
goto error_splat_super;
}
- s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
+ nfs_set_readahead(s->s_bdi, server->rpages);
server->super = s;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 85ca49549b39..52cab65f91cf 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -786,7 +786,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_page *head;
- atomic_long_dec(&nfsi->nrequests);
if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
head = req->wb_head;
@@ -799,8 +798,10 @@ static void nfs_inode_remove_request(struct nfs_page *req)
spin_unlock(&mapping->private_lock);
}
- if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
+ if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
nfs_release_request(req);
+ atomic_long_dec(&nfsi->nrequests);
+ }
}
static void
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index d25f6bbe7006..10cefb0c07c7 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -3,6 +3,7 @@ config NFSD
tristate "NFS server support"
depends on INET
depends on FILE_LOCKING
+ depends on FSNOTIFY
select LOCKD
select SUNRPC
select EXPORTFS
@@ -147,7 +148,7 @@ config NFSD_V4_SECURITY_LABEL
config NFSD_FAULT_INJECTION
bool "NFS server manual fault injection"
- depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS
+ depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS && BROKEN
help
This option enables support for manually injecting faults
into the NFS server. This is intended to be used for
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 2bfb58eefad1..6a40b1afe703 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -11,7 +11,8 @@ obj-$(CONFIG_NFSD) += nfsd.o
nfsd-y += trace.o
nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
- export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
+ export.o auth.o lockd.o nfscache.o nfsxdr.o \
+ stats.o filecache.o
nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index 4cd7c69a6cb9..ba14d2f4b64f 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -39,14 +39,6 @@ struct nfs4_acl;
struct svc_fh;
struct svc_rqst;
-/*
- * Maximum ACL we'll accept from a client; chosen (somewhat
- * arbitrarily) so that kmalloc'ing the ACL shouldn't require a
- * high-order allocation. This allows 204 ACEs on x86_64:
- */
-#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \
- / sizeof(struct nfs4_ace))
-
int nfs4_acl_bytes(int entries);
int nfs4_acl_get_whotype(char *, u32);
__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 66d4c55eb48e..9bbaa671c079 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -15,6 +15,7 @@
#include "blocklayoutxdr.h"
#include "pnfs.h"
+#include "filecache.h"
#define NFSDDBG_FACILITY NFSDDBG_PNFS
@@ -404,7 +405,7 @@ static void
nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
{
struct nfs4_client *clp = ls->ls_stid.sc_client;
- struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
+ struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev;
bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
nfsd4_scsi_pr_key(clp), 0, true);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index baa01956a5b3..15422c951fd1 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -22,6 +22,7 @@
#include "nfsfh.h"
#include "netns.h"
#include "pnfs.h"
+#include "filecache.h"
#define NFSDDBG_FACILITY NFSDDBG_EXPORT
@@ -232,6 +233,17 @@ static struct cache_head *expkey_alloc(void)
return NULL;
}
+static void expkey_flush(void)
+{
+ /*
+ * Take the nfsd_mutex here to ensure that the file cache is not
+ * destroyed while we're in the middle of flushing.
+ */
+ mutex_lock(&nfsd_mutex);
+ nfsd_file_cache_purge(current->nsproxy->net_ns);
+ mutex_unlock(&nfsd_mutex);
+}
+
static const struct cache_detail svc_expkey_cache_template = {
.owner = THIS_MODULE,
.hash_size = EXPKEY_HASHMAX,
@@ -244,6 +256,7 @@ static const struct cache_detail svc_expkey_cache_template = {
.init = expkey_init,
.update = expkey_update,
.alloc = expkey_alloc,
+ .flush = expkey_flush,
};
static int
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
new file mode 100644
index 000000000000..ef55e9b1cd4e
--- /dev/null
+++ b/fs/nfsd/filecache.c
@@ -0,0 +1,934 @@
+/*
+ * Open file cache.
+ *
+ * (c) 2015 - Jeff Layton <jeff.layton@primarydata.com>
+ */
+
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/list_lru.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/fsnotify.h>
+#include <linux/seq_file.h>
+
+#include "vfs.h"
+#include "nfsd.h"
+#include "nfsfh.h"
+#include "netns.h"
+#include "filecache.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_FH
+
+/* FIXME: dynamically size this for the machine somehow? */
+#define NFSD_FILE_HASH_BITS 12
+#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS)
+#define NFSD_LAUNDRETTE_DELAY (2 * HZ)
+
+#define NFSD_FILE_LRU_RESCAN (0)
+#define NFSD_FILE_SHUTDOWN (1)
+#define NFSD_FILE_LRU_THRESHOLD (4096UL)
+#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2)
+
+/* We only care about NFSD_MAY_READ/WRITE for this cache */
+#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE)
+
+struct nfsd_fcache_bucket {
+ struct hlist_head nfb_head;
+ spinlock_t nfb_lock;
+ unsigned int nfb_count;
+ unsigned int nfb_maxcount;
+};
+
+static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
+
+static struct kmem_cache *nfsd_file_slab;
+static struct kmem_cache *nfsd_file_mark_slab;
+static struct nfsd_fcache_bucket *nfsd_file_hashtbl;
+static struct list_lru nfsd_file_lru;
+static long nfsd_file_lru_flags;
+static struct fsnotify_group *nfsd_file_fsnotify_group;
+static atomic_long_t nfsd_filecache_count;
+static struct delayed_work nfsd_filecache_laundrette;
+
+enum nfsd_file_laundrette_ctl {
+ NFSD_FILE_LAUNDRETTE_NOFLUSH = 0,
+ NFSD_FILE_LAUNDRETTE_MAY_FLUSH
+};
+
+static void
+nfsd_file_schedule_laundrette(enum nfsd_file_laundrette_ctl ctl)
+{
+ long count = atomic_long_read(&nfsd_filecache_count);
+
+ if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags))
+ return;
+
+ /* Be more aggressive about scanning if over the threshold */
+ if (count > NFSD_FILE_LRU_THRESHOLD)
+ mod_delayed_work(system_wq, &nfsd_filecache_laundrette, 0);
+ else
+ schedule_delayed_work(&nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY);
+
+ if (ctl == NFSD_FILE_LAUNDRETTE_NOFLUSH)
+ return;
+
+ /* ...and don't delay flushing if we're out of control */
+ if (count >= NFSD_FILE_LRU_LIMIT)
+ flush_delayed_work(&nfsd_filecache_laundrette);
+}
+
+static void
+nfsd_file_slab_free(struct rcu_head *rcu)
+{
+ struct nfsd_file *nf = container_of(rcu, struct nfsd_file, nf_rcu);
+
+ put_cred(nf->nf_cred);
+ kmem_cache_free(nfsd_file_slab, nf);
+}
+
+static void
+nfsd_file_mark_free(struct fsnotify_mark *mark)
+{
+ struct nfsd_file_mark *nfm = container_of(mark, struct nfsd_file_mark,
+ nfm_mark);
+
+ kmem_cache_free(nfsd_file_mark_slab, nfm);
+}
+
+static struct nfsd_file_mark *
+nfsd_file_mark_get(struct nfsd_file_mark *nfm)
+{
+ if (!atomic_inc_not_zero(&nfm->nfm_ref))
+ return NULL;
+ return nfm;
+}
+
+static void
+nfsd_file_mark_put(struct nfsd_file_mark *nfm)
+{
+ if (atomic_dec_and_test(&nfm->nfm_ref)) {
+
+ fsnotify_destroy_mark(&nfm->nfm_mark, nfsd_file_fsnotify_group);
+ fsnotify_put_mark(&nfm->nfm_mark);
+ }
+}
+
+static struct nfsd_file_mark *
+nfsd_file_mark_find_or_create(struct nfsd_file *nf)
+{
+ int err;
+ struct fsnotify_mark *mark;
+ struct nfsd_file_mark *nfm = NULL, *new;
+ struct inode *inode = nf->nf_inode;
+
+ do {
+ mutex_lock(&nfsd_file_fsnotify_group->mark_mutex);
+ mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
+ nfsd_file_fsnotify_group);
+ if (mark) {
+ nfm = nfsd_file_mark_get(container_of(mark,
+ struct nfsd_file_mark,
+ nfm_mark));
+ mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+ fsnotify_put_mark(mark);
+ if (likely(nfm))
+ break;
+ } else
+ mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+
+ /* allocate a new nfm */
+ new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL);
+ if (!new)
+ return NULL;
+ fsnotify_init_mark(&new->nfm_mark, nfsd_file_fsnotify_group);
+ new->nfm_mark.mask = FS_ATTRIB|FS_DELETE_SELF;
+ atomic_set(&new->nfm_ref, 1);
+
+ err = fsnotify_add_inode_mark(&new->nfm_mark, inode, 0);
+
+ /*
+ * If the add was successful, then return the object.
+ * Otherwise, we need to put the reference we hold on the
+ * nfm_mark. The fsnotify code will take a reference and put
+ * it on failure, so we can't just free it directly. It's also
+ * not safe to call fsnotify_destroy_mark on it as the
+ * mark->group will be NULL. Thus, we can't let the nfm_ref
+ * counter drive the destruction at this point.
+ */
+ if (likely(!err))
+ nfm = new;
+ else
+ fsnotify_put_mark(&new->nfm_mark);
+ } while (unlikely(err == -EEXIST));
+
+ return nfm;
+}
+
+static struct nfsd_file *
+nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
+ struct net *net)
+{
+ struct nfsd_file *nf;
+
+ nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL);
+ if (nf) {
+ INIT_HLIST_NODE(&nf->nf_node);
+ INIT_LIST_HEAD(&nf->nf_lru);
+ nf->nf_file = NULL;
+ nf->nf_cred = get_current_cred();
+ nf->nf_net = net;
+ nf->nf_flags = 0;
+ nf->nf_inode = inode;
+ nf->nf_hashval = hashval;
+ atomic_set(&nf->nf_ref, 1);
+ nf->nf_may = may & NFSD_FILE_MAY_MASK;
+ if (may & NFSD_MAY_NOT_BREAK_LEASE) {
+ if (may & NFSD_MAY_WRITE)
+ __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags);
+ if (may & NFSD_MAY_READ)
+ __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
+ }
+ nf->nf_mark = NULL;
+ trace_nfsd_file_alloc(nf);
+ }
+ return nf;
+}
+
+static bool
+nfsd_file_free(struct nfsd_file *nf)
+{
+ bool flush = false;
+
+ trace_nfsd_file_put_final(nf);
+ if (nf->nf_mark)
+ nfsd_file_mark_put(nf->nf_mark);
+ if (nf->nf_file) {
+ get_file(nf->nf_file);
+ filp_close(nf->nf_file, NULL);
+ fput(nf->nf_file);
+ flush = true;
+ }
+ call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
+ return flush;
+}
+
+static bool
+nfsd_file_check_writeback(struct nfsd_file *nf)
+{
+ struct file *file = nf->nf_file;
+ struct address_space *mapping;
+
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return false;
+ mapping = file->f_mapping;
+ return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
+ mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
+}
+
+static int
+nfsd_file_check_write_error(struct nfsd_file *nf)
+{
+ struct file *file = nf->nf_file;
+
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return 0;
+ return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
+}
+
+static bool
+nfsd_file_in_use(struct nfsd_file *nf)
+{
+ return nfsd_file_check_writeback(nf) ||
+ nfsd_file_check_write_error(nf);
+}
+
+static void
+nfsd_file_do_unhash(struct nfsd_file *nf)
+{
+ lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+
+ trace_nfsd_file_unhash(nf);
+
+ if (nfsd_file_check_write_error(nf))
+ nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id));
+ --nfsd_file_hashtbl[nf->nf_hashval].nfb_count;
+ hlist_del_rcu(&nf->nf_node);
+ if (!list_empty(&nf->nf_lru))
+ list_lru_del(&nfsd_file_lru, &nf->nf_lru);
+ atomic_long_dec(&nfsd_filecache_count);
+}
+
+static bool
+nfsd_file_unhash(struct nfsd_file *nf)
+{
+ if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ nfsd_file_do_unhash(nf);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Return true if the file was unhashed.
+ */
+static bool
+nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose)
+{
+ lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+
+ trace_nfsd_file_unhash_and_release_locked(nf);
+ if (!nfsd_file_unhash(nf))
+ return false;
+ /* keep final reference for nfsd_file_lru_dispose */
+ if (atomic_add_unless(&nf->nf_ref, -1, 1))
+ return true;
+
+ list_add(&nf->nf_lru, dispose);
+ return true;
+}
+
+static int
+nfsd_file_put_noref(struct nfsd_file *nf)
+{
+ int count;
+ trace_nfsd_file_put(nf);
+
+ count = atomic_dec_return(&nf->nf_ref);
+ if (!count) {
+ WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
+ nfsd_file_free(nf);
+ }
+ return count;
+}
+
+void
+nfsd_file_put(struct nfsd_file *nf)
+{
+ bool is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0;
+ bool unused = !nfsd_file_in_use(nf);
+
+ set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+ if (nfsd_file_put_noref(nf) == 1 && is_hashed && unused)
+ nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH);
+}
+
+struct nfsd_file *
+nfsd_file_get(struct nfsd_file *nf)
+{
+ if (likely(atomic_inc_not_zero(&nf->nf_ref)))
+ return nf;
+ return NULL;
+}
+
+static void
+nfsd_file_dispose_list(struct list_head *dispose)
+{
+ struct nfsd_file *nf;
+
+ while(!list_empty(dispose)) {
+ nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+ list_del(&nf->nf_lru);
+ nfsd_file_put_noref(nf);
+ }
+}
+
+static void
+nfsd_file_dispose_list_sync(struct list_head *dispose)
+{
+ bool flush = false;
+ struct nfsd_file *nf;
+
+ while(!list_empty(dispose)) {
+ nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+ list_del(&nf->nf_lru);
+ if (!atomic_dec_and_test(&nf->nf_ref))
+ continue;
+ if (nfsd_file_free(nf))
+ flush = true;
+ }
+ if (flush)
+ flush_delayed_fput();
+}
+
+/*
+ * Note this can deadlock with nfsd_file_cache_purge.
+ */
+static enum lru_status
+nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
+ spinlock_t *lock, void *arg)
+ __releases(lock)
+ __acquires(lock)
+{
+ struct list_head *head = arg;
+ struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
+
+ /*
+ * Do a lockless refcount check. The hashtable holds one reference, so
+ * we look to see if anything else has a reference, or if any have
+ * been put since the shrinker last ran. Those don't get unhashed and
+ * released.
+ *
+ * Note that in the put path, we set the flag and then decrement the
+ * counter. Here we check the counter and then test and clear the flag.
+ * That order is deliberate to ensure that we can do this locklessly.
+ */
+ if (atomic_read(&nf->nf_ref) > 1)
+ goto out_skip;
+
+ /*
+ * Don't throw out files that are still undergoing I/O or
+ * that have uncleared errors pending.
+ */
+ if (nfsd_file_check_writeback(nf))
+ goto out_skip;
+
+ if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags))
+ goto out_rescan;
+
+ if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+ goto out_skip;
+
+ list_lru_isolate_move(lru, &nf->nf_lru, head);
+ return LRU_REMOVED;
+out_rescan:
+ set_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags);
+out_skip:
+ return LRU_SKIP;
+}
+
+static void
+nfsd_file_lru_dispose(struct list_head *head)
+{
+ while(!list_empty(head)) {
+ struct nfsd_file *nf = list_first_entry(head,
+ struct nfsd_file, nf_lru);
+ list_del_init(&nf->nf_lru);
+ spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+ nfsd_file_do_unhash(nf);
+ spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+ nfsd_file_put_noref(nf);
+ }
+}
+
+static unsigned long
+nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc)
+{
+ return list_lru_count(&nfsd_file_lru);
+}
+
+static unsigned long
+nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
+{
+ LIST_HEAD(head);
+ unsigned long ret;
+
+ ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &head);
+ nfsd_file_lru_dispose(&head);
+ return ret;
+}
+
+static struct shrinker nfsd_file_shrinker = {
+ .scan_objects = nfsd_file_lru_scan,
+ .count_objects = nfsd_file_lru_count,
+ .seeks = 1,
+};
+
+static void
+__nfsd_file_close_inode(struct inode *inode, unsigned int hashval,
+ struct list_head *dispose)
+{
+ struct nfsd_file *nf;
+ struct hlist_node *tmp;
+
+ spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
+ if (inode == nf->nf_inode)
+ nfsd_file_unhash_and_release_locked(nf, dispose);
+ }
+ spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+}
+
+/**
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * @inode: inode of the file to attempt to remove
+ *
+ * Walk the whole hash bucket, looking for any files that correspond to "inode".
+ * If any do, then unhash them and put the hashtable reference to them and
+ * destroy any that had their last reference put. Also ensure that any of the
+ * fputs also have their final __fput done as well.
+ */
+void
+nfsd_file_close_inode_sync(struct inode *inode)
+{
+ unsigned int hashval = (unsigned int)hash_long(inode->i_ino,
+ NFSD_FILE_HASH_BITS);
+ LIST_HEAD(dispose);
+
+ __nfsd_file_close_inode(inode, hashval, &dispose);
+ trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose));
+ nfsd_file_dispose_list_sync(&dispose);
+}
+
+/**
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * @inode: inode of the file to attempt to remove
+ *
+ * Walk the whole hash bucket, looking for any files that correspond to "inode".
+ * If any do, then unhash them and put the hashtable reference to them and
+ * destroy any that had their last reference put.
+ */
+static void
+nfsd_file_close_inode(struct inode *inode)
+{
+ unsigned int hashval = (unsigned int)hash_long(inode->i_ino,
+ NFSD_FILE_HASH_BITS);
+ LIST_HEAD(dispose);
+
+ __nfsd_file_close_inode(inode, hashval, &dispose);
+ trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose));
+ nfsd_file_dispose_list(&dispose);
+}
+
+/**
+ * nfsd_file_delayed_close - close unused nfsd_files
+ * @work: dummy
+ *
+ * Walk the LRU list and close any entries that have not been used since
+ * the last scan.
+ *
+ * Note this can deadlock with nfsd_file_cache_purge.
+ */
+static void
+nfsd_file_delayed_close(struct work_struct *work)
+{
+ LIST_HEAD(head);
+
+ list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, &head, LONG_MAX);
+
+ if (test_and_clear_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags))
+ nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_NOFLUSH);
+
+ if (!list_empty(&head)) {
+ nfsd_file_lru_dispose(&head);
+ flush_delayed_fput();
+ }
+}
+
+static int
+nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
+ void *data)
+{
+ struct file_lock *fl = data;
+
+ /* Only close files for F_SETLEASE leases */
+ if (fl->fl_flags & FL_LEASE)
+ nfsd_file_close_inode_sync(file_inode(fl->fl_file));
+ return 0;
+}
+
+static struct notifier_block nfsd_file_lease_notifier = {
+ .notifier_call = nfsd_file_lease_notifier_call,
+};
+
+static int
+nfsd_file_fsnotify_handle_event(struct fsnotify_group *group,
+ struct inode *inode,
+ u32 mask, const void *data, int data_type,
+ const struct qstr *file_name, u32 cookie,
+ struct fsnotify_iter_info *iter_info)
+{
+ trace_nfsd_file_fsnotify_handle_event(inode, mask);
+
+ /* Should be no marks on non-regular files */
+ if (!S_ISREG(inode->i_mode)) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ /* don't close files if this was not the last link */
+ if (mask & FS_ATTRIB) {
+ if (inode->i_nlink)
+ return 0;
+ }
+
+ nfsd_file_close_inode(inode);
+ return 0;
+}
+
+
+static const struct fsnotify_ops nfsd_file_fsnotify_ops = {
+ .handle_event = nfsd_file_fsnotify_handle_event,
+ .free_mark = nfsd_file_mark_free,
+};
+
+int
+nfsd_file_cache_init(void)
+{
+ int ret = -ENOMEM;
+ unsigned int i;
+
+ clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+
+ if (nfsd_file_hashtbl)
+ return 0;
+
+ nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
+ sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
+ if (!nfsd_file_hashtbl) {
+ pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
+ goto out_err;
+ }
+
+ nfsd_file_slab = kmem_cache_create("nfsd_file",
+ sizeof(struct nfsd_file), 0, 0, NULL);
+ if (!nfsd_file_slab) {
+ pr_err("nfsd: unable to create nfsd_file_slab\n");
+ goto out_err;
+ }
+
+ nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark",
+ sizeof(struct nfsd_file_mark), 0, 0, NULL);
+ if (!nfsd_file_mark_slab) {
+ pr_err("nfsd: unable to create nfsd_file_mark_slab\n");
+ goto out_err;
+ }
+
+
+ ret = list_lru_init(&nfsd_file_lru);
+ if (ret) {
+ pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret);
+ goto out_err;
+ }
+
+ ret = register_shrinker(&nfsd_file_shrinker);
+ if (ret) {
+ pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
+ goto out_lru;
+ }
+
+ ret = lease_register_notifier(&nfsd_file_lease_notifier);
+ if (ret) {
+ pr_err("nfsd: unable to register lease notifier: %d\n", ret);
+ goto out_shrinker;
+ }
+
+ nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops);
+ if (IS_ERR(nfsd_file_fsnotify_group)) {
+ pr_err("nfsd: unable to create fsnotify group: %ld\n",
+ PTR_ERR(nfsd_file_fsnotify_group));
+ nfsd_file_fsnotify_group = NULL;
+ goto out_notifier;
+ }
+
+ for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+ INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head);
+ spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock);
+ }
+
+ INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_delayed_close);
+out:
+ return ret;
+out_notifier:
+ lease_unregister_notifier(&nfsd_file_lease_notifier);
+out_shrinker:
+ unregister_shrinker(&nfsd_file_shrinker);
+out_lru:
+ list_lru_destroy(&nfsd_file_lru);
+out_err:
+ kmem_cache_destroy(nfsd_file_slab);
+ nfsd_file_slab = NULL;
+ kmem_cache_destroy(nfsd_file_mark_slab);
+ nfsd_file_mark_slab = NULL;
+ kfree(nfsd_file_hashtbl);
+ nfsd_file_hashtbl = NULL;
+ goto out;
+}
+
+/*
+ * Note this can deadlock with nfsd_file_lru_cb.
+ */
+void
+nfsd_file_cache_purge(struct net *net)
+{
+ unsigned int i;
+ struct nfsd_file *nf;
+ struct hlist_node *next;
+ LIST_HEAD(dispose);
+ bool del;
+
+ if (!nfsd_file_hashtbl)
+ return;
+
+ for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+ struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i];
+
+ spin_lock(&nfb->nfb_lock);
+ hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) {
+ if (net && nf->nf_net != net)
+ continue;
+ del = nfsd_file_unhash_and_release_locked(nf, &dispose);
+
+ /*
+ * Deadlock detected! Something marked this entry as
+ * unhased, but hasn't removed it from the hash list.
+ */
+ WARN_ON_ONCE(!del);
+ }
+ spin_unlock(&nfb->nfb_lock);
+ nfsd_file_dispose_list(&dispose);
+ }
+}
+
+void
+nfsd_file_cache_shutdown(void)
+{
+ LIST_HEAD(dispose);
+
+ set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+
+ lease_unregister_notifier(&nfsd_file_lease_notifier);
+ unregister_shrinker(&nfsd_file_shrinker);
+ /*
+ * make sure all callers of nfsd_file_lru_cb are done before
+ * calling nfsd_file_cache_purge
+ */
+ cancel_delayed_work_sync(&nfsd_filecache_laundrette);
+ nfsd_file_cache_purge(NULL);
+ list_lru_destroy(&nfsd_file_lru);
+ rcu_barrier();
+ fsnotify_put_group(nfsd_file_fsnotify_group);
+ nfsd_file_fsnotify_group = NULL;
+ kmem_cache_destroy(nfsd_file_slab);
+ nfsd_file_slab = NULL;
+ fsnotify_wait_marks_destroyed();
+ kmem_cache_destroy(nfsd_file_mark_slab);
+ nfsd_file_mark_slab = NULL;
+ kfree(nfsd_file_hashtbl);
+ nfsd_file_hashtbl = NULL;
+}
+
+static bool
+nfsd_match_cred(const struct cred *c1, const struct cred *c2)
+{
+ int i;
+
+ if (!uid_eq(c1->fsuid, c2->fsuid))
+ return false;
+ if (!gid_eq(c1->fsgid, c2->fsgid))
+ return false;
+ if (c1->group_info == NULL || c2->group_info == NULL)
+ return c1->group_info == c2->group_info;
+ if (c1->group_info->ngroups != c2->group_info->ngroups)
+ return false;
+ for (i = 0; i < c1->group_info->ngroups; i++) {
+ if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i]))
+ return false;
+ }
+ return true;
+}
+
+static struct nfsd_file *
+nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
+ unsigned int hashval, struct net *net)
+{
+ struct nfsd_file *nf;
+ unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
+
+ hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
+ nf_node) {
+ if ((need & nf->nf_may) != need)
+ continue;
+ if (nf->nf_inode != inode)
+ continue;
+ if (nf->nf_net != net)
+ continue;
+ if (!nfsd_match_cred(nf->nf_cred, current_cred()))
+ continue;
+ if (nfsd_file_get(nf) != NULL)
+ return nf;
+ }
+ return NULL;
+}
+
+/**
+ * nfsd_file_is_cached - are there any cached open files for this fh?
+ * @inode: inode of the file to check
+ *
+ * Scan the hashtable for open files that match this fh. Returns true if there
+ * are any, and false if not.
+ */
+bool
+nfsd_file_is_cached(struct inode *inode)
+{
+ bool ret = false;
+ struct nfsd_file *nf;
+ unsigned int hashval;
+
+ hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
+ nf_node) {
+ if (inode == nf->nf_inode) {
+ ret = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ trace_nfsd_file_is_cached(inode, hashval, (int)ret);
+ return ret;
+}
+
+__be32
+nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ __be32 status;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_file *nf, *new;
+ struct inode *inode;
+ unsigned int hashval;
+
+ /* FIXME: skip this if fh_dentry is already set? */
+ status = fh_verify(rqstp, fhp, S_IFREG,
+ may_flags|NFSD_MAY_OWNER_OVERRIDE);
+ if (status != nfs_ok)
+ return status;
+
+ inode = d_inode(fhp->fh_dentry);
+ hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
+retry:
+ rcu_read_lock();
+ nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
+ rcu_read_unlock();
+ if (nf)
+ goto wait_for_construction;
+
+ new = nfsd_file_alloc(inode, may_flags, hashval, net);
+ if (!new) {
+ trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags,
+ NULL, nfserr_jukebox);
+ return nfserr_jukebox;
+ }
+
+ spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
+ if (nf == NULL)
+ goto open_file;
+ spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ nfsd_file_slab_free(&new->nf_rcu);
+
+wait_for_construction:
+ wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);
+
+ /* Did construction of this file fail? */
+ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ nfsd_file_put_noref(nf);
+ goto retry;
+ }
+
+ this_cpu_inc(nfsd_file_cache_hits);
+
+ if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) {
+ bool write = (may_flags & NFSD_MAY_WRITE);
+
+ if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) ||
+ (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) {
+ status = nfserrno(nfsd_open_break_lease(
+ file_inode(nf->nf_file), may_flags));
+ if (status == nfs_ok) {
+ clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
+ if (write)
+ clear_bit(NFSD_FILE_BREAK_WRITE,
+ &nf->nf_flags);
+ }
+ }
+ }
+out:
+ if (status == nfs_ok) {
+ *pnf = nf;
+ } else {
+ nfsd_file_put(nf);
+ nf = NULL;
+ }
+
+ trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status);
+ return status;
+open_file:
+ nf = new;
+ /* Take reference for the hashtable */
+ atomic_inc(&nf->nf_ref);
+ __set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
+ __set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+ list_lru_add(&nfsd_file_lru, &nf->nf_lru);
+ hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head);
+ ++nfsd_file_hashtbl[hashval].nfb_count;
+ nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount,
+ nfsd_file_hashtbl[hashval].nfb_count);
+ spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ atomic_long_inc(&nfsd_filecache_count);
+
+ nf->nf_mark = nfsd_file_mark_find_or_create(nf);
+ if (nf->nf_mark)
+ status = nfsd_open_verified(rqstp, fhp, S_IFREG,
+ may_flags, &nf->nf_file);
+ else
+ status = nfserr_jukebox;
+ /*
+ * If construction failed, or we raced with a call to unlink()
+ * then unhash.
+ */
+ if (status != nfs_ok || inode->i_nlink == 0) {
+ bool do_free;
+ spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ do_free = nfsd_file_unhash(nf);
+ spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ if (do_free)
+ nfsd_file_put_noref(nf);
+ }
+ clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
+ goto out;
+}
+
+/*
+ * Note that fields may be added, removed or reordered in the future. Programs
+ * scraping this file for info should test the labels to ensure they're
+ * getting the correct field.
+ */
+static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
+{
+ unsigned int i, count = 0, longest = 0;
+ unsigned long hits = 0;
+
+ /*
+ * No need for spinlocks here since we're not terribly interested in
+ * accuracy. We do take the nfsd_mutex simply to ensure that we
+ * don't end up racing with server shutdown
+ */
+ mutex_lock(&nfsd_mutex);
+ if (nfsd_file_hashtbl) {
+ for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+ count += nfsd_file_hashtbl[i].nfb_count;
+ longest = max(longest, nfsd_file_hashtbl[i].nfb_count);
+ }
+ }
+ mutex_unlock(&nfsd_mutex);
+
+ for_each_possible_cpu(i)
+ hits += per_cpu(nfsd_file_cache_hits, i);
+
+ seq_printf(m, "total entries: %u\n", count);
+ seq_printf(m, "longest chain: %u\n", longest);
+ seq_printf(m, "cache hits: %lu\n", hits);
+ return 0;
+}
+
+int nfsd_file_cache_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, nfsd_file_cache_stats_show, NULL);
+}
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
new file mode 100644
index 000000000000..851d9abf54c2
--- /dev/null
+++ b/fs/nfsd/filecache.h
@@ -0,0 +1,61 @@
+#ifndef _FS_NFSD_FILECACHE_H
+#define _FS_NFSD_FILECACHE_H
+
+#include <linux/fsnotify_backend.h>
+
+/*
+ * This is the fsnotify_mark container that nfsd attaches to the files that it
+ * is holding open. Note that we have a separate refcount here aside from the
+ * one in the fsnotify_mark. We only want a single fsnotify_mark attached to
+ * the inode, and for each nfsd_file to hold a reference to it.
+ *
+ * The fsnotify_mark is itself refcounted, but that's not sufficient to tell us
+ * how to put that reference. If there are still outstanding nfsd_files that
+ * reference the mark, then we would want to call fsnotify_put_mark on it.
+ * If there were not, then we'd need to call fsnotify_destroy_mark. Since we
+ * can't really tell the difference, we use the nfm_mark to keep track of how
+ * many nfsd_files hold references to the mark. When that counter goes to zero
+ * then we know to call fsnotify_destroy_mark on it.
+ */
+struct nfsd_file_mark {
+ struct fsnotify_mark nfm_mark;
+ atomic_t nfm_ref;
+};
+
+/*
+ * A representation of a file that has been opened by knfsd. These are hashed
+ * in the hashtable by inode pointer value. Note that this object doesn't
+ * hold a reference to the inode by itself, so the nf_inode pointer should
+ * never be dereferenced, only used for comparison.
+ */
+struct nfsd_file {
+ struct hlist_node nf_node;
+ struct list_head nf_lru;
+ struct rcu_head nf_rcu;
+ struct file *nf_file;
+ const struct cred *nf_cred;
+ struct net *nf_net;
+#define NFSD_FILE_HASHED (0)
+#define NFSD_FILE_PENDING (1)
+#define NFSD_FILE_BREAK_READ (2)
+#define NFSD_FILE_BREAK_WRITE (3)
+#define NFSD_FILE_REFERENCED (4)
+ unsigned long nf_flags;
+ struct inode *nf_inode;
+ unsigned int nf_hashval;
+ atomic_t nf_ref;
+ unsigned char nf_may;
+ struct nfsd_file_mark *nf_mark;
+};
+
+int nfsd_file_cache_init(void);
+void nfsd_file_cache_purge(struct net *);
+void nfsd_file_cache_shutdown(void);
+void nfsd_file_put(struct nfsd_file *nf);
+struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
+void nfsd_file_close_inode_sync(struct inode *inode);
+bool nfsd_file_is_cached(struct inode *inode);
+__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **nfp);
+int nfsd_file_cache_stats_open(struct inode *, struct file *);
+#endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index bdfe5bcb3dcd..9a4ef815fb8c 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -104,6 +104,7 @@ struct nfsd_net {
/* Time of server startup */
struct timespec64 nfssvc_boot;
+ seqlock_t boot_lock;
/*
* Max number of connections this nfsd container will allow. Defaults
@@ -179,4 +180,7 @@ struct nfsd_net {
extern void nfsd_netns_free_versions(struct nfsd_net *nn);
extern unsigned int nfsd_net_id;
+
+void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn);
+void nfsd_reset_boot_verifier(struct nfsd_net *nn);
#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9bc32af4e2da..cea68d8411ac 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -172,13 +172,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
nfserr = nfsd_read(rqstp, &resp->fh,
argp->offset,
rqstp->rq_vec, argp->vlen,
- &resp->count);
- if (nfserr == 0) {
- struct inode *inode = d_inode(resp->fh.fh_dentry);
- resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset,
- inode->i_size);
- }
-
+ &resp->count,
+ &resp->eof);
RETURN_STATUS(nfserr);
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index fcf31822c74c..86e5658651f1 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -27,6 +27,7 @@ static u32 nfs3_ftypes[] = {
NF3SOCK, NF3BAD, NF3LNK, NF3BAD,
};
+
/*
* XDR functions for basic NFS types
*/
@@ -751,14 +752,16 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_writeres *resp = rqstp->rq_resp;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 verf[2];
p = encode_wcc_data(rqstp, p, &resp->fh);
if (resp->status == 0) {
*p++ = htonl(resp->count);
*p++ = htonl(resp->committed);
/* unique identifier, y2038 overflow can be ignored */
- *p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
- *p++ = htonl(nn->nfssvc_boot.tv_nsec);
+ nfsd_copy_boot_verifier(verf, nn);
+ *p++ = verf[0];
+ *p++ = verf[1];
}
return xdr_ressize_check(rqstp, p);
}
@@ -1125,13 +1128,15 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_commitres *resp = rqstp->rq_resp;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 verf[2];
p = encode_wcc_data(rqstp, p, &resp->fh);
/* Write verifier */
if (resp->status == 0) {
/* unique identifier, y2038 overflow can be ignored */
- *p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
- *p++ = htonl(nn->nfssvc_boot.tv_nsec);
+ nfsd_copy_boot_verifier(verf, nn);
+ *p++ = verf[0];
+ *p++ = verf[1];
}
return xdr_ressize_check(rqstp, p);
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 397eb7820929..524111420b48 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -512,11 +512,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
if (unlikely(status))
return status;
- if (cb != NULL) {
- status = decode_cb_sequence4res(xdr, cb);
- if (unlikely(status || cb->cb_seq_status))
- return status;
- }
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status || cb->cb_seq_status))
+ return status;
return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status);
}
@@ -604,11 +602,10 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
if (unlikely(status))
return status;
- if (cb) {
- status = decode_cb_sequence4res(xdr, cb);
- if (unlikely(status || cb->cb_seq_status))
- return status;
- }
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status || cb->cb_seq_status))
+ return status;
+
return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status);
}
#endif /* CONFIG_NFSD_PNFS */
@@ -663,11 +660,10 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
if (unlikely(status))
return status;
- if (cb) {
- status = decode_cb_sequence4res(xdr, cb);
- if (unlikely(status || cb->cb_seq_status))
- return status;
- }
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status || cb->cb_seq_status))
+ return status;
+
return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status);
}
@@ -759,11 +755,10 @@ static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp,
if (unlikely(status))
return status;
- if (cb) {
- status = decode_cb_sequence4res(xdr, cb);
- if (unlikely(status || cb->cb_seq_status))
- return status;
- }
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status || cb->cb_seq_status))
+ return status;
+
return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status);
}
/*
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index a79e24b79095..2681c70283ce 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -169,8 +169,8 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
spin_unlock(&fp->fi_lock);
if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
- vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
- fput(ls->ls_file);
+ vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls);
+ nfsd_file_put(ls->ls_file);
if (ls->ls_recalled)
atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
@@ -197,7 +197,7 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
fl->fl_end = OFFSET_MAX;
fl->fl_owner = ls;
fl->fl_pid = current->tgid;
- fl->fl_file = ls->ls_file;
+ fl->fl_file = ls->ls_file->nf_file;
status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
if (status) {
@@ -236,13 +236,13 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
NFSPROC4_CLNT_CB_LAYOUT);
if (parent->sc_type == NFS4_DELEG_STID)
- ls->ls_file = get_file(fp->fi_deleg_file);
+ ls->ls_file = nfsd_file_get(fp->fi_deleg_file);
else
ls->ls_file = find_any_file(fp);
BUG_ON(!ls->ls_file);
if (nfsd4_layout_setlease(ls)) {
- fput(ls->ls_file);
+ nfsd_file_put(ls->ls_file);
put_nfs4_file(fp);
kmem_cache_free(nfs4_layout_stateid_cache, ls);
return NULL;
@@ -626,7 +626,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
argv[0] = (char *)nfsd_recall_failed;
argv[1] = addr_str;
- argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
+ argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id;
argv[3] = NULL;
error = call_usermodehelper(nfsd_recall_failed, argv, envp,
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8beda999e134..4e3e77b76411 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -568,17 +568,11 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
{
- __be32 verf[2];
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ __be32 *verf = (__be32 *)verifier->data;
- /*
- * This is opaque to client, so no need to byte-swap. Use
- * __force to keep sparse happy. y2038 time_t overflow is
- * irrelevant in this usage.
- */
- verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
- verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
- memcpy(verifier->data, verf, sizeof(verifier->data));
+ BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data));
+
+ nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id));
}
static __be32
@@ -761,7 +755,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_read *read = &u->read;
__be32 status;
- read->rd_filp = NULL;
+ read->rd_nf = NULL;
if (read->rd_offset >= OFFSET_MAX)
return nfserr_inval;
@@ -782,7 +776,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* check stateid */
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&read->rd_stateid, RD_STATE,
- &read->rd_filp, &read->rd_tmp_file);
+ &read->rd_nf);
if (status) {
dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
goto out;
@@ -798,8 +792,8 @@ out:
static void
nfsd4_read_release(union nfsd4_op_u *u)
{
- if (u->read.rd_filp)
- fput(u->read.rd_filp);
+ if (u->read.rd_nf)
+ nfsd_file_put(u->read.rd_nf);
trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
u->read.rd_offset, u->read.rd_length);
}
@@ -954,7 +948,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
status = nfs4_preprocess_stateid_op(rqstp, cstate,
&cstate->current_fh, &setattr->sa_stateid,
- WR_STATE, NULL, NULL);
+ WR_STATE, NULL);
if (status) {
dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
return status;
@@ -993,7 +987,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct nfsd4_write *write = &u->write;
stateid_t *stateid = &write->wr_stateid;
- struct file *filp = NULL;
+ struct nfsd_file *nf = NULL;
__be32 status = nfs_ok;
unsigned long cnt;
int nvecs;
@@ -1005,7 +999,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
trace_nfsd_write_start(rqstp, &cstate->current_fh,
write->wr_offset, cnt);
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
- stateid, WR_STATE, &filp, NULL);
+ stateid, WR_STATE, &nf);
if (status) {
dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
return status;
@@ -1018,10 +1012,10 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&write->wr_head, write->wr_buflen);
WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
- status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp,
+ status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf->nf_file,
write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
write->wr_how_written);
- fput(filp);
+ nfsd_file_put(nf);
write->wr_bytes_written = cnt;
trace_nfsd_write_done(rqstp, &cstate->current_fh,
@@ -1031,8 +1025,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
static __be32
nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- stateid_t *src_stateid, struct file **src,
- stateid_t *dst_stateid, struct file **dst)
+ stateid_t *src_stateid, struct nfsd_file **src,
+ stateid_t *dst_stateid, struct nfsd_file **dst)
{
__be32 status;
@@ -1040,22 +1034,22 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_nofilehandle;
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
- src_stateid, RD_STATE, src, NULL);
+ src_stateid, RD_STATE, src);
if (status) {
dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
goto out;
}
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
- dst_stateid, WR_STATE, dst, NULL);
+ dst_stateid, WR_STATE, dst);
if (status) {
dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
goto out_put_src;
}
/* fix up for NFS-specific error code */
- if (!S_ISREG(file_inode(*src)->i_mode) ||
- !S_ISREG(file_inode(*dst)->i_mode)) {
+ if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) ||
+ !S_ISREG(file_inode((*dst)->nf_file)->i_mode)) {
status = nfserr_wrong_type;
goto out_put_dst;
}
@@ -1063,9 +1057,9 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
out:
return status;
out_put_dst:
- fput(*dst);
+ nfsd_file_put(*dst);
out_put_src:
- fput(*src);
+ nfsd_file_put(*src);
goto out;
}
@@ -1074,7 +1068,7 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_clone *clone = &u->clone;
- struct file *src, *dst;
+ struct nfsd_file *src, *dst;
__be32 status;
status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src,
@@ -1082,11 +1076,11 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
- status = nfsd4_clone_file_range(src, clone->cl_src_pos,
- dst, clone->cl_dst_pos, clone->cl_count);
+ status = nfsd4_clone_file_range(src->nf_file, clone->cl_src_pos,
+ dst->nf_file, clone->cl_dst_pos, clone->cl_count);
- fput(dst);
- fput(src);
+ nfsd_file_put(dst);
+ nfsd_file_put(src);
out:
return status;
}
@@ -1176,8 +1170,9 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
do {
if (kthread_should_stop())
break;
- bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos,
- copy->file_dst, dst_pos, bytes_total);
+ bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file,
+ src_pos, copy->nf_dst->nf_file, dst_pos,
+ bytes_total);
if (bytes_copied <= 0)
break;
bytes_total -= bytes_copied;
@@ -1204,8 +1199,8 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
status = nfs_ok;
}
- fput(copy->file_src);
- fput(copy->file_dst);
+ nfsd_file_put(copy->nf_src);
+ nfsd_file_put(copy->nf_dst);
return status;
}
@@ -1218,16 +1213,16 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res));
memcpy(&dst->fh, &src->fh, sizeof(src->fh));
dst->cp_clp = src->cp_clp;
- dst->file_dst = get_file(src->file_dst);
- dst->file_src = get_file(src->file_src);
+ dst->nf_dst = nfsd_file_get(src->nf_dst);
+ dst->nf_src = nfsd_file_get(src->nf_src);
memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid));
}
static void cleanup_async_copy(struct nfsd4_copy *copy)
{
nfs4_free_cp_state(copy);
- fput(copy->file_dst);
- fput(copy->file_src);
+ nfsd_file_put(copy->nf_dst);
+ nfsd_file_put(copy->nf_src);
spin_lock(&copy->cp_clp->async_lock);
list_del(&copy->copies);
spin_unlock(&copy->cp_clp->async_lock);
@@ -1264,8 +1259,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_copy *async_copy = NULL;
status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid,
- &copy->file_src, &copy->cp_dst_stateid,
- &copy->file_dst);
+ &copy->nf_src, &copy->cp_dst_stateid,
+ &copy->nf_dst);
if (status)
goto out;
@@ -1347,21 +1342,21 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_fallocate *fallocate, int flags)
{
__be32 status;
- struct file *file;
+ struct nfsd_file *nf;
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&fallocate->falloc_stateid,
- WR_STATE, &file, NULL);
+ WR_STATE, &nf);
if (status != nfs_ok) {
dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
return status;
}
- status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file,
+ status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file,
fallocate->falloc_offset,
fallocate->falloc_length,
flags);
- fput(file);
+ nfsd_file_put(nf);
return status;
}
static __be32
@@ -1406,11 +1401,11 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_seek *seek = &u->seek;
int whence;
__be32 status;
- struct file *file;
+ struct nfsd_file *nf;
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&seek->seek_stateid,
- RD_STATE, &file, NULL);
+ RD_STATE, &nf);
if (status) {
dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
return status;
@@ -1432,14 +1427,14 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* Note: This call does change file->f_pos, but nothing in NFSD
* should ever file->f_pos.
*/
- seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence);
+ seek->seek_pos = vfs_llseek(nf->nf_file, seek->seek_offset, whence);
if (seek->seek_pos < 0)
status = nfserrno(seek->seek_pos);
- else if (seek->seek_pos >= i_size_read(file_inode(file)))
+ else if (seek->seek_pos >= i_size_read(file_inode(nf->nf_file)))
seek->seek_eof = true;
out:
- fput(file);
+ nfsd_file_put(nf);
return status;
}
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 87679557d0d6..cdc75ad4438b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -59,8 +59,13 @@ struct nfsd4_client_tracking_ops {
void (*remove)(struct nfs4_client *);
int (*check)(struct nfs4_client *);
void (*grace_done)(struct nfsd_net *);
+ uint8_t version;
+ size_t msglen;
};
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops;
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2;
+
/* Globals */
static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
@@ -173,6 +178,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp,
const char *dname, int len, struct nfsd_net *nn)
{
struct xdr_netobj name;
+ struct xdr_netobj princhash = { .len = 0, .data = NULL };
struct nfs4_client_reclaim *crp;
name.data = kmemdup(dname, len, GFP_KERNEL);
@@ -182,7 +188,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp,
return;
}
name.len = len;
- crp = nfs4_client_to_reclaim(name, nn);
+ crp = nfs4_client_to_reclaim(name, princhash, nn);
if (!crp) {
kfree(name.data);
return;
@@ -482,6 +488,7 @@ static int
load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
{
struct xdr_netobj name;
+ struct xdr_netobj princhash = { .len = 0, .data = NULL };
if (child->d_name.len != HEXDIR_LEN - 1) {
printk("%s: illegal name %pd in recovery directory\n",
@@ -496,7 +503,7 @@ load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
goto out;
}
name.len = HEXDIR_LEN;
- if (!nfs4_client_to_reclaim(name, nn))
+ if (!nfs4_client_to_reclaim(name, princhash, nn))
kfree(name.data);
out:
return 0;
@@ -718,6 +725,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
.remove = nfsd4_remove_clid_dir,
.check = nfsd4_check_legacy_client,
.grace_done = nfsd4_recdir_purge_old,
+ .version = 1,
+ .msglen = 0,
};
/* Globals */
@@ -731,25 +740,32 @@ struct cld_net {
struct list_head cn_list;
unsigned int cn_xid;
bool cn_has_legacy;
+ struct crypto_shash *cn_tfm;
};
struct cld_upcall {
struct list_head cu_list;
struct cld_net *cu_net;
struct completion cu_done;
- struct cld_msg cu_msg;
+ union {
+ struct cld_msg_hdr cu_hdr;
+ struct cld_msg cu_msg;
+ struct cld_msg_v2 cu_msg_v2;
+ } cu_u;
};
static int
-__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg)
{
int ret;
struct rpc_pipe_msg msg;
- struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_msg);
+ struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u);
+ struct nfsd_net *nn = net_generic(pipe->dentry->d_sb->s_fs_info,
+ nfsd_net_id);
memset(&msg, 0, sizeof(msg));
msg.data = cmsg;
- msg.len = sizeof(*cmsg);
+ msg.len = nn->client_tracking_ops->msglen;
ret = rpc_queue_upcall(pipe, &msg);
if (ret < 0) {
@@ -765,7 +781,7 @@ out:
}
static int
-cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg)
{
int ret;
@@ -781,11 +797,11 @@ cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
}
static ssize_t
-__cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg,
+__cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
struct nfsd_net *nn)
{
- uint8_t cmd;
- struct xdr_netobj name;
+ uint8_t cmd, princhashlen;
+ struct xdr_netobj name, princhash = { .len = 0, .data = NULL };
uint16_t namelen;
struct cld_net *cn = nn->cld_net;
@@ -794,22 +810,48 @@ __cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg,
return -EFAULT;
}
if (cmd == Cld_GraceStart) {
- if (get_user(namelen, &cmsg->cm_u.cm_name.cn_len))
- return -EFAULT;
- name.data = memdup_user(&cmsg->cm_u.cm_name.cn_id, namelen);
- if (IS_ERR_OR_NULL(name.data))
- return -EFAULT;
- name.len = namelen;
+ if (nn->client_tracking_ops->version >= 2) {
+ const struct cld_clntinfo __user *ci;
+
+ ci = &cmsg->cm_u.cm_clntinfo;
+ if (get_user(namelen, &ci->cc_name.cn_len))
+ return -EFAULT;
+ name.data = memdup_user(&ci->cc_name.cn_id, namelen);
+ if (IS_ERR_OR_NULL(name.data))
+ return -EFAULT;
+ name.len = namelen;
+ get_user(princhashlen, &ci->cc_princhash.cp_len);
+ if (princhashlen > 0) {
+ princhash.data = memdup_user(
+ &ci->cc_princhash.cp_data,
+ princhashlen);
+ if (IS_ERR_OR_NULL(princhash.data))
+ return -EFAULT;
+ princhash.len = princhashlen;
+ } else
+ princhash.len = 0;
+ } else {
+ const struct cld_name __user *cnm;
+
+ cnm = &cmsg->cm_u.cm_name;
+ if (get_user(namelen, &cnm->cn_len))
+ return -EFAULT;
+ name.data = memdup_user(&cnm->cn_id, namelen);
+ if (IS_ERR_OR_NULL(name.data))
+ return -EFAULT;
+ name.len = namelen;
+ }
if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) {
name.len = name.len - 5;
memmove(name.data, name.data + 5, name.len);
cn->cn_has_legacy = true;
}
- if (!nfs4_client_to_reclaim(name, nn)) {
+ if (!nfs4_client_to_reclaim(name, princhash, nn)) {
kfree(name.data);
+ kfree(princhash.data);
return -EFAULT;
}
- return sizeof(*cmsg);
+ return nn->client_tracking_ops->msglen;
}
return -EFAULT;
}
@@ -818,21 +860,22 @@ static ssize_t
cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
{
struct cld_upcall *tmp, *cup;
- struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
+ struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src;
+ struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src;
uint32_t xid;
struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
nfsd_net_id);
struct cld_net *cn = nn->cld_net;
int16_t status;
- if (mlen != sizeof(*cmsg)) {
+ if (mlen != nn->client_tracking_ops->msglen) {
dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen,
- sizeof(*cmsg));
+ nn->client_tracking_ops->msglen);
return -EINVAL;
}
/* copy just the xid so we can try to find that */
- if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) {
+ if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) {
dprintk("%s: error when copying xid from userspace", __func__);
return -EFAULT;
}
@@ -842,7 +885,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
* list (for -EINPROGRESS, we just want to make sure the xid is
* valid, not remove the upcall from the list)
*/
- if (get_user(status, &cmsg->cm_status)) {
+ if (get_user(status, &hdr->cm_status)) {
dprintk("%s: error when copying status from userspace", __func__);
return -EFAULT;
}
@@ -851,7 +894,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
cup = NULL;
spin_lock(&cn->cn_lock);
list_for_each_entry(tmp, &cn->cn_list, cu_list) {
- if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) {
+ if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) {
cup = tmp;
if (status != -EINPROGRESS)
list_del_init(&cup->cu_list);
@@ -869,7 +912,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
if (status == -EINPROGRESS)
return __cld_pipe_inprogress_downcall(cmsg, nn);
- if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
+ if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0)
return -EFAULT;
complete(&cup->cu_done);
@@ -881,7 +924,7 @@ cld_pipe_destroy_msg(struct rpc_pipe_msg *msg)
{
struct cld_msg *cmsg = msg->data;
struct cld_upcall *cup = container_of(cmsg, struct cld_upcall,
- cu_msg);
+ cu_u.cu_msg);
/* errno >= 0 means we got a downcall */
if (msg->errno >= 0)
@@ -1007,14 +1050,17 @@ nfsd4_remove_cld_pipe(struct net *net)
nfsd4_cld_unregister_net(net, cn->cn_pipe);
rpc_destroy_pipe_data(cn->cn_pipe);
+ if (cn->cn_tfm)
+ crypto_free_shash(cn->cn_tfm);
kfree(nn->cld_net);
nn->cld_net = NULL;
}
static struct cld_upcall *
-alloc_cld_upcall(struct cld_net *cn)
+alloc_cld_upcall(struct nfsd_net *nn)
{
struct cld_upcall *new, *tmp;
+ struct cld_net *cn = nn->cld_net;
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new)
@@ -1024,20 +1070,20 @@ alloc_cld_upcall(struct cld_net *cn)
restart_search:
spin_lock(&cn->cn_lock);
list_for_each_entry(tmp, &cn->cn_list, cu_list) {
- if (tmp->cu_msg.cm_xid == cn->cn_xid) {
+ if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) {
cn->cn_xid++;
spin_unlock(&cn->cn_lock);
goto restart_search;
}
}
init_completion(&new->cu_done);
- new->cu_msg.cm_vers = CLD_UPCALL_VERSION;
- put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid);
+ new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version;
+ put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid);
new->cu_net = cn;
list_add(&new->cu_list, &cn->cn_list);
spin_unlock(&cn->cn_lock);
- dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid);
+ dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid);
return new;
}
@@ -1066,20 +1112,20 @@ nfsd4_cld_create(struct nfs4_client *clp)
if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
return;
- cup = alloc_cld_upcall(cn);
+ cup = alloc_cld_upcall(nn);
if (!cup) {
ret = -ENOMEM;
goto out_err;
}
- cup->cu_msg.cm_cmd = Cld_Create;
- cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
- memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+ cup->cu_u.cu_msg.cm_cmd = Cld_Create;
+ cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+ memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
clp->cl_name.len);
- ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
if (!ret) {
- ret = cup->cu_msg.cm_status;
+ ret = cup->cu_u.cu_msg.cm_status;
set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
}
@@ -1092,6 +1138,75 @@ out_err:
/* Ask daemon to create a new record */
static void
+nfsd4_cld_create_v2(struct nfs4_client *clp)
+{
+ int ret;
+ struct cld_upcall *cup;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+ struct cld_msg_v2 *cmsg;
+ struct crypto_shash *tfm = cn->cn_tfm;
+ struct xdr_netobj cksum;
+ char *principal = NULL;
+ SHASH_DESC_ON_STACK(desc, tfm);
+
+ /* Don't upcall if it's already stored */
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return;
+
+ cup = alloc_cld_upcall(nn);
+ if (!cup) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ cmsg = &cup->cu_u.cu_msg_v2;
+ cmsg->cm_cmd = Cld_Create;
+ cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len;
+ memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data,
+ clp->cl_name.len);
+ if (clp->cl_cred.cr_raw_principal)
+ principal = clp->cl_cred.cr_raw_principal;
+ else if (clp->cl_cred.cr_principal)
+ principal = clp->cl_cred.cr_principal;
+ if (principal) {
+ desc->tfm = tfm;
+ cksum.len = crypto_shash_digestsize(tfm);
+ cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+ if (cksum.data == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = crypto_shash_digest(desc, principal, strlen(principal),
+ cksum.data);
+ shash_desc_zero(desc);
+ if (ret) {
+ kfree(cksum.data);
+ goto out;
+ }
+ cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len;
+ memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data,
+ cksum.data, cksum.len);
+ kfree(cksum.data);
+ } else
+ cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0;
+
+ ret = cld_pipe_upcall(cn->cn_pipe, cmsg);
+ if (!ret) {
+ ret = cmsg->cm_status;
+ set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ }
+
+out:
+ free_cld_upcall(cup);
+out_err:
+ if (ret)
+ pr_err("NFSD: Unable to create client record on stable storage: %d\n",
+ ret);
+}
+
+/* Ask daemon to create a new record */
+static void
nfsd4_cld_remove(struct nfs4_client *clp)
{
int ret;
@@ -1103,20 +1218,20 @@ nfsd4_cld_remove(struct nfs4_client *clp)
if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
return;
- cup = alloc_cld_upcall(cn);
+ cup = alloc_cld_upcall(nn);
if (!cup) {
ret = -ENOMEM;
goto out_err;
}
- cup->cu_msg.cm_cmd = Cld_Remove;
- cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
- memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+ cup->cu_u.cu_msg.cm_cmd = Cld_Remove;
+ cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+ memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
clp->cl_name.len);
- ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
if (!ret) {
- ret = cup->cu_msg.cm_status;
+ ret = cup->cu_u.cu_msg.cm_status;
clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
}
@@ -1145,21 +1260,21 @@ nfsd4_cld_check_v0(struct nfs4_client *clp)
if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
return 0;
- cup = alloc_cld_upcall(cn);
+ cup = alloc_cld_upcall(nn);
if (!cup) {
printk(KERN_ERR "NFSD: Unable to check client record on "
"stable storage: %d\n", -ENOMEM);
return -ENOMEM;
}
- cup->cu_msg.cm_cmd = Cld_Check;
- cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
- memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+ cup->cu_u.cu_msg.cm_cmd = Cld_Check;
+ cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+ memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
clp->cl_name.len);
- ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
if (!ret) {
- ret = cup->cu_msg.cm_status;
+ ret = cup->cu_u.cu_msg.cm_status;
set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
}
@@ -1217,22 +1332,95 @@ found:
}
static int
+nfsd4_cld_check_v2(struct nfs4_client *clp)
+{
+ struct nfs4_client_reclaim *crp;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ struct cld_net *cn = nn->cld_net;
+ int status;
+ char dname[HEXDIR_LEN];
+ struct xdr_netobj name;
+ struct crypto_shash *tfm = cn->cn_tfm;
+ struct xdr_netobj cksum;
+ char *principal = NULL;
+ SHASH_DESC_ON_STACK(desc, tfm);
+
+ /* did we already find that this client is stable? */
+ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ return 0;
+
+ /* look for it in the reclaim hashtable otherwise */
+ crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
+ if (crp)
+ goto found;
+
+ if (cn->cn_has_legacy) {
+ status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+ if (status)
+ return -ENOENT;
+
+ name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL);
+ if (!name.data) {
+ dprintk("%s: failed to allocate memory for name.data\n",
+ __func__);
+ return -ENOENT;
+ }
+ name.len = HEXDIR_LEN;
+ crp = nfsd4_find_reclaim_client(name, nn);
+ kfree(name.data);
+ if (crp)
+ goto found;
+
+ }
+ return -ENOENT;
+found:
+ if (crp->cr_princhash.len) {
+ if (clp->cl_cred.cr_raw_principal)
+ principal = clp->cl_cred.cr_raw_principal;
+ else if (clp->cl_cred.cr_principal)
+ principal = clp->cl_cred.cr_principal;
+ if (principal == NULL)
+ return -ENOENT;
+ desc->tfm = tfm;
+ cksum.len = crypto_shash_digestsize(tfm);
+ cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+ if (cksum.data == NULL)
+ return -ENOENT;
+ status = crypto_shash_digest(desc, principal, strlen(principal),
+ cksum.data);
+ shash_desc_zero(desc);
+ if (status) {
+ kfree(cksum.data);
+ return -ENOENT;
+ }
+ if (memcmp(crp->cr_princhash.data, cksum.data,
+ crp->cr_princhash.len)) {
+ kfree(cksum.data);
+ return -ENOENT;
+ }
+ kfree(cksum.data);
+ }
+ crp->cr_clp = clp;
+ return 0;
+}
+
+static int
nfsd4_cld_grace_start(struct nfsd_net *nn)
{
int ret;
struct cld_upcall *cup;
struct cld_net *cn = nn->cld_net;
- cup = alloc_cld_upcall(cn);
+ cup = alloc_cld_upcall(nn);
if (!cup) {
ret = -ENOMEM;
goto out_err;
}
- cup->cu_msg.cm_cmd = Cld_GraceStart;
- ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+ cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart;
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
if (!ret)
- ret = cup->cu_msg.cm_status;
+ ret = cup->cu_u.cu_msg.cm_status;
free_cld_upcall(cup);
out_err:
@@ -1250,17 +1438,17 @@ nfsd4_cld_grace_done_v0(struct nfsd_net *nn)
struct cld_upcall *cup;
struct cld_net *cn = nn->cld_net;
- cup = alloc_cld_upcall(cn);
+ cup = alloc_cld_upcall(nn);
if (!cup) {
ret = -ENOMEM;
goto out_err;
}
- cup->cu_msg.cm_cmd = Cld_GraceDone;
- cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
- ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+ cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone;
+ cup->cu_u.cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
if (!ret)
- ret = cup->cu_msg.cm_status;
+ ret = cup->cu_u.cu_msg.cm_status;
free_cld_upcall(cup);
out_err:
@@ -1279,16 +1467,16 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
struct cld_upcall *cup;
struct cld_net *cn = nn->cld_net;
- cup = alloc_cld_upcall(cn);
+ cup = alloc_cld_upcall(nn);
if (!cup) {
ret = -ENOMEM;
goto out_err;
}
- cup->cu_msg.cm_cmd = Cld_GraceDone;
- ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+ cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone;
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
if (!ret)
- ret = cup->cu_msg.cm_status;
+ ret = cup->cu_u.cu_msg.cm_status;
free_cld_upcall(cup);
out_err:
@@ -1337,6 +1525,53 @@ cld_running(struct nfsd_net *nn)
}
static int
+nfsd4_cld_get_version(struct nfsd_net *nn)
+{
+ int ret = 0;
+ struct cld_upcall *cup;
+ struct cld_net *cn = nn->cld_net;
+ uint8_t version;
+
+ cup = alloc_cld_upcall(nn);
+ if (!cup) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion;
+ ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
+ if (!ret) {
+ ret = cup->cu_u.cu_msg.cm_status;
+ if (ret)
+ goto out_free;
+ version = cup->cu_u.cu_msg.cm_u.cm_version;
+ dprintk("%s: userspace returned version %u\n",
+ __func__, version);
+ if (version < 1)
+ version = 1;
+ else if (version > CLD_UPCALL_VERSION)
+ version = CLD_UPCALL_VERSION;
+
+ switch (version) {
+ case 1:
+ nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+ break;
+ case 2:
+ nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2;
+ break;
+ default:
+ break;
+ }
+ }
+out_free:
+ free_cld_upcall(cup);
+out_err:
+ if (ret)
+ dprintk("%s: Unable to get version from userspace: %d\n",
+ __func__, ret);
+ return ret;
+}
+
+static int
nfsd4_cld_tracking_init(struct net *net)
{
int status;
@@ -1351,6 +1586,11 @@ nfsd4_cld_tracking_init(struct net *net)
status = __nfsd4_init_cld_pipe(net);
if (status)
goto err_shutdown;
+ nn->cld_net->cn_tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(nn->cld_net->cn_tfm)) {
+ status = PTR_ERR(nn->cld_net->cn_tfm);
+ goto err_remove;
+ }
/*
* rpc pipe upcalls take 30 seconds to time out, so we don't want to
@@ -1368,10 +1608,14 @@ nfsd4_cld_tracking_init(struct net *net)
goto err_remove;
}
+ status = nfsd4_cld_get_version(nn);
+ if (status == -EOPNOTSUPP)
+ pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n");
+
status = nfsd4_cld_grace_start(nn);
if (status) {
if (status == -EOPNOTSUPP)
- printk(KERN_WARNING "NFSD: Please upgrade nfsdcld.\n");
+ pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n");
nfs4_release_reclaim(nn);
goto err_remove;
} else
@@ -1403,6 +1647,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = {
.remove = nfsd4_cld_remove,
.check = nfsd4_cld_check_v0,
.grace_done = nfsd4_cld_grace_done_v0,
+ .version = 1,
+ .msglen = sizeof(struct cld_msg),
};
/* For newer nfsdcld's */
@@ -1413,6 +1659,20 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
.remove = nfsd4_cld_remove,
.check = nfsd4_cld_check,
.grace_done = nfsd4_cld_grace_done,
+ .version = 1,
+ .msglen = sizeof(struct cld_msg),
+};
+
+/* v2 create/check ops include the principal, if available */
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = {
+ .init = nfsd4_cld_tracking_init,
+ .exit = nfsd4_cld_tracking_exit,
+ .create = nfsd4_cld_create_v2,
+ .remove = nfsd4_cld_remove,
+ .check = nfsd4_cld_check_v2,
+ .grace_done = nfsd4_cld_grace_done,
+ .version = 2,
+ .msglen = sizeof(struct cld_msg_v2),
};
/* upcall via usermodehelper */
@@ -1760,6 +2020,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
.remove = nfsd4_umh_cltrack_remove,
.check = nfsd4_umh_cltrack_check,
.grace_done = nfsd4_umh_cltrack_grace_done,
+ .version = 1,
+ .msglen = 0,
};
int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7857942c5ca6..c65aeaa812d4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -50,6 +50,7 @@
#include "netns.h"
#include "pnfs.h"
+#include "filecache.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -429,18 +430,18 @@ put_nfs4_file(struct nfs4_file *fi)
}
}
-static struct file *
+static struct nfsd_file *
__nfs4_get_fd(struct nfs4_file *f, int oflag)
{
if (f->fi_fds[oflag])
- return get_file(f->fi_fds[oflag]);
+ return nfsd_file_get(f->fi_fds[oflag]);
return NULL;
}
-static struct file *
+static struct nfsd_file *
find_writeable_file_locked(struct nfs4_file *f)
{
- struct file *ret;
+ struct nfsd_file *ret;
lockdep_assert_held(&f->fi_lock);
@@ -450,10 +451,10 @@ find_writeable_file_locked(struct nfs4_file *f)
return ret;
}
-static struct file *
+static struct nfsd_file *
find_writeable_file(struct nfs4_file *f)
{
- struct file *ret;
+ struct nfsd_file *ret;
spin_lock(&f->fi_lock);
ret = find_writeable_file_locked(f);
@@ -462,9 +463,10 @@ find_writeable_file(struct nfs4_file *f)
return ret;
}
-static struct file *find_readable_file_locked(struct nfs4_file *f)
+static struct nfsd_file *
+find_readable_file_locked(struct nfs4_file *f)
{
- struct file *ret;
+ struct nfsd_file *ret;
lockdep_assert_held(&f->fi_lock);
@@ -474,10 +476,10 @@ static struct file *find_readable_file_locked(struct nfs4_file *f)
return ret;
}
-static struct file *
+static struct nfsd_file *
find_readable_file(struct nfs4_file *f)
{
- struct file *ret;
+ struct nfsd_file *ret;
spin_lock(&f->fi_lock);
ret = find_readable_file_locked(f);
@@ -486,10 +488,10 @@ find_readable_file(struct nfs4_file *f)
return ret;
}
-struct file *
+struct nfsd_file *
find_any_file(struct nfs4_file *f)
{
- struct file *ret;
+ struct nfsd_file *ret;
spin_lock(&f->fi_lock);
ret = __nfs4_get_fd(f, O_RDWR);
@@ -590,17 +592,17 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
might_lock(&fp->fi_lock);
if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) {
- struct file *f1 = NULL;
- struct file *f2 = NULL;
+ struct nfsd_file *f1 = NULL;
+ struct nfsd_file *f2 = NULL;
swap(f1, fp->fi_fds[oflag]);
if (atomic_read(&fp->fi_access[1 - oflag]) == 0)
swap(f2, fp->fi_fds[O_RDWR]);
spin_unlock(&fp->fi_lock);
if (f1)
- fput(f1);
+ nfsd_file_put(f1);
if (f2)
- fput(f2);
+ nfsd_file_put(f2);
}
}
@@ -933,25 +935,25 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
static void put_deleg_file(struct nfs4_file *fp)
{
- struct file *filp = NULL;
+ struct nfsd_file *nf = NULL;
spin_lock(&fp->fi_lock);
if (--fp->fi_delegees == 0)
- swap(filp, fp->fi_deleg_file);
+ swap(nf, fp->fi_deleg_file);
spin_unlock(&fp->fi_lock);
- if (filp)
- fput(filp);
+ if (nf)
+ nfsd_file_put(nf);
}
static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_stid.sc_file;
- struct file *filp = fp->fi_deleg_file;
+ struct nfsd_file *nf = fp->fi_deleg_file;
WARN_ON_ONCE(!fp->fi_delegees);
- vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp);
+ vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp);
put_deleg_file(fp);
}
@@ -1289,11 +1291,14 @@ static void nfs4_free_lock_stateid(struct nfs4_stid *stid)
{
struct nfs4_ol_stateid *stp = openlockstateid(stid);
struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
- struct file *file;
+ struct nfsd_file *nf;
- file = find_any_file(stp->st_stid.sc_file);
- if (file)
- filp_close(file, (fl_owner_t)lo);
+ nf = find_any_file(stp->st_stid.sc_file);
+ if (nf) {
+ get_file(nf->nf_file);
+ filp_close(nf->nf_file, (fl_owner_t)lo);
+ nfsd_file_put(nf);
+ }
nfs4_free_ol_stateid(stid);
}
@@ -1563,21 +1568,39 @@ static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca)
* re-negotiate active sessions and reduce their slot usage to make
* room for new connections. For now we just fail the create session.
*/
-static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
+static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn)
{
u32 slotsize = slot_bytes(ca);
u32 num = ca->maxreqs;
unsigned long avail, total_avail;
+ unsigned int scale_factor;
spin_lock(&nfsd_drc_lock);
- total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+ if (nfsd_drc_max_mem > nfsd_drc_mem_used)
+ total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+ else
+ /* We have handed out more space than we chose in
+ * set_max_drc() to allow. That isn't really a
+ * problem as long as that doesn't make us think we
+ * have lots more due to integer overflow.
+ */
+ total_avail = 0;
avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail);
/*
- * Never use more than a third of the remaining memory,
- * unless it's the only way to give this client a slot:
+ * Never use more than a fraction of the remaining memory,
+ * unless it's the only way to give this client a slot.
+ * The chosen fraction is either 1/8 or 1/number of threads,
+ * whichever is smaller. This ensures there are adequate
+ * slots to support multiple clients per thread.
+ * Give the client one slot even if that would require
+ * over-allocation--it is better than failure.
*/
- avail = clamp_t(unsigned long, avail, slotsize, total_avail/3);
+ scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads);
+
+ avail = clamp_t(unsigned long, avail, slotsize,
+ total_avail/scale_factor);
num = min_t(int, num, avail / slotsize);
+ num = max_t(int, num, 1);
nfsd_drc_mem_used += num * slotsize;
spin_unlock(&nfsd_drc_lock);
@@ -2323,9 +2346,9 @@ static void states_stop(struct seq_file *s, void *v)
spin_unlock(&clp->cl_lock);
}
-static void nfs4_show_superblock(struct seq_file *s, struct file *f)
+static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
{
- struct inode *inode = file_inode(f);
+ struct inode *inode = f->nf_inode;
seq_printf(s, "superblock: \"%02x:%02x:%ld\"",
MAJOR(inode->i_sb->s_dev),
@@ -2343,7 +2366,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
{
struct nfs4_ol_stateid *ols;
struct nfs4_file *nf;
- struct file *file;
+ struct nfsd_file *file;
struct nfs4_stateowner *oo;
unsigned int access, deny;
@@ -2370,7 +2393,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_owner(s, oo);
seq_printf(s, " }\n");
- fput(file);
+ nfsd_file_put(file);
return 0;
}
@@ -2379,7 +2402,7 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
{
struct nfs4_ol_stateid *ols;
struct nfs4_file *nf;
- struct file *file;
+ struct nfsd_file *file;
struct nfs4_stateowner *oo;
ols = openlockstateid(st);
@@ -2401,7 +2424,7 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_owner(s, oo);
seq_printf(s, " }\n");
- fput(file);
+ nfsd_file_put(file);
return 0;
}
@@ -2410,7 +2433,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
{
struct nfs4_delegation *ds;
struct nfs4_file *nf;
- struct file *file;
+ struct nfsd_file *file;
ds = delegstateid(st);
nf = st->sc_file;
@@ -2433,7 +2456,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st)
{
struct nfs4_layout_stateid *ls;
- struct file *file;
+ struct nfsd_file *file;
ls = container_of(st, struct nfs4_layout_stateid, ls_stid);
file = ls->ls_file;
@@ -3169,10 +3192,10 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
* performance. When short on memory we therefore prefer to
* decrease number of slots instead of their size. Clients that
* request larger slots than they need will get poor results:
+ * Note that we always allow at least one slot, because our
+ * accounting is soft and provides no guarantees either way.
*/
- ca->maxreqs = nfsd4_get_drc_mem(ca);
- if (!ca->maxreqs)
- return nfserr_jukebox;
+ ca->maxreqs = nfsd4_get_drc_mem(ca, nn);
return nfs_ok;
}
@@ -4651,7 +4674,7 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
struct nfsd4_open *open)
{
- struct file *filp = NULL;
+ struct nfsd_file *nf = NULL;
__be32 status;
int oflag = nfs4_access_to_omode(open->op_share_access);
int access = nfs4_access_to_access(open->op_share_access);
@@ -4687,18 +4710,18 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
if (!fp->fi_fds[oflag]) {
spin_unlock(&fp->fi_lock);
- status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp);
+ status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
if (status)
goto out_put_access;
spin_lock(&fp->fi_lock);
if (!fp->fi_fds[oflag]) {
- fp->fi_fds[oflag] = filp;
- filp = NULL;
+ fp->fi_fds[oflag] = nf;
+ nf = NULL;
}
}
spin_unlock(&fp->fi_lock);
- if (filp)
- fput(filp);
+ if (nf)
+ nfsd_file_put(nf);
status = nfsd4_truncate(rqstp, cur_fh, open);
if (status)
@@ -4767,7 +4790,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
fl->fl_end = OFFSET_MAX;
fl->fl_owner = (fl_owner_t)dp;
fl->fl_pid = current->tgid;
- fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file;
+ fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
return fl;
}
@@ -4777,7 +4800,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
{
int status = 0;
struct nfs4_delegation *dp;
- struct file *filp;
+ struct nfsd_file *nf;
struct file_lock *fl;
/*
@@ -4788,8 +4811,8 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
if (fp->fi_had_conflict)
return ERR_PTR(-EAGAIN);
- filp = find_readable_file(fp);
- if (!filp) {
+ nf = find_readable_file(fp);
+ if (!nf) {
/* We should always have a readable file here */
WARN_ON_ONCE(1);
return ERR_PTR(-EBADF);
@@ -4799,17 +4822,17 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
if (nfs4_delegation_exists(clp, fp))
status = -EAGAIN;
else if (!fp->fi_deleg_file) {
- fp->fi_deleg_file = filp;
+ fp->fi_deleg_file = nf;
/* increment early to prevent fi_deleg_file from being
* cleared */
fp->fi_delegees = 1;
- filp = NULL;
+ nf = NULL;
} else
fp->fi_delegees++;
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
- if (filp)
- fput(filp);
+ if (nf)
+ nfsd_file_put(nf);
if (status)
return ERR_PTR(status);
@@ -4822,7 +4845,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
if (!fl)
goto out_clnt_odstate;
- status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL);
+ status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL);
if (fl)
locks_free_lock(fl);
if (status)
@@ -4842,7 +4865,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
return dp;
out_unlock:
- vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp);
+ vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
out_clnt_odstate:
put_clnt_odstate(dp->dl_clnt_odstate);
nfs4_put_stid(&dp->dl_stid);
@@ -5513,7 +5536,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
return nfs_ok;
}
-static struct file *
+static struct nfsd_file *
nfs4_find_file(struct nfs4_stid *s, int flags)
{
if (!s)
@@ -5523,7 +5546,7 @@ nfs4_find_file(struct nfs4_stid *s, int flags)
case NFS4_DELEG_STID:
if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file))
return NULL;
- return get_file(s->sc_file->fi_deleg_file);
+ return nfsd_file_get(s->sc_file->fi_deleg_file);
case NFS4_OPEN_STID:
case NFS4_LOCK_STID:
if (flags & RD_STATE)
@@ -5549,32 +5572,28 @@ nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags)
static __be32
nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
- struct file **filpp, bool *tmp_file, int flags)
+ struct nfsd_file **nfp, int flags)
{
int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE;
- struct file *file;
+ struct nfsd_file *nf;
__be32 status;
- file = nfs4_find_file(s, flags);
- if (file) {
+ nf = nfs4_find_file(s, flags);
+ if (nf) {
status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
acc | NFSD_MAY_OWNER_OVERRIDE);
if (status) {
- fput(file);
- return status;
+ nfsd_file_put(nf);
+ goto out;
}
-
- *filpp = file;
} else {
- status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp);
+ status = nfsd_file_acquire(rqstp, fhp, acc, &nf);
if (status)
return status;
-
- if (tmp_file)
- *tmp_file = true;
}
-
- return 0;
+ *nfp = nf;
+out:
+ return status;
}
/*
@@ -5583,7 +5602,7 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
__be32
nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
- stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file)
+ stateid_t *stateid, int flags, struct nfsd_file **nfp)
{
struct inode *ino = d_inode(fhp->fh_dentry);
struct net *net = SVC_NET(rqstp);
@@ -5591,10 +5610,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
struct nfs4_stid *s = NULL;
__be32 status;
- if (filpp)
- *filpp = NULL;
- if (tmp_file)
- *tmp_file = false;
+ if (nfp)
+ *nfp = NULL;
if (grace_disallows_io(net, ino))
return nfserr_grace;
@@ -5631,8 +5648,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
status = nfs4_check_fh(fhp, s);
done:
- if (!status && filpp)
- status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags);
+ if (status == nfs_ok && nfp)
+ status = nfs4_check_file(rqstp, fhp, s, nfp, flags);
out:
if (s)
nfs4_put_stid(s);
@@ -6392,7 +6409,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfs4_ol_stateid *lock_stp = NULL;
struct nfs4_ol_stateid *open_stp = NULL;
struct nfs4_file *fp;
- struct file *filp = NULL;
+ struct nfsd_file *nf = NULL;
struct nfsd4_blocked_lock *nbl = NULL;
struct file_lock *file_lock = NULL;
struct file_lock *conflock = NULL;
@@ -6474,8 +6491,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Fallthrough */
case NFS4_READ_LT:
spin_lock(&fp->fi_lock);
- filp = find_readable_file_locked(fp);
- if (filp)
+ nf = find_readable_file_locked(fp);
+ if (nf)
get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
spin_unlock(&fp->fi_lock);
fl_type = F_RDLCK;
@@ -6486,8 +6503,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Fallthrough */
case NFS4_WRITE_LT:
spin_lock(&fp->fi_lock);
- filp = find_writeable_file_locked(fp);
- if (filp)
+ nf = find_writeable_file_locked(fp);
+ if (nf)
get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
spin_unlock(&fp->fi_lock);
fl_type = F_WRLCK;
@@ -6497,7 +6514,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- if (!filp) {
+ if (!nf) {
status = nfserr_openmode;
goto out;
}
@@ -6513,7 +6530,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
file_lock->fl_type = fl_type;
file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
file_lock->fl_pid = current->tgid;
- file_lock->fl_file = filp;
+ file_lock->fl_file = nf->nf_file;
file_lock->fl_flags = fl_flags;
file_lock->fl_lmops = &nfsd_posix_mng_ops;
file_lock->fl_start = lock->lk_offset;
@@ -6535,7 +6552,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
spin_unlock(&nn->blocked_locks_lock);
}
- err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
+ err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock);
switch (err) {
case 0: /* success! */
nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
@@ -6570,8 +6587,8 @@ out:
}
free_blocked_lock(nbl);
}
- if (filp)
- fput(filp);
+ if (nf)
+ nfsd_file_put(nf);
if (lock_stp) {
/* Bump seqid manually if the 4.0 replay owner is openowner */
if (cstate->replay_owner &&
@@ -6606,11 +6623,11 @@ out:
*/
static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
{
- struct file *file;
- __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+ struct nfsd_file *nf;
+ __be32 err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
if (!err) {
- err = nfserrno(vfs_test_lock(file, lock));
- fput(file);
+ err = nfserrno(vfs_test_lock(nf->nf_file, lock));
+ nfsd_file_put(nf);
}
return err;
}
@@ -6698,7 +6715,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct nfsd4_locku *locku = &u->locku;
struct nfs4_ol_stateid *stp;
- struct file *filp = NULL;
+ struct nfsd_file *nf = NULL;
struct file_lock *file_lock = NULL;
__be32 status;
int err;
@@ -6716,8 +6733,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&stp, nn);
if (status)
goto out;
- filp = find_any_file(stp->st_stid.sc_file);
- if (!filp) {
+ nf = find_any_file(stp->st_stid.sc_file);
+ if (!nf) {
status = nfserr_lock_range;
goto put_stateid;
}
@@ -6725,13 +6742,13 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!file_lock) {
dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
status = nfserr_jukebox;
- goto fput;
+ goto put_file;
}
file_lock->fl_type = F_UNLCK;
file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
file_lock->fl_pid = current->tgid;
- file_lock->fl_file = filp;
+ file_lock->fl_file = nf->nf_file;
file_lock->fl_flags = FL_POSIX;
file_lock->fl_lmops = &nfsd_posix_mng_ops;
file_lock->fl_start = locku->lu_offset;
@@ -6740,14 +6757,14 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
locku->lu_length);
nfs4_transform_lock_offset(file_lock);
- err = vfs_lock_file(filp, F_SETLK, file_lock, NULL);
+ err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL);
if (err) {
dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
goto out_nfserr;
}
nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid);
-fput:
- fput(filp);
+put_file:
+ nfsd_file_put(nf);
put_stateid:
mutex_unlock(&stp->st_mutex);
nfs4_put_stid(&stp->st_stid);
@@ -6759,7 +6776,7 @@ out:
out_nfserr:
status = nfserrno(err);
- goto fput;
+ goto put_file;
}
/*
@@ -6772,17 +6789,17 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
{
struct file_lock *fl;
int status = false;
- struct file *filp = find_any_file(fp);
+ struct nfsd_file *nf = find_any_file(fp);
struct inode *inode;
struct file_lock_context *flctx;
- if (!filp) {
+ if (!nf) {
/* Any valid lock stateid should have some sort of access */
WARN_ON_ONCE(1);
return status;
}
- inode = locks_inode(filp);
+ inode = locks_inode(nf->nf_file);
flctx = inode->i_flctx;
if (flctx && !list_empty_careful(&flctx->flc_posix)) {
@@ -6795,7 +6812,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
}
spin_unlock(&flctx->flc_lock);
}
- fput(filp);
+ nfsd_file_put(nf);
return status;
}
@@ -6888,7 +6905,8 @@ nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn)
* will be freed in nfs4_remove_reclaim_record in the normal case).
*/
struct nfs4_client_reclaim *
-nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn)
+nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash,
+ struct nfsd_net *nn)
{
unsigned int strhashval;
struct nfs4_client_reclaim *crp;
@@ -6901,6 +6919,8 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn)
list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]);
crp->cr_name.data = name.data;
crp->cr_name.len = name.len;
+ crp->cr_princhash.data = princhash.data;
+ crp->cr_princhash.len = princhash.len;
crp->cr_clp = NULL;
nn->reclaim_str_hashtbl_size++;
}
@@ -6912,6 +6932,7 @@ nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn)
{
list_del(&crp->cr_strhash);
kfree(crp->cr_name.data);
+ kfree(crp->cr_princhash.data);
kfree(crp);
nn->reclaim_str_hashtbl_size--;
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 442811809f3d..533d0fc3c96b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -49,6 +49,7 @@
#include "cache.h"
#include "netns.h"
#include "pnfs.h"
+#include "filecache.h"
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#include <linux/security.h>
@@ -203,6 +204,13 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
return p;
}
+static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp)
+{
+ unsigned int this = (char *)argp->end - (char *)argp->p;
+
+ return this + argp->pagelen;
+}
+
static int zero_clientid(clientid_t *clid)
{
return (clid->cl_boot == 0) && (clid->cl_id == 0);
@@ -211,10 +219,10 @@ static int zero_clientid(clientid_t *clid)
/**
* svcxdr_tmpalloc - allocate memory to be freed after compound processing
* @argp: NFSv4 compound argument structure
- * @p: pointer to be freed (with kfree())
+ * @len: length of buffer to allocate
*
- * Marks @p to be freed when processing the compound operation
- * described in @argp finishes.
+ * Allocates a buffer of size @len to be freed when processing the compound
+ * operation described in @argp finishes.
*/
static void *
svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len)
@@ -347,7 +355,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
READ_BUF(4); len += 4;
nace = be32_to_cpup(p++);
- if (nace > NFS4_ACL_MAX)
+ if (nace > compoundargs_bytes_left(argp)/20)
+ /*
+ * Even with 4-byte names there wouldn't be
+ * space for that many aces; something fishy is
+ * going on:
+ */
return nfserr_fbig;
*acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace));
@@ -1418,7 +1431,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
struct nfsd4_create_session *sess)
{
DECODE_HEAD;
- u32 dummy;
READ_BUF(16);
COPYMEM(&sess->clientid, 8);
@@ -1427,7 +1439,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
/* Fore channel attrs */
READ_BUF(28);
- dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+ p++; /* headerpadsz is always 0 */
sess->fore_channel.maxreq_sz = be32_to_cpup(p++);
sess->fore_channel.maxresp_sz = be32_to_cpup(p++);
sess->fore_channel.maxresp_cached = be32_to_cpup(p++);
@@ -1444,7 +1456,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
/* Back channel attrs */
READ_BUF(28);
- dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+ p++; /* headerpadsz is always 0 */
sess->back_channel.maxreq_sz = be32_to_cpup(p++);
sess->back_channel.maxresp_sz = be32_to_cpup(p++);
sess->back_channel.maxresp_cached = be32_to_cpup(p++);
@@ -1736,7 +1748,6 @@ static __be32
nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
{
DECODE_HEAD;
- unsigned int tmp;
status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid);
if (status)
@@ -1751,7 +1762,7 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
p = xdr_decode_hyper(p, &copy->cp_count);
p++; /* ca_consecutive: we always do consecutive copies */
copy->cp_synchronous = be32_to_cpup(p++);
- tmp = be32_to_cpup(p); /* Source server list not supported */
+ /* tmp = be32_to_cpup(p); Source server list not supported */
DECODE_TAIL;
}
@@ -3217,9 +3228,8 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
if (!p)
return nfserr_resource;
encode_cinfo(p, &create->cr_cinfo);
- nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
+ return nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
create->cr_bmval[1], create->cr_bmval[2]);
- return 0;
}
static __be32
@@ -3462,7 +3472,7 @@ static __be32 nfsd4_encode_splice_read(
len = maxcount;
nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
- file, read->rd_offset, &maxcount);
+ file, read->rd_offset, &maxcount, &eof);
read->rd_length = maxcount;
if (nfserr) {
/*
@@ -3474,9 +3484,6 @@ static __be32 nfsd4_encode_splice_read(
return nfserr;
}
- eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
- d_inode(read->rd_fhp->fh_dentry)->i_size);
-
*(p++) = htonl(eof);
*(p++) = htonl(maxcount);
@@ -3547,15 +3554,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
len = maxcount;
nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
- resp->rqstp->rq_vec, read->rd_vlen, &maxcount);
+ resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
+ &eof);
read->rd_length = maxcount;
if (nfserr)
return nfserr;
xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
- eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
- d_inode(read->rd_fhp->fh_dentry)->i_size);
-
tmp = htonl(eof);
write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
tmp = htonl(maxcount);
@@ -3574,11 +3579,14 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
{
unsigned long maxcount;
struct xdr_stream *xdr = &resp->xdr;
- struct file *file = read->rd_filp;
+ struct file *file;
int starting_len = xdr->buf->len;
- struct raparms *ra = NULL;
__be32 *p;
+ if (nfserr)
+ return nfserr;
+ file = read->rd_nf->nf_file;
+
p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
if (!p) {
WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
@@ -3596,18 +3604,12 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
(xdr->buf->buflen - xdr->buf->len));
maxcount = min_t(unsigned long, maxcount, read->rd_length);
- if (read->rd_tmp_file)
- ra = nfsd_init_raparms(file);
-
if (file->f_op->splice_read &&
test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
else
nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
- if (ra)
- nfsd_put_raparams(file, ra);
-
if (nfserr)
xdr_truncate_encode(xdr, starting_len);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 2c215171c0eb..11b42c523f04 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1476,6 +1476,7 @@ static __net_init int nfsd_init_net(struct net *net)
atomic_set(&nn->ntf_refcnt, 0);
init_waitqueue_head(&nn->ntf_wq);
+ seqlock_init(&nn->boot_lock);
mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL);
if (IS_ERR(mnt)) {
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0d20fd161225..c83ddac22f38 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -172,6 +172,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
struct nfsd_readargs *argp = rqstp->rq_argp;
struct nfsd_readres *resp = rqstp->rq_resp;
__be32 nfserr;
+ u32 eof;
dprintk("nfsd: READ %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
@@ -195,7 +196,8 @@ nfsd_proc_read(struct svc_rqst *rqstp)
nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
argp->offset,
rqstp->rq_vec, argp->vlen,
- &resp->count);
+ &resp->count,
+ &eof);
if (nfserr) return nfserr;
return fh_getattr(&resp->fh, &resp->stat);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 18d94ea984ba..fdf7ed4bd5dd 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -27,6 +27,7 @@
#include "cache.h"
#include "vfs.h"
#include "netns.h"
+#include "filecache.h"
#define NFSDDBG_FACILITY NFSDDBG_SVC
@@ -313,22 +314,17 @@ static int nfsd_startup_generic(int nrservs)
if (nfsd_users++)
return 0;
- /*
- * Readahead param cache - will no-op if it already exists.
- * (Note therefore results will be suboptimal if number of
- * threads is modified after nfsd start.)
- */
- ret = nfsd_racache_init(2*nrservs);
+ ret = nfsd_file_cache_init();
if (ret)
goto dec_users;
ret = nfs4_state_start();
if (ret)
- goto out_racache;
+ goto out_file_cache;
return 0;
-out_racache:
- nfsd_racache_shutdown();
+out_file_cache:
+ nfsd_file_cache_shutdown();
dec_users:
nfsd_users--;
return ret;
@@ -340,7 +336,7 @@ static void nfsd_shutdown_generic(void)
return;
nfs4_state_shutdown();
- nfsd_racache_shutdown();
+ nfsd_file_cache_shutdown();
}
static bool nfsd_needs_lockd(struct nfsd_net *nn)
@@ -348,6 +344,35 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn)
return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST);
}
+void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn)
+{
+ int seq = 0;
+
+ do {
+ read_seqbegin_or_lock(&nn->boot_lock, &seq);
+ /*
+ * This is opaque to client, so no need to byte-swap. Use
+ * __force to keep sparse happy. y2038 time_t overflow is
+ * irrelevant in this usage
+ */
+ verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
+ verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
+ } while (need_seqretry(&nn->boot_lock, seq));
+ done_seqretry(&nn->boot_lock, seq);
+}
+
+static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn)
+{
+ ktime_get_real_ts64(&nn->nfssvc_boot);
+}
+
+void nfsd_reset_boot_verifier(struct nfsd_net *nn)
+{
+ write_seqlock(&nn->boot_lock);
+ nfsd_reset_boot_verifier_locked(nn);
+ write_sequnlock(&nn->boot_lock);
+}
+
static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -391,6 +416,7 @@ static void nfsd_shutdown_net(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ nfsd_file_cache_purge(net);
nfs4_state_shutdown_net(net);
if (nn->lockd_up) {
lockd_down(net);
@@ -599,7 +625,7 @@ int nfsd_create_serv(struct net *net)
#endif
}
atomic_inc(&nn->ntf_refcnt);
- ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */
+ nfsd_reset_boot_verifier(nn);
return 0;
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 5dbd16946e8e..46f56afb6cb8 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -378,6 +378,7 @@ struct nfs4_client_reclaim {
struct list_head cr_strhash; /* hash by cr_name */
struct nfs4_client *cr_clp; /* pointer to associated clp */
struct xdr_netobj cr_name; /* recovery dir name */
+ struct xdr_netobj cr_princhash;
};
/* A reasonable value for REPLAY_ISIZE was estimated as follows:
@@ -506,7 +507,7 @@ struct nfs4_file {
};
struct list_head fi_clnt_odstate;
/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
- struct file * fi_fds[3];
+ struct nfsd_file *fi_fds[3];
/*
* Each open or lock stateid contributes 0-4 to the counts
* below depending on which bits are set in st_access_bitmap:
@@ -516,7 +517,7 @@ struct nfs4_file {
*/
atomic_t fi_access[2];
u32 fi_share_deny;
- struct file *fi_deleg_file;
+ struct nfsd_file *fi_deleg_file;
int fi_delegees;
struct knfsd_fh fi_fhandle;
bool fi_had_conflict;
@@ -565,7 +566,7 @@ struct nfs4_layout_stateid {
spinlock_t ls_lock;
struct list_head ls_layouts;
u32 ls_layout_type;
- struct file *ls_file;
+ struct nfsd_file *ls_file;
struct nfsd4_callback ls_recall;
stateid_t ls_recall_sid;
bool ls_recalled;
@@ -616,7 +617,7 @@ struct nfsd4_copy;
extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
- stateid_t *stateid, int flags, struct file **filp, bool *tmp_file);
+ stateid_t *stateid, int flags, struct nfsd_file **filp);
__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
stateid_t *stateid, unsigned char typemask,
struct nfs4_stid **s, struct nfsd_net *nn);
@@ -645,7 +646,7 @@ extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
- struct nfsd_net *nn);
+ struct xdr_netobj princhash, struct nfsd_net *nn);
extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
struct nfs4_file *find_file(struct knfsd_fh *fh);
@@ -657,7 +658,7 @@ static inline void get_nfs4_file(struct nfs4_file *fi)
{
refcount_inc(&fi->fi_ref);
}
-struct file *find_any_file(struct nfs4_file *f);
+struct nfsd_file *find_any_file(struct nfs4_file *f);
/* grace period management */
void nfsd4_end_grace(struct nfsd_net *nn);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 80933e4334d8..ffc78a0e28b2 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -126,6 +126,8 @@ DEFINE_NFSD_ERR_EVENT(read_err);
DEFINE_NFSD_ERR_EVENT(write_err);
#include "state.h"
+#include "filecache.h"
+#include "vfs.h"
DECLARE_EVENT_CLASS(nfsd_stateid_class,
TP_PROTO(stateid_t *stp),
@@ -164,6 +166,144 @@ DEFINE_STATEID_EVENT(layout_recall_done);
DEFINE_STATEID_EVENT(layout_recall_fail);
DEFINE_STATEID_EVENT(layout_recall_release);
+#define show_nf_flags(val) \
+ __print_flags(val, "|", \
+ { 1 << NFSD_FILE_HASHED, "HASHED" }, \
+ { 1 << NFSD_FILE_PENDING, "PENDING" }, \
+ { 1 << NFSD_FILE_BREAK_READ, "BREAK_READ" }, \
+ { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \
+ { 1 << NFSD_FILE_REFERENCED, "REFERENCED"})
+
+/* FIXME: This should probably be fleshed out in the future. */
+#define show_nf_may(val) \
+ __print_flags(val, "|", \
+ { NFSD_MAY_READ, "READ" }, \
+ { NFSD_MAY_WRITE, "WRITE" }, \
+ { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" })
+
+DECLARE_EVENT_CLASS(nfsd_file_class,
+ TP_PROTO(struct nfsd_file *nf),
+ TP_ARGS(nf),
+ TP_STRUCT__entry(
+ __field(unsigned int, nf_hashval)
+ __field(void *, nf_inode)
+ __field(int, nf_ref)
+ __field(unsigned long, nf_flags)
+ __field(unsigned char, nf_may)
+ __field(struct file *, nf_file)
+ ),
+ TP_fast_assign(
+ __entry->nf_hashval = nf->nf_hashval;
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_ref = atomic_read(&nf->nf_ref);
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_file = nf->nf_file;
+ ),
+ TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p",
+ __entry->nf_hashval,
+ __entry->nf_inode,
+ __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nf_may(__entry->nf_may),
+ __entry->nf_file)
+)
+
+#define DEFINE_NFSD_FILE_EVENT(name) \
+DEFINE_EVENT(nfsd_file_class, name, \
+ TP_PROTO(struct nfsd_file *nf), \
+ TP_ARGS(nf))
+
+DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked);
+
+TRACE_EVENT(nfsd_file_acquire,
+ TP_PROTO(struct svc_rqst *rqstp, unsigned int hash,
+ struct inode *inode, unsigned int may_flags,
+ struct nfsd_file *nf, __be32 status),
+
+ TP_ARGS(rqstp, hash, inode, may_flags, nf, status),
+
+ TP_STRUCT__entry(
+ __field(__be32, xid)
+ __field(unsigned int, hash)
+ __field(void *, inode)
+ __field(unsigned int, may_flags)
+ __field(int, nf_ref)
+ __field(unsigned long, nf_flags)
+ __field(unsigned char, nf_may)
+ __field(struct file *, nf_file)
+ __field(__be32, status)
+ ),
+
+ TP_fast_assign(
+ __entry->xid = rqstp->rq_xid;
+ __entry->hash = hash;
+ __entry->inode = inode;
+ __entry->may_flags = may_flags;
+ __entry->nf_ref = nf ? atomic_read(&nf->nf_ref) : 0;
+ __entry->nf_flags = nf ? nf->nf_flags : 0;
+ __entry->nf_may = nf ? nf->nf_may : 0;
+ __entry->nf_file = nf ? nf->nf_file : NULL;
+ __entry->status = status;
+ ),
+
+ TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u",
+ be32_to_cpu(__entry->xid), __entry->hash, __entry->inode,
+ show_nf_may(__entry->may_flags), __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nf_may(__entry->nf_may), __entry->nf_file,
+ be32_to_cpu(__entry->status))
+);
+
+DECLARE_EVENT_CLASS(nfsd_file_search_class,
+ TP_PROTO(struct inode *inode, unsigned int hash, int found),
+ TP_ARGS(inode, hash, found),
+ TP_STRUCT__entry(
+ __field(struct inode *, inode)
+ __field(unsigned int, hash)
+ __field(int, found)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->hash = hash;
+ __entry->found = found;
+ ),
+ TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash,
+ __entry->inode, __entry->found)
+);
+
+#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \
+DEFINE_EVENT(nfsd_file_search_class, name, \
+ TP_PROTO(struct inode *inode, unsigned int hash, int found), \
+ TP_ARGS(inode, hash, found))
+
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync);
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode);
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached);
+
+TRACE_EVENT(nfsd_file_fsnotify_handle_event,
+ TP_PROTO(struct inode *inode, u32 mask),
+ TP_ARGS(inode, mask),
+ TP_STRUCT__entry(
+ __field(struct inode *, inode)
+ __field(unsigned int, nlink)
+ __field(umode_t, mode)
+ __field(u32, mask)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->nlink = inode->i_nlink;
+ __entry->mode = inode->i_mode;
+ __entry->mask = mask;
+ ),
+ TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode,
+ __entry->nlink, __entry->mode, __entry->mask)
+);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c85783e536d5..bd0a385df3fc 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -44,38 +44,11 @@
#include "nfsd.h"
#include "vfs.h"
+#include "filecache.h"
#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_FILEOP
-
-/*
- * This is a cache of readahead params that help us choose the proper
- * readahead strategy. Initially, we set all readahead parameters to 0
- * and let the VFS handle things.
- * If you increase the number of cached files very much, you'll need to
- * add a hash table here.
- */
-struct raparms {
- struct raparms *p_next;
- unsigned int p_count;
- ino_t p_ino;
- dev_t p_dev;
- int p_set;
- struct file_ra_state p_ra;
- unsigned int p_hindex;
-};
-
-struct raparm_hbucket {
- struct raparms *pb_head;
- spinlock_t pb_lock;
-} ____cacheline_aligned_in_smp;
-
-#define RAPARM_HASH_BITS 4
-#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS)
-#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
-static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE];
-
/*
* Called from nfsd_lookup and encode_dirent. Check if we have crossed
* a mount point.
@@ -699,7 +672,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
}
#endif /* CONFIG_NFSD_V3 */
-static int nfsd_open_break_lease(struct inode *inode, int access)
+int nfsd_open_break_lease(struct inode *inode, int access)
{
unsigned int mode;
@@ -715,8 +688,8 @@ static int nfsd_open_break_lease(struct inode *inode, int access)
* and additional flags.
* N.B. After this call fhp needs an fh_put
*/
-__be32
-nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+static __be32
+__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
int may_flags, struct file **filp)
{
struct path path;
@@ -726,25 +699,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
__be32 err;
int host_err = 0;
- validate_process_creds();
-
- /*
- * If we get here, then the client has already done an "open",
- * and (hopefully) checked permission - so allow OWNER_OVERRIDE
- * in case a chmod has now revoked permission.
- *
- * Arguably we should also allow the owner override for
- * directories, but we never have and it doesn't seem to have
- * caused anyone a problem. If we were to change this, note
- * also that our filldir callbacks would need a variant of
- * lookup_one_len that doesn't check permissions.
- */
- if (type == S_IFREG)
- may_flags |= NFSD_MAY_OWNER_OVERRIDE;
- err = fh_verify(rqstp, fhp, type, may_flags);
- if (err)
- goto out;
-
path.mnt = fhp->fh_export->ex_path.mnt;
path.dentry = fhp->fh_dentry;
inode = d_inode(path.dentry);
@@ -798,67 +752,46 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
out_nfserr:
err = nfserrno(host_err);
out:
- validate_process_creds();
return err;
}
-struct raparms *
-nfsd_init_raparms(struct file *file)
+__be32
+nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+ int may_flags, struct file **filp)
{
- struct inode *inode = file_inode(file);
- dev_t dev = inode->i_sb->s_dev;
- ino_t ino = inode->i_ino;
- struct raparms *ra, **rap, **frap = NULL;
- int depth = 0;
- unsigned int hash;
- struct raparm_hbucket *rab;
-
- hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
- rab = &raparm_hash[hash];
-
- spin_lock(&rab->pb_lock);
- for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
- if (ra->p_ino == ino && ra->p_dev == dev)
- goto found;
- depth++;
- if (ra->p_count == 0)
- frap = rap;
- }
- depth = nfsdstats.ra_size;
- if (!frap) {
- spin_unlock(&rab->pb_lock);
- return NULL;
- }
- rap = frap;
- ra = *frap;
- ra->p_dev = dev;
- ra->p_ino = ino;
- ra->p_set = 0;
- ra->p_hindex = hash;
-found:
- if (rap != &rab->pb_head) {
- *rap = ra->p_next;
- ra->p_next = rab->pb_head;
- rab->pb_head = ra;
- }
- ra->p_count++;
- nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
- spin_unlock(&rab->pb_lock);
+ __be32 err;
- if (ra->p_set)
- file->f_ra = ra->p_ra;
- return ra;
+ validate_process_creds();
+ /*
+ * If we get here, then the client has already done an "open",
+ * and (hopefully) checked permission - so allow OWNER_OVERRIDE
+ * in case a chmod has now revoked permission.
+ *
+ * Arguably we should also allow the owner override for
+ * directories, but we never have and it doesn't seem to have
+ * caused anyone a problem. If we were to change this, note
+ * also that our filldir callbacks would need a variant of
+ * lookup_one_len that doesn't check permissions.
+ */
+ if (type == S_IFREG)
+ may_flags |= NFSD_MAY_OWNER_OVERRIDE;
+ err = fh_verify(rqstp, fhp, type, may_flags);
+ if (!err)
+ err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+ validate_process_creds();
+ return err;
}
-void nfsd_put_raparams(struct file *file, struct raparms *ra)
+__be32
+nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+ int may_flags, struct file **filp)
{
- struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+ __be32 err;
- spin_lock(&rab->pb_lock);
- ra->p_ra = file->f_ra;
- ra->p_set = 1;
- ra->p_count--;
- spin_unlock(&rab->pb_lock);
+ validate_process_creds();
+ err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+ validate_process_creds();
+ return err;
}
/*
@@ -901,12 +834,23 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
}
+static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len,
+ size_t expected)
+{
+ if (expected != 0 && len == 0)
+ return 1;
+ if (offset+len >= i_size_read(file_inode(file)))
+ return 1;
+ return 0;
+}
+
static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
- unsigned long *count, int host_err)
+ unsigned long *count, u32 *eof, ssize_t host_err)
{
if (host_err >= 0) {
nfsdstats.io_read += host_err;
+ *eof = nfsd_eof_on_read(file, offset, host_err, *count);
*count = host_err;
fsnotify_access(file);
trace_nfsd_read_io_done(rqstp, fhp, offset, *count);
@@ -918,7 +862,8 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct file *file, loff_t offset, unsigned long *count)
+ struct file *file, loff_t offset, unsigned long *count,
+ u32 *eof)
{
struct splice_desc sd = {
.len = 0,
@@ -926,25 +871,27 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
.pos = offset,
.u.data = rqstp,
};
- int host_err;
+ ssize_t host_err;
trace_nfsd_read_splice(rqstp, fhp, offset, *count);
rqstp->rq_next_page = rqstp->rq_respages + 1;
host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
- return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
+ return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
- struct kvec *vec, int vlen, unsigned long *count)
+ struct kvec *vec, int vlen, unsigned long *count,
+ u32 *eof)
{
struct iov_iter iter;
- int host_err;
+ loff_t ppos = offset;
+ ssize_t host_err;
trace_nfsd_read_vector(rqstp, fhp, offset, *count);
iov_iter_kvec(&iter, READ, vec, vlen, *count);
- host_err = vfs_iter_read(file, &iter, &offset, 0);
- return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
+ host_err = vfs_iter_read(file, &iter, &ppos, 0);
+ return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
/*
@@ -1025,8 +972,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
nfsdstats.io_write += *cnt;
fsnotify_modify(file);
- if (stable && use_wgather)
+ if (stable && use_wgather) {
host_err = wait_for_concurrent_writes(file);
+ if (host_err < 0)
+ nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
+ nfsd_net_id));
+ }
out_nfserr:
if (host_err >= 0) {
@@ -1047,27 +998,25 @@ out_nfserr:
* N.B. After this call fhp needs an fh_put
*/
__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
- loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+ loff_t offset, struct kvec *vec, int vlen, unsigned long *count,
+ u32 *eof)
{
+ struct nfsd_file *nf;
struct file *file;
- struct raparms *ra;
__be32 err;
trace_nfsd_read_start(rqstp, fhp, offset, *count);
- err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+ err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
if (err)
return err;
- ra = nfsd_init_raparms(file);
-
+ file = nf->nf_file;
if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
- err = nfsd_splice_read(rqstp, fhp, file, offset, count);
+ err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
else
- err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count);
+ err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof);
- if (ra)
- nfsd_put_raparams(file, ra);
- fput(file);
+ nfsd_file_put(nf);
trace_nfsd_read_done(rqstp, fhp, offset, *count);
@@ -1083,17 +1032,18 @@ __be32
nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
struct kvec *vec, int vlen, unsigned long *cnt, int stable)
{
- struct file *file = NULL;
- __be32 err = 0;
+ struct nfsd_file *nf;
+ __be32 err;
trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
- err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+ err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
if (err)
goto out;
- err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable);
- fput(file);
+ err = nfsd_vfs_write(rqstp, fhp, nf->nf_file, offset, vec,
+ vlen, cnt, stable);
+ nfsd_file_put(nf);
out:
trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
return err;
@@ -1113,9 +1063,9 @@ __be32
nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
loff_t offset, unsigned long count)
{
- struct file *file;
- loff_t end = LLONG_MAX;
- __be32 err = nfserr_inval;
+ struct nfsd_file *nf;
+ loff_t end = LLONG_MAX;
+ __be32 err = nfserr_inval;
if (offset < 0)
goto out;
@@ -1125,20 +1075,27 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
- err = nfsd_open(rqstp, fhp, S_IFREG,
- NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
+ err = nfsd_file_acquire(rqstp, fhp,
+ NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf);
if (err)
goto out;
if (EX_ISSYNC(fhp->fh_export)) {
- int err2 = vfs_fsync_range(file, offset, end, 0);
+ int err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
- if (err2 != -EINVAL)
- err = nfserrno(err2);
- else
+ switch (err2) {
+ case 0:
+ break;
+ case -EINVAL:
err = nfserr_notsupp;
+ break;
+ default:
+ err = nfserrno(err2);
+ nfsd_reset_boot_verifier(net_generic(nf->nf_net,
+ nfsd_net_id));
+ }
}
- fput(file);
+ nfsd_file_put(nf);
out:
return err;
}
@@ -1659,6 +1616,26 @@ out_nfserr:
goto out_unlock;
}
+static void
+nfsd_close_cached_files(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (inode && S_ISREG(inode->i_mode))
+ nfsd_file_close_inode_sync(inode);
+}
+
+static bool
+nfsd_has_cached_files(struct dentry *dentry)
+{
+ bool ret = false;
+ struct inode *inode = d_inode(dentry);
+
+ if (inode && S_ISREG(inode->i_mode))
+ ret = nfsd_file_is_cached(inode);
+ return ret;
+}
+
/*
* Rename a file
* N.B. After this call _both_ ffhp and tfhp need an fh_put
@@ -1671,6 +1648,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
struct inode *fdir, *tdir;
__be32 err;
int host_err;
+ bool has_cached = false;
err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
if (err)
@@ -1689,6 +1667,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
goto out;
+retry:
host_err = fh_want_write(ffhp);
if (host_err) {
err = nfserrno(host_err);
@@ -1728,11 +1707,16 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
goto out_dput_new;
- host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
- if (!host_err) {
- host_err = commit_metadata(tfhp);
- if (!host_err)
- host_err = commit_metadata(ffhp);
+ if (nfsd_has_cached_files(ndentry)) {
+ has_cached = true;
+ goto out_dput_old;
+ } else {
+ host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
+ if (!host_err) {
+ host_err = commit_metadata(tfhp);
+ if (!host_err)
+ host_err = commit_metadata(ffhp);
+ }
}
out_dput_new:
dput(ndentry);
@@ -1745,12 +1729,26 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
* as that would do the wrong thing if the two directories
* were the same, so again we do it by hand.
*/
- fill_post_wcc(ffhp);
- fill_post_wcc(tfhp);
+ if (!has_cached) {
+ fill_post_wcc(ffhp);
+ fill_post_wcc(tfhp);
+ }
unlock_rename(tdentry, fdentry);
ffhp->fh_locked = tfhp->fh_locked = false;
fh_drop_write(ffhp);
+ /*
+ * If the target dentry has cached open files, then we need to try to
+ * close them prior to doing the rename. Flushing delayed fput
+ * shouldn't be done with locks held however, so we delay it until this
+ * point and then reattempt the whole shebang.
+ */
+ if (has_cached) {
+ has_cached = false;
+ nfsd_close_cached_files(ndentry);
+ dput(ndentry);
+ goto retry;
+ }
out:
return err;
}
@@ -1797,10 +1795,13 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (!type)
type = d_inode(rdentry)->i_mode & S_IFMT;
- if (type != S_IFDIR)
+ if (type != S_IFDIR) {
+ nfsd_close_cached_files(rdentry);
host_err = vfs_unlink(dirp, rdentry, NULL);
- else
+ } else {
host_err = vfs_rmdir(dirp, rdentry);
+ }
+
if (!host_err)
host_err = commit_metadata(fhp);
dput(rdentry);
@@ -2074,63 +2075,3 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
return err? nfserrno(err) : 0;
}
-
-void
-nfsd_racache_shutdown(void)
-{
- struct raparms *raparm, *last_raparm;
- unsigned int i;
-
- dprintk("nfsd: freeing readahead buffers.\n");
-
- for (i = 0; i < RAPARM_HASH_SIZE; i++) {
- raparm = raparm_hash[i].pb_head;
- while(raparm) {
- last_raparm = raparm;
- raparm = raparm->p_next;
- kfree(last_raparm);
- }
- raparm_hash[i].pb_head = NULL;
- }
-}
-/*
- * Initialize readahead param cache
- */
-int
-nfsd_racache_init(int cache_size)
-{
- int i;
- int j = 0;
- int nperbucket;
- struct raparms **raparm = NULL;
-
-
- if (raparm_hash[0].pb_head)
- return 0;
- nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
- nperbucket = max(2, nperbucket);
- cache_size = nperbucket * RAPARM_HASH_SIZE;
-
- dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
-
- for (i = 0; i < RAPARM_HASH_SIZE; i++) {
- spin_lock_init(&raparm_hash[i].pb_lock);
-
- raparm = &raparm_hash[i].pb_head;
- for (j = 0; j < nperbucket; j++) {
- *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
- if (!*raparm)
- goto out_nomem;
- raparm = &(*raparm)->p_next;
- }
- *raparm = NULL;
- }
-
- nfsdstats.ra_size = cache_size;
- return 0;
-
-out_nomem:
- dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
- nfsd_racache_shutdown();
- return -ENOMEM;
-}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index db351247892d..a13fd9d7e1f5 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -40,8 +40,6 @@
typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
/* nfsd/vfs.c */
-int nfsd_racache_init(int);
-void nfsd_racache_shutdown(void);
int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
struct svc_export **expp);
__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
@@ -75,18 +73,23 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
loff_t, unsigned long);
#endif /* CONFIG_NFSD_V3 */
+int nfsd_open_break_lease(struct inode *, int);
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
-struct raparms;
+__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t,
+ int, struct file **);
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
- unsigned long *count);
+ unsigned long *count,
+ u32 *eof);
__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
struct kvec *vec, int vlen,
- unsigned long *count);
+ unsigned long *count,
+ u32 *eof);
__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
- loff_t, struct kvec *, int, unsigned long *);
+ loff_t, struct kvec *, int, unsigned long *,
+ u32 *eof);
__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
struct kvec *, int, unsigned long *, int);
__be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -115,9 +118,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
struct dentry *, int);
-struct raparms *nfsd_init_raparms(struct file *file);
-void nfsd_put_raparams(struct file *file, struct raparms *ra);
-
static inline int fh_want_write(struct svc_fh *fh)
{
int ret;
@@ -152,23 +152,4 @@ static inline int nfsd_create_is_exclusive(int createmode)
|| createmode == NFS4_CREATE_EXCLUSIVE4_1;
}
-static inline bool nfsd_eof_on_read(long requested, long read,
- loff_t offset, loff_t size)
-{
- /* We assume a short read means eof: */
- if (requested > read)
- return true;
- /*
- * A non-short read might also reach end of file. The spec
- * still requires us to set eof in that case.
- *
- * Further operations may have modified the file size since
- * the read, so the following check is not atomic with the read.
- * We've only seen that cause a problem for a client in the case
- * where the read returned a count of 0 without setting eof.
- * That case was fixed by the addition of the above check.
- */
- return (offset + read >= size);
-}
-
#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 2cb29e961a76..99ff9f403ff1 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -151,7 +151,7 @@ struct nfsd3_readres {
__be32 status;
struct svc_fh fh;
unsigned long count;
- int eof;
+ __u32 eof;
};
struct nfsd3_writeres {
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d64c870f998a..f4737d66ee98 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -273,15 +273,14 @@ struct nfsd4_open_downgrade {
struct nfsd4_read {
- stateid_t rd_stateid; /* request */
- u64 rd_offset; /* request */
- u32 rd_length; /* request */
- int rd_vlen;
- struct file *rd_filp;
- bool rd_tmp_file;
+ stateid_t rd_stateid; /* request */
+ u64 rd_offset; /* request */
+ u32 rd_length; /* request */
+ int rd_vlen;
+ struct nfsd_file *rd_nf;
- struct svc_rqst *rd_rqstp; /* response */
- struct svc_fh * rd_fhp; /* response */
+ struct svc_rqst *rd_rqstp; /* response */
+ struct svc_fh *rd_fhp; /* response */
};
struct nfsd4_readdir {
@@ -538,8 +537,8 @@ struct nfsd4_copy {
struct nfs4_client *cp_clp;
- struct file *file_src;
- struct file *file_dst;
+ struct nfsd_file *nf_src;
+ struct nfsd_file *nf_dst;
stateid_t cp_stateid;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 5a00121fb219..f3462828a0e2 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -54,8 +54,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
{
fsnotify_destroy_marks(&sb->s_fsnotify_marks);
}
-/* Wait until all marks queued for destruction are destroyed */
-extern void fsnotify_wait_marks_destroyed(void);
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 0391190305cc..133f723aca07 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -108,6 +108,7 @@ void fsnotify_put_group(struct fsnotify_group *group)
if (refcount_dec_and_test(&group->refcnt))
fsnotify_final_destroy_group(group);
}
+EXPORT_SYMBOL_GPL(fsnotify_put_group);
/*
* Create a new fsnotify_group and hold a reference for the group returned.
@@ -137,6 +138,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
return group;
}
+EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
int fsnotify_fasync(int fd, struct file *file, int on)
{
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 99ddd126f6f0..1d96216dffd1 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -276,6 +276,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
queue_delayed_work(system_unbound_wq, &reaper_work,
FSNOTIFY_REAPER_DELAY);
}
+EXPORT_SYMBOL_GPL(fsnotify_put_mark);
/*
* Get mark reference when we found the mark via lockless traversal of object
@@ -430,6 +431,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
mutex_unlock(&group->mark_mutex);
fsnotify_free_mark(mark);
}
+EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
/*
* Sorting function for lists of fsnotify marks.
@@ -685,6 +687,7 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
mutex_unlock(&group->mark_mutex);
return ret;
}
+EXPORT_SYMBOL_GPL(fsnotify_add_mark);
/*
* Given a list of marks, find the mark associated with given group. If found
@@ -711,6 +714,7 @@ struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
spin_unlock(&conn->lock);
return NULL;
}
+EXPORT_SYMBOL_GPL(fsnotify_find_mark);
/* Clear any marks in a group with given type mask */
void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
@@ -809,6 +813,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
mark->group = group;
WRITE_ONCE(mark->connector, NULL);
}
+EXPORT_SYMBOL_GPL(fsnotify_init_mark);
/*
* Destroy all marks in destroy_list, waits for SRCU period to finish before
@@ -837,3 +842,4 @@ void fsnotify_wait_marks_destroyed(void)
{
flush_delayed_work(&reaper_work);
}
+EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 20c841a906f2..3aac5c917afe 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -71,7 +71,7 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
}
/* Read, map, and pin the page. */
page = ntfs_map_page(mft_vi->i_mapping, index);
- if (likely(!IS_ERR(page))) {
+ if (!IS_ERR(page)) {
/* Catch multi sector transfer fixup errors. */
if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
ofs)))) {
@@ -154,7 +154,7 @@ MFT_RECORD *map_mft_record(ntfs_inode *ni)
mutex_lock(&ni->mrec_lock);
m = map_mft_record_page(ni);
- if (likely(!IS_ERR(m)))
+ if (!IS_ERR(m))
return m;
mutex_unlock(&ni->mrec_lock);
@@ -271,7 +271,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
m = map_mft_record(ni);
/* map_mft_record() has incremented this on success. */
atomic_dec(&ni->count);
- if (likely(!IS_ERR(m))) {
+ if (!IS_ERR(m)) {
/* Verify the sequence number. */
if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
ntfs_debug("Done 1.");
@@ -1303,7 +1303,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
(ll - 1) >> vol->cluster_size_bits, NULL);
- if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+ if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
up_write(&mftbmp_ni->runlist.lock);
ntfs_error(vol->sb, "Failed to determine last allocated "
"cluster of mft bitmap attribute.");
@@ -1734,7 +1734,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
read_unlock_irqrestore(&mft_ni->size_lock, flags);
rl = ntfs_attr_find_vcn_nolock(mft_ni,
(ll - 1) >> vol->cluster_size_bits, NULL);
- if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+ if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
up_write(&mft_ni->runlist.lock);
ntfs_error(vol->sb, "Failed to determine last allocated "
"cluster of mft data attribute.");
@@ -1776,7 +1776,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
do {
rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
true);
- if (likely(!IS_ERR(rl2)))
+ if (!IS_ERR(rl2))
break;
if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
ntfs_error(vol->sb, "Failed to allocate the minimal "
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2d3cc9e3395d..4e6a44bc654c 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -115,7 +115,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
dent_ino = MREF(mref);
ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino);
dent_inode = ntfs_iget(vol->sb, dent_ino);
- if (likely(!IS_ERR(dent_inode))) {
+ if (!IS_ERR(dent_inode)) {
/* Consistency check. */
if (is_bad_inode(dent_inode) || MSEQNO(mref) ==
NTFS_I(dent_inode)->seq_no ||
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 508744a93180..97932fb5179c 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -951,7 +951,7 @@ mpa_err:
}
/* Now combine the new and old runlists checking for overlaps. */
old_rl = ntfs_runlists_merge(old_rl, rl);
- if (likely(!IS_ERR(old_rl)))
+ if (!IS_ERR(old_rl))
return old_rl;
ntfs_free(rl);
ntfs_error(vol->sb, "Failed to merge runlists.");
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 29621d40f448..7dc3bc604f78 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1475,7 +1475,7 @@ not_enabled:
kfree(name);
/* Get the inode. */
tmp_ino = ntfs_iget(vol->sb, MREF(mref));
- if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) {
+ if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) {
if (!IS_ERR(tmp_ino))
iput(tmp_ino);
ntfs_error(vol->sb, "Failed to load $UsnJrnl.");
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8de1c9d644f6..9cd0a6815933 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2049,7 +2049,8 @@ out_write_size:
inode->i_mtime = inode->i_ctime = current_time(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ if (handle)
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
}
if (handle)
ocfs2_journal_dirty(handle, wc->w_di_bh);
@@ -2146,13 +2147,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_dio_write_ctxt *dwc = NULL;
struct buffer_head *di_bh = NULL;
u64 p_blkno;
- loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+ loff_t pos = iblock << i_blkbits;
+ sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
unsigned len, total_len = bh_result->b_size;
int ret = 0, first_get_block = 0;
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
len = min(total_len, len);
+ /*
+ * bh_result->b_size is count in get_more_blocks according to write
+ * "pos" and "end", we need map twice to return different buffer state:
+ * 1. area in file size, not set NEW;
+ * 2. area out file size, set NEW.
+ *
+ * iblock endblk
+ * |--------|---------|---------|---------
+ * |<-------area in file------->|
+ */
+
+ if ((iblock <= endblk) &&
+ ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+ len = (endblk - iblock + 1) << i_blkbits;
+
mlog(0, "get block of %lu at %llu:%u req %u\n",
inode->i_ino, pos, len, total_len);
@@ -2236,6 +2254,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
if (desc->c_needs_zero)
set_buffer_new(bh_result);
+ if (iblock > endblk)
+ set_buffer_new(bh_result);
+
/* May sleep in end_io. It should not happen in a irq context. So defer
* it to dio work queue. */
set_buffer_defer_completion(bh_result);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index d6f7b299eb23..efeea208fdeb 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -283,7 +283,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
if (inode_alloc)
inode_lock(inode_alloc);
- if (o2info_coherent(&fi->ifi_req)) {
+ if (inode_alloc && o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 90c830e3758e..d8507972ee13 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1490,18 +1490,6 @@ static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
return loc->xl_ops->xlo_check_space(loc, xi);
}
-static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
-{
- loc->xl_ops->xlo_add_entry(loc, name_hash);
- loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
- /*
- * We can't leave the new entry's xe_name_offset at zero or
- * add_namevalue() will go nuts. We set it to the size of our
- * storage so that it can never be less than any other entry.
- */
- loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
-}
-
static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
struct ocfs2_xattr_info *xi)
{
@@ -2133,29 +2121,31 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
if (rc)
goto out;
- if (loc->xl_entry) {
- if (ocfs2_xa_can_reuse_entry(loc, xi)) {
- orig_value_size = loc->xl_entry->xe_value_size;
- rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
- if (rc)
- goto out;
- goto alloc_value;
- }
+ if (!loc->xl_entry) {
+ rc = -EINVAL;
+ goto out;
+ }
- if (!ocfs2_xattr_is_local(loc->xl_entry)) {
- orig_clusters = ocfs2_xa_value_clusters(loc);
- rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
- if (rc) {
- mlog_errno(rc);
- ocfs2_xa_cleanup_value_truncate(loc,
- "overwriting",
- orig_clusters);
- goto out;
- }
+ if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+ orig_value_size = loc->xl_entry->xe_value_size;
+ rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+ if (rc)
+ goto out;
+ goto alloc_value;
+ }
+
+ if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+ if (rc) {
+ mlog_errno(rc);
+ ocfs2_xa_cleanup_value_truncate(loc,
+ "overwriting",
+ orig_clusters);
+ goto out;
}
- ocfs2_xa_wipe_namevalue(loc);
- } else
- ocfs2_xa_add_entry(loc, name_hash);
+ }
+ ocfs2_xa_wipe_namevalue(loc);
/*
* If we get here, we have a blank entry. Fill it. We grow our
diff --git a/fs/open.c b/fs/open.c
index c60cd22cc052..b62f5c0923a8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -776,7 +776,7 @@ static int do_dentry_open(struct file *f,
f->f_mode |= FMODE_ATOMIC_POS;
f->f_op = fops_get(inode->i_fop);
- if (unlikely(WARN_ON(!f->f_op))) {
+ if (WARN_ON(!f->f_op)) {
error = -ENODEV;
goto cleanup_all;
}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index f5834488b67d..e2ed8e08cc7a 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -31,6 +31,7 @@
#include <linux/ioport.h>
#include <linux/memory.h>
#include <linux/sched/task.h>
+#include <linux/security.h>
#include <asm/sections.h>
#include "internal.h"
@@ -545,9 +546,14 @@ out:
static int open_kcore(struct inode *inode, struct file *filp)
{
+ int ret = security_locked_down(LOCKDOWN_KCORE);
+
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
+ if (ret)
+ return ret;
+
filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!filp->private_data)
return -ENOMEM;
diff --git a/fs/readdir.c b/fs/readdir.c
index 2f6a4534e0df..d26d5ea4de7b 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -20,9 +20,23 @@
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/compat.h>
-
#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+/*
+ * Note the "unsafe_put_user() semantics: we goto a
+ * label for errors.
+ */
+#define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \
+ char __user *dst = (_dst); \
+ const char *src = (_src); \
+ size_t len = (_len); \
+ unsafe_put_user(0, dst+len, label); \
+ unsafe_copy_to_user(dst, src, len, label); \
+} while (0)
+
+
int iterate_dir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
@@ -65,6 +79,40 @@ out:
EXPORT_SYMBOL(iterate_dir);
/*
+ * POSIX says that a dirent name cannot contain NULL or a '/'.
+ *
+ * It's not 100% clear what we should really do in this case.
+ * The filesystem is clearly corrupted, but returning a hard
+ * error means that you now don't see any of the other names
+ * either, so that isn't a perfect alternative.
+ *
+ * And if you return an error, what error do you use? Several
+ * filesystems seem to have decided on EUCLEAN being the error
+ * code for EFSCORRUPTED, and that may be the error to use. Or
+ * just EIO, which is perhaps more obvious to users.
+ *
+ * In order to see the other file names in the directory, the
+ * caller might want to make this a "soft" error: skip the
+ * entry, and return the error at the end instead.
+ *
+ * Note that this should likely do a "memchr(name, 0, len)"
+ * check too, since that would be filesystem corruption as
+ * well. However, that case can't actually confuse user space,
+ * which has to do a strlen() on the name anyway to find the
+ * filename length, and the above "soft error" worry means
+ * that it's probably better left alone until we have that
+ * issue clarified.
+ */
+static int verify_dirent_name(const char *name, int len)
+{
+ if (!len)
+ return -EIO;
+ if (memchr(name, '/', len))
+ return -EIO;
+ return 0;
+}
+
+/*
* Traditional linux readdir() handling..
*
* "count=1" is a special case, meaning that the buffer is one
@@ -173,6 +221,9 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
sizeof(long));
+ buf->error = verify_dirent_name(name, namlen);
+ if (unlikely(buf->error))
+ return buf->error;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
return -EINVAL;
@@ -182,28 +233,31 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
return -EOVERFLOW;
}
dirent = buf->previous;
- if (dirent) {
- if (signal_pending(current))
- return -EINTR;
- if (__put_user(offset, &dirent->d_off))
- goto efault;
- }
- dirent = buf->current_dir;
- if (__put_user(d_ino, &dirent->d_ino))
- goto efault;
- if (__put_user(reclen, &dirent->d_reclen))
- goto efault;
- if (copy_to_user(dirent->d_name, name, namlen))
- goto efault;
- if (__put_user(0, dirent->d_name + namlen))
- goto efault;
- if (__put_user(d_type, (char __user *) dirent + reclen - 1))
+ if (dirent && signal_pending(current))
+ return -EINTR;
+
+ /*
+ * Note! This range-checks 'previous' (which may be NULL).
+ * The real range was checked in getdents
+ */
+ if (!user_access_begin(dirent, sizeof(*dirent)))
goto efault;
+ if (dirent)
+ unsafe_put_user(offset, &dirent->d_off, efault_end);
+ dirent = buf->current_dir;
+ unsafe_put_user(d_ino, &dirent->d_ino, efault_end);
+ unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
+ unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end);
+ unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
+ user_access_end();
+
buf->previous = dirent;
dirent = (void __user *)dirent + reclen;
buf->current_dir = dirent;
buf->count -= reclen;
return 0;
+efault_end:
+ user_access_end();
efault:
buf->error = -EFAULT;
return -EFAULT;
@@ -259,34 +313,38 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
sizeof(u64));
+ buf->error = verify_dirent_name(name, namlen);
+ if (unlikely(buf->error))
+ return buf->error;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
return -EINVAL;
dirent = buf->previous;
- if (dirent) {
- if (signal_pending(current))
- return -EINTR;
- if (__put_user(offset, &dirent->d_off))
- goto efault;
- }
- dirent = buf->current_dir;
- if (__put_user(ino, &dirent->d_ino))
- goto efault;
- if (__put_user(0, &dirent->d_off))
- goto efault;
- if (__put_user(reclen, &dirent->d_reclen))
- goto efault;
- if (__put_user(d_type, &dirent->d_type))
- goto efault;
- if (copy_to_user(dirent->d_name, name, namlen))
- goto efault;
- if (__put_user(0, dirent->d_name + namlen))
+ if (dirent && signal_pending(current))
+ return -EINTR;
+
+ /*
+ * Note! This range-checks 'previous' (which may be NULL).
+ * The real range was checked in getdents
+ */
+ if (!user_access_begin(dirent, sizeof(*dirent)))
goto efault;
+ if (dirent)
+ unsafe_put_user(offset, &dirent->d_off, efault_end);
+ dirent = buf->current_dir;
+ unsafe_put_user(ino, &dirent->d_ino, efault_end);
+ unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
+ unsafe_put_user(d_type, &dirent->d_type, efault_end);
+ unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
+ user_access_end();
+
buf->previous = dirent;
dirent = (void __user *)dirent + reclen;
buf->current_dir = dirent;
buf->count -= reclen;
return 0;
+efault_end:
+ user_access_end();
efault:
buf->error = -EFAULT;
return -EFAULT;
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 9c02d96d3a42..4075e41408b4 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -239,10 +239,8 @@ static int balance_leaf_when_delete_left(struct tree_balance *tb)
static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int item_pos = PATH_LAST_POSITION(tb->tb_path);
struct buffer_info bi;
int n;
- struct item_head *ih;
RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
"vs- 12000: level: wrong FR %z", tb->FR[0]);
@@ -251,7 +249,6 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
"PAP-12010: tree can not be empty");
- ih = item_head(tbS0, item_pos);
buffer_info_init_tbS0(tb, &bi);
/* Delete or truncate the item */
@@ -298,7 +295,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
/* part of new item falls into L[0] */
int new_item_len, shift;
- int version;
ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
@@ -317,8 +313,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
min_t(int, tb->zeroes_num, ih_item_len(ih)));
- version = ih_version(ih);
-
/*
* Calculate key component, item length and body to
* insert into S[0]
@@ -632,7 +626,6 @@ static void balance_leaf_insert_right(struct tree_balance *tb,
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
int n = B_NR_ITEMS(tbS0);
struct buffer_info bi;
- int ret;
/* new item or part of it doesn't fall into R[0] */
if (n - tb->rnum[0] >= tb->item_pos) {
@@ -646,13 +639,11 @@ static void balance_leaf_insert_right(struct tree_balance *tb,
if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
loff_t old_key_comp, old_len, r_zeroes_number;
const char *r_body;
- int version, shift;
+ int shift;
loff_t offset;
leaf_shift_right(tb, tb->rnum[0] - 1, -1);
- version = ih_version(ih);
-
/* Remember key component and item length */
old_key_comp = le_ih_k_offset(ih);
old_len = ih_item_len(ih);
@@ -698,7 +689,7 @@ static void balance_leaf_insert_right(struct tree_balance *tb,
/* whole new item falls into R[0] */
/* Shift rnum[0]-1 items to R[0] */
- ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
+ leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
/* Insert new item into R[0] */
buffer_info_init_right(tb, &bi);
@@ -950,14 +941,12 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
int old_key_comp, old_len, r_zeroes_number;
const char *r_body;
- int version;
/* Move snum[i]-1 items from S[0] to S_new[i] */
leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
tb->S_new[i]);
/* Remember key component and item length */
- version = ih_version(ih);
old_key_comp = le_ih_k_offset(ih);
old_len = ih_item_len(ih);
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6b0ddb2a9091..117092224111 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -376,7 +376,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
int to, int to_bytes, short *snum012, int flow)
{
int i;
- int cur_free;
int units;
struct virtual_node *vn = tb->tb_vn;
int total_node_size, max_node_size, current_item_size;
@@ -438,7 +437,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
/* leaf level */
needed_nodes = 1;
total_node_size = 0;
- cur_free = max_node_size;
/* start from 'from'-th item */
start_item = from;
@@ -1734,14 +1732,12 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
* and Fh is its father.
*/
struct buffer_head *Sh, *Fh;
- int maxsize, ret;
+ int ret;
int lfree, rfree /* free space in L and R */ ;
Sh = PATH_H_PBUFFER(tb->tb_path, h);
Fh = PATH_H_PPARENT(tb->tb_path, h);
- maxsize = MAX_CHILD_SIZE(Sh);
-
/*
* using tb->insert_size[h], which is negative in this case,
* create_virtual_node calculates:
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 4517a1394c6f..4b3e3e73b512 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -891,7 +891,6 @@ static int flush_older_commits(struct super_block *s,
struct list_head *entry;
unsigned int trans_id = jl->j_trans_id;
unsigned int other_trans_id;
- unsigned int first_trans_id;
find_first:
/*
@@ -914,8 +913,6 @@ find_first:
return 0;
}
- first_trans_id = first_jl->j_trans_id;
-
entry = &first_jl->j_list;
while (1) {
other_jl = JOURNAL_LIST_ENTRY(entry);
@@ -1351,7 +1348,7 @@ static int flush_journal_list(struct super_block *s,
struct reiserfs_journal_list *jl, int flushall)
{
struct reiserfs_journal_list *pjl;
- struct reiserfs_journal_cnode *cn, *last;
+ struct reiserfs_journal_cnode *cn;
int count;
int was_jwait = 0;
int was_dirty = 0;
@@ -1509,7 +1506,6 @@ static int flush_journal_list(struct super_block *s,
b_blocknr, __func__);
}
free_cnode:
- last = cn;
cn = cn->next;
if (saved_bh) {
/*
@@ -1792,7 +1788,6 @@ static int flush_used_journal_lists(struct super_block *s,
{
unsigned long len = 0;
unsigned long cur_len;
- int ret;
int i;
int limit = 256;
struct reiserfs_journal_list *tjl;
@@ -1829,9 +1824,9 @@ static int flush_used_journal_lists(struct super_block *s,
* transactions, but only bother if we've actually spanned
* across multiple lists
*/
- if (flush_jl != jl) {
- ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
- }
+ if (flush_jl != jl)
+ kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
+
flush_journal_list(s, flush_jl, 1);
put_journal_list(s, flush_jl);
put_journal_list(s, jl);
@@ -1911,7 +1906,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
struct super_block *sb, int error)
{
struct reiserfs_transaction_handle myth;
- int flushed = 0;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
/*
@@ -1933,7 +1927,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1);
journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
do_journal_end(&myth, FLUSH_ALL);
- flushed = 1;
}
}
@@ -3444,9 +3437,8 @@ static int remove_from_transaction(struct super_block *sb,
if (cn == journal->j_last) {
journal->j_last = cn->prev;
}
- if (bh)
- remove_journal_hash(sb, journal->j_hash_table, NULL,
- bh->b_blocknr, 0);
+ remove_journal_hash(sb, journal->j_hash_table, NULL,
+ bh->b_blocknr, 0);
clear_buffer_journaled(bh); /* don't log this one */
if (!already_cleaned) {
@@ -3988,7 +3980,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
struct buffer_head *c_bh; /* commit bh */
struct buffer_head *d_bh; /* desc bh */
int cur_write_start = 0; /* start index of current log write */
- int old_start;
int i;
int flush;
int wait_on_commit;
@@ -4245,7 +4236,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
journal->j_num_work_lists++;
/* reset journal values for the next transaction */
- old_start = journal->j_start;
journal->j_start =
(journal->j_start + journal->j_len +
2) % SB_ONDISK_JOURNAL_SIZE(sb);
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index f5cebd70d903..7f868569d4d0 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -1322,7 +1322,7 @@ void leaf_paste_entries(struct buffer_info *bi,
char *item;
struct reiserfs_de_head *deh;
char *insert_point;
- int i, old_entry_num;
+ int i;
struct buffer_head *bh = bi->bi_bh;
if (new_entry_count == 0)
@@ -1362,7 +1362,6 @@ void leaf_paste_entries(struct buffer_info *bi,
put_deh_location(&deh[i],
deh_location(&deh[i]) + paste_size);
- old_entry_num = ih_entry_count(ih);
put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
/* prepare space for pasted records */
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index 415d66ca87d1..34baf5c0f265 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -183,13 +183,12 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s)
int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
int old_max = sb_oid_maxsize(disk_sb);
struct reiserfs_super_block_v1 *disk_sb_v1;
- __le32 *objectid_map, *new_objectid_map;
+ __le32 *objectid_map;
int i;
disk_sb_v1 =
(struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
objectid_map = (__le32 *) (disk_sb_v1 + 1);
- new_objectid_map = (__le32 *) (disk_sb + 1);
if (cur_size > new_size) {
/*
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 9fed1c05f1f4..500f2000eb41 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -746,9 +746,6 @@ static void check_leaf_block_head(struct buffer_head *bh)
static void check_internal_block_head(struct buffer_head *bh)
{
- struct block_head *blkh;
-
- blkh = B_BLK_HEAD(bh);
if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 0037aea97d39..da9ebe33882b 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -593,7 +593,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,
struct buffer_head *bh;
struct path_element *last_element;
int node_level, retval;
- int right_neighbor_of_leaf_node;
int fs_gen;
struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
@@ -614,8 +613,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,
pathrelse(search_path);
- right_neighbor_of_leaf_node = 0;
-
/*
* With each iteration of this loop we search through the items in the
* current node, and calculate the next current node(next path element)
@@ -701,7 +698,6 @@ io_error:
*/
block_number = SB_ROOT_BLOCK(sb);
expected_level = -1;
- right_neighbor_of_leaf_node = 0;
/* repeat search from the root */
continue;
diff --git a/fs/statfs.c b/fs/statfs.c
index eea7af6f2f22..2616424012ea 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -318,19 +318,10 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *,
static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
{
struct compat_statfs64 buf;
- if (sizeof(ubuf->f_bsize) == 4) {
- if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
- kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
- return -EOVERFLOW;
- /* f_files and f_ffree may be -1; it's okay
- * to stuff that into 32 bits */
- if (kbuf->f_files != 0xffffffffffffffffULL
- && (kbuf->f_files & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- if (kbuf->f_ffree != 0xffffffffffffffffULL
- && (kbuf->f_ffree & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- }
+
+ if ((kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
+ return -EOVERFLOW;
+
memset(&buf, 0, sizeof(struct compat_statfs64));
buf.f_type = kbuf->f_type;
buf.f_bsize = kbuf->f_bsize;
diff --git a/fs/super.c b/fs/super.c
index f627b7c53d2b..cfadab2cbf35 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1300,6 +1300,7 @@ int get_tree_bdev(struct fs_context *fc,
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (bdev->bd_fsfreeze_count > 0) {
mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ blkdev_put(bdev, mode);
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
return -EBUSY;
}
@@ -1308,8 +1309,10 @@ int get_tree_bdev(struct fs_context *fc,
fc->sget_key = bdev;
s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
- if (IS_ERR(s))
+ if (IS_ERR(s)) {
+ blkdev_put(bdev, mode);
return PTR_ERR(s);
+ }
if (s->s_root) {
/* Don't summarily change the RO/RW state. */
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index eeeae0475da9..0caa151cae4e 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -16,6 +16,7 @@
#include <linux/namei.h>
#include <linux/tracefs.h>
#include <linux/fsnotify.h>
+#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/magic.h>
@@ -390,6 +391,9 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
struct dentry *dentry;
struct inode *inode;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
if (!(mode & S_IFMT))
mode |= S_IFREG;
BUG_ON(!S_ISREG(mode));
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index fe6d804a38dc..f9fd18670e22 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1272,21 +1272,23 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
}
static __always_inline int validate_range(struct mm_struct *mm,
- __u64 start, __u64 len)
+ __u64 *start, __u64 len)
{
__u64 task_size = mm->task_size;
- if (start & ~PAGE_MASK)
+ *start = untagged_addr(*start);
+
+ if (*start & ~PAGE_MASK)
return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
return -EINVAL;
- if (start < mmap_min_addr)
+ if (*start < mmap_min_addr)
return -EINVAL;
- if (start >= task_size)
+ if (*start >= task_size)
return -EINVAL;
- if (len > task_size - start)
+ if (len > task_size - *start)
return -EINVAL;
return 0;
}
@@ -1336,7 +1338,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
}
- ret = validate_range(mm, uffdio_register.range.start,
+ ret = validate_range(mm, &uffdio_register.range.start,
uffdio_register.range.len);
if (ret)
goto out;
@@ -1525,7 +1527,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
goto out;
- ret = validate_range(mm, uffdio_unregister.start,
+ ret = validate_range(mm, &uffdio_unregister.start,
uffdio_unregister.len);
if (ret)
goto out;
@@ -1676,7 +1678,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
goto out;
- ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+ ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
if (ret)
goto out;
@@ -1716,7 +1718,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+ ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
/*
@@ -1772,7 +1774,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
sizeof(uffdio_zeropage)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+ ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
if (ret)
goto out;
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 5de296b34ab1..14fbdf22b7e7 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -28,12 +28,11 @@ xfs_get_aghdr_buf(
struct xfs_mount *mp,
xfs_daddr_t blkno,
size_t numblks,
- int flags,
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
- bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+ bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0);
if (!bp)
return NULL;
@@ -345,7 +344,7 @@ xfs_ag_init_hdr(
{
struct xfs_buf *bp;
- bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops);
+ bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops);
if (!bp)
return -ENOMEM;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 58fa85cec325..d6ed5d2c07c2 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -81,9 +81,10 @@ typedef struct xfs_alloc_arg {
/*
* Defines for datatype
*/
-#define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */
-#define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */
-#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */
+#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
+#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */
static inline bool
xfs_alloc_is_userdata(int datatype)
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index b9f019603d0b..f0089e862216 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -826,32 +826,17 @@ xfs_attr_shortform_to_leaf(
sf = (xfs_attr_shortform_t *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
- xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+ xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
bp = NULL;
error = xfs_da_grow_inode(args, &blkno);
- if (error) {
- /*
- * If we hit an IO error middle of the transaction inside
- * grow_inode(), we may have inconsistent data. Bail out.
- */
- if (error == -EIO)
- goto out;
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
- memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ if (error)
goto out;
- }
ASSERT(blkno == 0);
error = xfs_attr3_leaf_create(args, blkno, &bp);
- if (error) {
- /* xfs_attr3_leaf_create may not have instantiated a block */
- if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0))
- goto out;
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
- memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ if (error)
goto out;
- }
memset((char *)&nargs, 0, sizeof(nargs));
nargs.dp = dp;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 054b4ce30033..02469d59c787 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -792,6 +792,7 @@ out_root_realloc:
*/
void
xfs_bmap_local_to_extents_empty(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
int whichfork)
{
@@ -808,6 +809,7 @@ xfs_bmap_local_to_extents_empty(
ifp->if_u1.if_root = NULL;
ifp->if_height = 0;
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -840,7 +842,7 @@ xfs_bmap_local_to_extents(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
if (!ifp->if_bytes) {
- xfs_bmap_local_to_extents_empty(ip, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags = XFS_ILOG_CORE;
goto done;
}
@@ -887,7 +889,7 @@ xfs_bmap_local_to_extents(
/* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
- xfs_bmap_local_to_extents_empty(ip, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags |= XFS_ILOG_CORE;
ifp->if_u1.if_root = NULL;
@@ -4042,8 +4044,12 @@ xfs_bmapi_allocate(
*/
if (!(bma->flags & XFS_BMAPI_METADATA)) {
bma->datatype = XFS_ALLOC_NOBUSY;
- if (whichfork == XFS_DATA_FORK && bma->offset == 0)
- bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+ if (whichfork == XFS_DATA_FORK) {
+ if (bma->offset == 0)
+ bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+ else
+ bma->datatype |= XFS_ALLOC_USERDATA;
+ }
if (bma->flags & XFS_BMAPI_ZERO)
bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
}
@@ -5621,6 +5627,11 @@ xfs_bmse_merge(
if (error)
return error;
+ /* change to extent format if required after extent removal */
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork);
+ if (error)
+ return error;
+
done:
xfs_iext_remove(ip, icur, 0);
xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 5bb446d80542..e2798c6f3a5f 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -182,7 +182,8 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
-void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork);
void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, const struct xfs_owner_info *oinfo,
bool skip_discard);
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 9595ced393dc..49e4bc39e7bb 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -1096,7 +1096,7 @@ xfs_dir2_sf_to_block(
memcpy(sfp, oldsfp, ifp->if_bytes);
xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
- xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
+ xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK);
dp->i_d.di_size = 0;
/*
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 39dd2b908106..e9371a8e0e26 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -366,11 +366,11 @@ struct xfs_bulkstat {
uint64_t bs_blocks; /* number of blocks */
uint64_t bs_xflags; /* extended flags */
- uint64_t bs_atime; /* access time, seconds */
- uint64_t bs_mtime; /* modify time, seconds */
+ int64_t bs_atime; /* access time, seconds */
+ int64_t bs_mtime; /* modify time, seconds */
- uint64_t bs_ctime; /* inode change time, seconds */
- uint64_t bs_btime; /* creation time, seconds */
+ int64_t bs_ctime; /* inode change time, seconds */
+ int64_t bs_btime; /* creation time, seconds */
uint32_t bs_gen; /* generation count */
uint32_t bs_uid; /* user id */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a08dd8f40346..ac6cdca63e15 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -928,7 +928,7 @@ xfs_log_sb(
xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
- xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
}
/*
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index a43d1813c4ff..5533e48e605d 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -97,7 +97,6 @@ xchk_allocbt_rec(
xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agblock_t bno;
xfs_extlen_t len;
- int error = 0;
bno = be32_to_cpu(rec->alloc.ar_startblock);
len = be32_to_cpu(rec->alloc.ar_blockcount);
@@ -109,7 +108,7 @@ xchk_allocbt_rec(
xchk_allocbt_xref(bs->sc, bno, len);
- return error;
+ return 0;
}
/* Scrub the freespace btrees for some AG. */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 93b3793bc5b3..0cab11a5d390 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -341,7 +341,6 @@ xchk_refcountbt_rec(
xfs_extlen_t len;
xfs_nlink_t refcount;
bool has_cowflag;
- int error = 0;
bno = be32_to_cpu(rec->refc.rc_startblock);
len = be32_to_cpu(rec->refc.rc_blockcount);
@@ -366,7 +365,7 @@ xchk_refcountbt_rec(
xchk_refcountbt_xref(bs->sc, bno, len, refcount);
- return error;
+ return 0;
}
/* Make sure we have as many refc blocks as the rmap says. */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0910cb75b65d..4f443703065e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -864,6 +864,7 @@ xfs_alloc_file_space(
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
xfs_fileoff_t startoffset_fsb;
+ xfs_fileoff_t endoffset_fsb;
int nimaps;
int quota_flag;
int rt;
@@ -891,7 +892,8 @@ xfs_alloc_file_space(
imapp = &imaps[0];
nimaps = 1;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
- allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+ endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
+ allocatesize_fsb = endoffset_fsb - startoffset_fsb;
/*
* Allocate file space until done or until there is an error
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 120ef99d09e8..0abba171aa89 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -345,6 +345,15 @@ xfs_buf_allocate_memory(
unsigned short page_count, i;
xfs_off_t start, end;
int error;
+ xfs_km_flags_t kmflag_mask = 0;
+
+ /*
+ * assure zeroed buffer for non-read cases.
+ */
+ if (!(flags & XBF_READ)) {
+ kmflag_mask |= KM_ZERO;
+ gfp_mask |= __GFP_ZERO;
+ }
/*
* for buffers that are contained within a single page, just allocate
@@ -354,7 +363,8 @@ xfs_buf_allocate_memory(
size = BBTOB(bp->b_length);
if (size < PAGE_SIZE) {
int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
- bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS);
+ bp->b_addr = kmem_alloc_io(size, align_mask,
+ KM_NOFS | kmflag_mask);
if (!bp->b_addr) {
/* low memory - use alloc_page loop instead */
goto use_alloc_page;
@@ -2097,7 +2107,7 @@ xfs_verify_magic(
int idx;
idx = xfs_sb_version_hascrc(&mp->m_sb);
- if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
+ if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
return false;
return dmagic == bp->b_ops->magic[idx];
}
@@ -2115,7 +2125,7 @@ xfs_verify_magic16(
int idx;
idx = xfs_sb_version_hascrc(&mp->m_sb);
- if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
+ if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
return false;
return dmagic == bp->b_ops->magic16[idx];
}
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a2beee9f74da..641d07f30a27 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1443,7 +1443,7 @@ xlog_alloc_log(
prev_iclog = iclog;
iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
- KM_MAYFAIL);
+ KM_MAYFAIL | KM_ZERO);
if (!iclog->ic_data)
goto out_free_iclog;
#ifdef DEBUG
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 508319039dce..c1a514ffff55 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -127,7 +127,7 @@ xlog_alloc_buffer(
if (nbblks > 1 && log->l_sectBBsize > 1)
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL);
+ return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
}
/*
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index ddd0bf7a4740..f1bc88f4367c 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -63,19 +63,6 @@ static const struct sysfs_ops xfs_sysfs_ops = {
.store = xfs_sysfs_object_store,
};
-/*
- * xfs_mount kobject. The mp kobject also serves as the per-mount parent object
- * that is identified by the fsname under sysfs.
- */
-
-static inline struct xfs_mount *
-to_mp(struct kobject *kobject)
-{
- struct xfs_kobj *kobj = to_kobj(kobject);
-
- return container_of(kobj, struct xfs_mount, m_kobj);
-}
-
static struct attribute *xfs_mp_attrs[] = {
NULL,
};