aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2019-11-11 08:34:59 +0100
committerIngo Molnar <mingo@kernel.org>2019-11-11 08:34:59 +0100
commit6d5a763c303bc9d78b17361d30b692ba2facf9b4 (patch)
tree1ba7059e8b1c47eb075acc906f8ef01f9c490ba1 /fs
parentleds: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM (diff)
parentLinux 5.4-rc7 (diff)
downloadlinux-dev-6d5a763c303bc9d78b17361d30b692ba2facf9b4.tar.xz
linux-dev-6d5a763c303bc9d78b17361d30b692ba2facf9b4.zip
Merge tag 'v5.4-rc7' into sched/core, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/binfmt_elf.c13
-rw-r--r--fs/btrfs/block-group.c1
-rw-r--r--fs/btrfs/ctree.h5
-rw-r--r--fs/btrfs/delalloc-space.c7
-rw-r--r--fs/btrfs/disk-io.c6
-rw-r--r--fs/btrfs/file.c56
-rw-r--r--fs/btrfs/inode-map.c4
-rw-r--r--fs/btrfs/inode.c30
-rw-r--r--fs/btrfs/ioctl.c12
-rw-r--r--fs/btrfs/qgroup.c4
-rw-r--r--fs/btrfs/ref-verify.c2
-rw-r--r--fs/btrfs/relocation.c9
-rw-r--r--fs/btrfs/send.c2
-rw-r--r--fs/btrfs/space-info.c21
-rw-r--r--fs/btrfs/tree-checker.c8
-rw-r--r--fs/btrfs/tree-log.c36
-rw-r--r--fs/btrfs/volumes.c7
-rw-r--r--fs/ceph/caps.c10
-rw-r--r--fs/ceph/dir.c15
-rw-r--r--fs/ceph/file.c15
-rw-r--r--fs/ceph/inode.c1
-rw-r--r--fs/ceph/mds_client.c21
-rw-r--r--fs/ceph/super.c11
-rw-r--r--fs/cifs/cifsfs.c30
-rw-r--r--fs/cifs/cifsglob.h7
-rw-r--r--fs/cifs/cifsproto.h1
-rw-r--r--fs/cifs/connect.c22
-rw-r--r--fs/cifs/dir.c8
-rw-r--r--fs/cifs/file.c62
-rw-r--r--fs/cifs/inode.c8
-rw-r--r--fs/cifs/netmisc.c4
-rw-r--r--fs/cifs/smb1ops.c3
-rw-r--r--fs/cifs/smb2file.c2
-rw-r--r--fs/cifs/smb2ops.c3
-rw-r--r--fs/cifs/smb2pdu.c14
-rw-r--r--fs/cifs/smb2pdu.h1
-rw-r--r--fs/cifs/smb2proto.h4
-rw-r--r--fs/cifs/transport.c42
-rw-r--r--fs/configfs/symlink.c2
-rw-r--r--fs/dax.c5
-rw-r--r--fs/direct-io.c3
-rw-r--r--fs/erofs/data.c10
-rw-r--r--fs/erofs/super.c4
-rw-r--r--fs/erofs/zdata.c12
-rw-r--r--fs/fs-writeback.c20
-rw-r--r--fs/fuse/Makefile3
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/dir.c16
-rw-r--r--fs/fuse/file.c14
-rw-r--r--fs/fuse/fuse_i.h4
-rw-r--r--fs/fuse/inode.c4
-rw-r--r--fs/fuse/virtio_fs.c169
-rw-r--r--fs/gfs2/ops_fstype.c21
-rw-r--r--fs/io_uring.c368
-rw-r--r--fs/libfs.c140
-rw-r--r--fs/nfs/delegation.c12
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/direct.c106
-rw-r--r--fs/nfs/nfs4proc.c8
-rw-r--r--fs/nfs/write.c5
-rw-r--r--fs/ocfs2/aops.c25
-rw-r--r--fs/ocfs2/file.c136
-rw-r--r--fs/ocfs2/ioctl.c2
-rw-r--r--fs/ocfs2/journal.c3
-rw-r--r--fs/ocfs2/localalloc.c3
-rw-r--r--fs/ocfs2/xattr.c56
-rw-r--r--fs/proc/meminfo.c4
-rw-r--r--fs/proc/page.c28
-rw-r--r--fs/readdir.c128
-rw-r--r--fs/statfs.c17
-rw-r--r--fs/super.c5
-rw-r--r--fs/tracefs/inode.c46
-rw-r--r--fs/xfs/libxfs/xfs_ag.c5
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c21
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c2
-rw-r--r--fs/xfs/libxfs/xfs_fs.h8
-rw-r--r--fs/xfs/scrub/refcount.c3
-rw-r--r--fs/xfs/xfs_bmap_util.c4
-rw-r--r--fs/xfs/xfs_buf.c12
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_log_recover.c2
83 files changed, 1152 insertions, 807 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index ad4c6b1d5074..c5642bcb6b46 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -879,7 +879,7 @@ out_free_interp:
the correct location in memory. */
for(i = 0, elf_ppnt = elf_phdata;
i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
- int elf_prot, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE;
+ int elf_prot, elf_flags;
unsigned long k, vaddr;
unsigned long total_size = 0;
@@ -911,13 +911,6 @@ out_free_interp:
*/
}
}
-
- /*
- * Some binaries have overlapping elf segments and then
- * we have to forcefully map over an existing mapping
- * e.g. over this newly established brk mapping.
- */
- elf_fixed = MAP_FIXED;
}
elf_prot = make_prot(elf_ppnt->p_flags);
@@ -930,7 +923,7 @@ out_free_interp:
* the ET_DYN load_addr calculations, proceed normally.
*/
if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
- elf_flags |= elf_fixed;
+ elf_flags |= MAP_FIXED;
} else if (loc->elf_ex.e_type == ET_DYN) {
/*
* This logic is run once for the first LOAD Program
@@ -966,7 +959,7 @@ out_free_interp:
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
- elf_flags |= elf_fixed;
+ elf_flags |= MAP_FIXED;
} else
load_bias = 0;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index bf7e3f23bba7..670700cb1110 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1761,6 +1761,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_err(info,
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
cache->key.objectid);
+ btrfs_put_block_group(cache);
ret = -EINVAL;
goto error;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 19d669d12ca1..fe2b8765d9e6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -734,8 +734,6 @@ struct btrfs_fs_info {
struct btrfs_workqueue *fixup_workers;
struct btrfs_workqueue *delayed_workers;
- /* the extent workers do delayed refs on the extent allocation tree */
- struct btrfs_workqueue *extent_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
u32 thread_pool_size;
@@ -2489,8 +2487,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
int nitems, bool use_global_rsv);
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
- bool qgroup_free);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index d949d7d2abed..db9f2c58eb4a 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -381,7 +381,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
out_qgroup:
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
out_fail:
- btrfs_inode_rsv_release(inode, true);
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
return ret;
@@ -418,7 +417,6 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
* btrfs_delalloc_release_extents - release our outstanding_extents
* @inode: the inode to balance the reservation for.
* @num_bytes: the number of bytes we originally reserved with
- * @qgroup_free: do we need to free qgroup meta reservation or convert them.
*
* When we reserve space we increase outstanding_extents for the extents we may
* add. Once we've set the range as delalloc or created our ordered extents we
@@ -426,8 +424,7 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
* temporarily tracked outstanding_extents. This _must_ be used in conjunction
* with btrfs_delalloc_reserve_metadata.
*/
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
- bool qgroup_free)
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned num_extents;
@@ -441,7 +438,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
if (btrfs_is_testing(fs_info))
return;
- btrfs_inode_rsv_release(inode, qgroup_free);
+ btrfs_inode_rsv_release(inode, true);
}
/**
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 044981cf6df9..402b61bf345c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2008,7 +2008,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
- btrfs_destroy_workqueue(fs_info->extent_workers);
/*
* Now that all other work queues are destroyed, we can safely destroy
* the queues used for metadata I/O, since tasks from those other work
@@ -2214,10 +2213,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
- fs_info->extent_workers =
- btrfs_alloc_workqueue(fs_info, "extent-refs", flags,
- min_t(u64, fs_devices->num_devices,
- max_active), 8);
if (!(fs_info->workers && fs_info->delalloc_workers &&
fs_info->submit_workers && fs_info->flush_workers &&
@@ -2228,7 +2223,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->readahead_workers &&
fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->extent_workers &&
fs_info->qgroup_rescan_workers)) {
return -ENOMEM;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8fe4eb7e5045..435a502a3226 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1591,7 +1591,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
- struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
u64 lockstart;
@@ -1611,6 +1610,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
return -ENOMEM;
while (iov_iter_count(i) > 0) {
+ struct extent_state *cached_state = NULL;
size_t offset = offset_in_page(pos);
size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
@@ -1692,7 +1692,7 @@ again:
force_page_uptodate);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes, true);
+ reserve_bytes);
break;
}
@@ -1704,7 +1704,7 @@ again:
if (extents_locked == -EAGAIN)
goto again;
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes, true);
+ reserve_bytes);
ret = extents_locked;
break;
}
@@ -1758,11 +1758,21 @@ again:
if (copied > 0)
ret = btrfs_dirty_pages(inode, pages, dirty_pages,
pos, copied, &cached_state);
+
+ /*
+ * If we have not locked the extent range, because the range's
+ * start offset is >= i_size, we might still have a non-NULL
+ * cached extent state, acquired while marking the extent range
+ * as delalloc through btrfs_dirty_pages(). Therefore free any
+ * possible cached extent state to avoid a memory leak.
+ */
if (extents_locked)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state);
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
- true);
+ else
+ free_extent_state(cached_state);
+
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
btrfs_drop_pages(pages, num_pages);
break;
@@ -2057,25 +2067,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
- u64 len;
- /*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges, and assertion failures during
- * hole detection.
- */
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
-
- /*
- * The range length can be represented by u64, we have to do the typecasts
- * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
- */
- len = (u64)end - (u64)start + 1;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
@@ -2102,6 +2094,19 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
+ * If the inode needs a full sync, make sure we use a full range to
+ * avoid log tree corruption, due to hole detection racing with ordered
+ * extent completion for adjacent ranges, and assertion failures during
+ * hole detection. Do this while holding the inode lock, to avoid races
+ * with other tasks.
+ */
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ start = 0;
+ end = LLONG_MAX;
+ }
+
+ /*
* Before we acquired the inode's lock, someone may have dirtied more
* pages in the target range. We need to make sure that writeback for
* any such pages does not start while we are logging the inode, because
@@ -2128,8 +2133,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
/*
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
+ *
+ * Also, the range length can be represented by u64, we have to do the
+ * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
*/
- ret = btrfs_wait_ordered_range(inode, start, len);
+ ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
if (ret) {
up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 63cad7865d75..37345fb6191d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -501,13 +501,13 @@ again:
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true);
goto out_put;
}
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
out_put:
iput(inode);
out_release:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0546401bc0a..c6dc4dd16cf7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -474,6 +474,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
u64 start = async_chunk->start;
u64 end = async_chunk->end;
u64 actual_end;
+ u64 i_size;
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
@@ -488,7 +489,19 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
- actual_end = min_t(u64, i_size_read(inode), end + 1);
+ /*
+ * We need to save i_size before now because it could change in between
+ * us evaluating the size and assigning it. This is because we lock and
+ * unlock the page in truncate and fallocate, and then modify the i_size
+ * later on.
+ *
+ * The barriers are to emulate READ_ONCE, remove that once i_size_read
+ * does that for us.
+ */
+ barrier();
+ i_size = i_size_read(inode);
+ barrier();
+ actual_end = min_t(u64, i_size, end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@ -2206,7 +2219,7 @@ again:
ClearPageChecked(page);
set_page_dirty(page);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
@@ -4951,7 +4964,7 @@ again:
if (!page) {
btrfs_delalloc_release_space(inode, data_reserved,
block_start, blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
ret = -ENOMEM;
goto out;
}
@@ -5018,7 +5031,7 @@ out_unlock:
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
unlock_page(page);
put_page(page);
out:
@@ -6305,13 +6318,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
u32 sizes[2];
int nitems = name ? 2 : 1;
unsigned long ptr;
+ unsigned int nofs_flag;
int ret;
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
+ nofs_flag = memalloc_nofs_save();
inode = new_inode(fs_info->sb);
+ memalloc_nofs_restore(nofs_flag);
if (!inode) {
btrfs_free_path(path);
return ERR_PTR(-ENOMEM);
@@ -8706,7 +8722,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, data_reserved,
offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), count);
}
out:
if (wakeup)
@@ -9056,7 +9072,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
if (!ret2) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
@@ -9065,7 +9081,7 @@ again:
out_unlock:
unlock_page(page);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0));
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
btrfs_delalloc_release_space(inode, data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index de730e56d3f5..23272d9154f3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1360,8 +1360,7 @@ again:
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
- false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return i_done;
out:
@@ -1372,8 +1371,7 @@ out:
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
- true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return ret;
@@ -4197,9 +4195,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
u64 transid;
int ret;
- btrfs_warn(root->fs_info,
- "START_SYNC ioctl is deprecated and will be removed in kernel 5.7");
-
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT)
@@ -4227,9 +4222,6 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
{
u64 transid;
- btrfs_warn(fs_info,
- "WAIT_SYNC ioctl is deprecated and will be removed in kernel 5.7");
-
if (argp) {
if (copy_from_user(&transid, argp, sizeof(transid)))
return -EFAULT;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c4bb69941c77..3ad151655eb8 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3629,7 +3629,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, type, (s64)num_bytes);
+ trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
ret = qgroup_reserve(root, num_bytes, enforce, type);
if (ret < 0)
return ret;
@@ -3676,7 +3676,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
*/
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
+ trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
num_bytes, type);
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index e87cbdad02a3..b57f3618e58e 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -500,7 +500,7 @@ static int process_leaf(struct btrfs_root *root,
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
u32 count;
- int i = 0, tree_block_level = 0, ret;
+ int i = 0, tree_block_level = 0, ret = 0;
struct btrfs_key key;
int nritems = btrfs_header_nritems(leaf);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 00504657b602..5cd42b66818c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3277,6 +3277,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!page) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ PAGE_SIZE);
ret = -ENOMEM;
goto out;
}
@@ -3297,7 +3299,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE, true);
+ PAGE_SIZE);
ret = -EIO;
goto out;
}
@@ -3326,7 +3328,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE, true);
+ PAGE_SIZE);
clear_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end,
@@ -3342,8 +3344,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
put_page(page);
index++;
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE,
- false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f3215028235c..123ac54af071 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5085,7 +5085,7 @@ static int clone_range(struct send_ctx *sctx,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
- u64 clone_src_i_size;
+ u64 clone_src_i_size = 0;
/*
* Prevent cloning from a zero offset with a length matching the sector
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 98dc092a905e..e8a4b0ebe97f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -893,6 +893,15 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
while (ticket->bytes > 0 && ticket->error == 0) {
ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
if (ret) {
+ /*
+ * Delete us from the list. After we unlock the space
+ * info, we don't want the async reclaim job to reserve
+ * space for this ticket. If that would happen, then the
+ * ticket's task would not known that space was reserved
+ * despite getting an error, resulting in a space leak
+ * (bytes_may_use counter of our space_info).
+ */
+ list_del_init(&ticket->list);
ticket->error = -EINTR;
break;
}
@@ -945,12 +954,24 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
spin_lock(&space_info->lock);
ret = ticket->error;
if (ticket->bytes || ticket->error) {
+ /*
+ * Need to delete here for priority tickets. For regular tickets
+ * either the async reclaim job deletes the ticket from the list
+ * or we delete it ourselves at wait_reserve_ticket().
+ */
list_del_init(&ticket->list);
if (!ret)
ret = -ENOSPC;
}
spin_unlock(&space_info->lock);
ASSERT(list_empty(&ticket->list));
+ /*
+ * Check that we can't have an error set if the reservation succeeded,
+ * as that would confuse tasks and lead them to error out without
+ * releasing reserved space (if an error happens the expectation is that
+ * space wasn't reserved at all).
+ */
+ ASSERT(!(ticket->bytes == 0 && ticket->error));
return ret;
}
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 43e488f5d063..076d5b8014fb 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -686,9 +686,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot,
static int check_dev_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
- struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_dev_item *ditem;
- u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK);
if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) {
dev_item_err(leaf, slot,
@@ -696,12 +694,6 @@ static int check_dev_item(struct extent_buffer *leaf,
key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
return -EUCLEAN;
}
- if (key->offset > max_devid) {
- dev_item_err(leaf, slot,
- "invalid devid: has=%llu expect=[0, %llu]",
- key->offset, max_devid);
- return -EUCLEAN;
- }
ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
if (btrfs_device_id(leaf, ditem) != key->offset) {
dev_item_err(leaf, slot,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 29b82a795522..8a6cc600bf18 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2932,7 +2932,8 @@ out:
* in the tree of log roots
*/
static int update_log_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *log)
+ struct btrfs_root *log,
+ struct btrfs_root_item *root_item)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret;
@@ -2940,10 +2941,10 @@ static int update_log_root(struct btrfs_trans_handle *trans,
if (log->log_transid == 1) {
/* insert root item on the first sync */
ret = btrfs_insert_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
} else {
ret = btrfs_update_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
}
return ret;
}
@@ -3041,6 +3042,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ struct btrfs_root_item new_root_item;
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
@@ -3104,18 +3106,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * We _must_ update under the root->log_mutex in order to make sure we
+ * have a consistent view of the log root we are trying to commit at
+ * this moment.
+ *
+ * We _must_ copy this into a local copy, because we are not holding the
+ * log_root_tree->log_mutex yet. This is important because when we
+ * commit the log_root_tree we must have a consistent view of the
+ * log_root_tree when we update the super block to point at the
+ * log_root_tree bytenr. If we update the log_root_tree here we'll race
+ * with the commit and possibly point at the new block which we may not
+ * have written out.
+ */
btrfs_set_root_node(&log->root_item, log->node);
+ memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
- * Update or create log root item under the root's log_mutex to prevent
- * races with concurrent log syncs that can lead to failure to update
- * log root item because it was not created yet.
- */
- ret = update_log_root(trans, log);
- /*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
* new positions. so it's safe to allow log writers to go in.
@@ -3135,6 +3145,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
mutex_lock(&log_root_tree->log_mutex);
+
+ /*
+ * Now we are safe to update the log_root_tree because we're under the
+ * log_mutex, and we're a current writer so we're holding the commit
+ * open until we drop the log_mutex.
+ */
+ ret = update_log_root(trans, log, &new_root_item);
+
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
/* atomic_dec_and_test implies a barrier */
cond_wake_up_nomb(&log_root_tree->log_writer_wait);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cdd7af424033..e04409f85063 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3845,7 +3845,11 @@ static int alloc_profile_is_valid(u64 flags, int extended)
return !extended; /* "0" is valid for usual profiles */
/* true if exactly one bit set */
- return is_power_of_2(flags);
+ /*
+ * Don't use is_power_of_2(unsigned long) because it won't work
+ * for the single profile (1ULL << 48) on 32-bit CPUs.
+ */
+ return flags != 0 && (flags & (flags - 1)) == 0;
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
@@ -4963,6 +4967,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
+ devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
} else {
btrfs_err(info, "invalid chunk type 0x%llx requested",
type);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d3b9c9d5c1bd..f5a38910a82b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1058,6 +1058,11 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+ /* remove from inode's cap rbtree, and clear auth cap */
+ rb_erase(&cap->ci_node, &ci->i_caps);
+ if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
/* remove from session list */
spin_lock(&session->s_cap_lock);
if (session->s_cap_iterator == cap) {
@@ -1091,11 +1096,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
spin_unlock(&session->s_cap_lock);
- /* remove from inode list */
- rb_erase(&cap->ci_node, &ci->i_caps);
- if (ci->i_auth_cap == cap)
- ci->i_auth_cap = NULL;
-
if (removed)
ceph_put_cap(mdsc, cap);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4ca0b8ff9a72..d17a789fd856 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1553,36 +1553,37 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
int valid = 0;
struct dentry *parent;
- struct inode *dir;
+ struct inode *dir, *inode;
if (flags & LOOKUP_RCU) {
parent = READ_ONCE(dentry->d_parent);
dir = d_inode_rcu(parent);
if (!dir)
return -ECHILD;
+ inode = d_inode_rcu(dentry);
} else {
parent = dget_parent(dentry);
dir = d_inode(parent);
+ inode = d_inode(dentry);
}
dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
- dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
+ dentry, inode, ceph_dentry(dentry)->offset);
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
- dentry, d_inode(dentry));
+ dentry, inode);
valid = 1;
- } else if (d_really_is_positive(dentry) &&
- ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
+ } else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
valid = 1;
} else {
valid = dentry_lease_is_valid(dentry, flags);
if (valid == -ECHILD)
return valid;
if (valid || dir_lease_is_valid(dir, dentry)) {
- if (d_really_is_positive(dentry))
- valid = ceph_is_any_caps(d_inode(dentry));
+ if (inode)
+ valid = ceph_is_any_caps(inode);
else
valid = 1;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d277f71abe0b..bd77adb64bfd 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -462,6 +462,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_security_init_secctx(dentry, mode, &as_ctx);
if (err < 0)
goto out_ctx;
+ } else if (!d_in_lookup(dentry)) {
+ /* If it's not being looked up, it's negative */
+ return -ENOENT;
}
/* do the open */
@@ -1956,10 +1959,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
return -EOPNOTSUPP;
+ /*
+ * Striped file layouts require that we copy partial objects, but the
+ * OSD copy-from operation only supports full-object copies. Limit
+ * this to non-striped file layouts for now.
+ */
if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
- (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) ||
- (src_ci->i_layout.object_size != dst_ci->i_layout.object_size))
+ (src_ci->i_layout.stripe_count != 1) ||
+ (dst_ci->i_layout.stripe_count != 1) ||
+ (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
+ dout("Invalid src/dst files layout\n");
return -EOPNOTSUPP;
+ }
if (len < src_ci->i_layout.object_size)
return -EOPNOTSUPP; /* no remote copy will be done */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9f135624ae47..c07407586ce8 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1434,6 +1434,7 @@ retry_lookup:
dout(" final dn %p\n", dn);
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
req->r_op == CEPH_MDS_OP_MKSNAP) &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
struct inode *dir = req->r_parent;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a8a8f84f3bbf..a5163296d9d9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -384,8 +384,8 @@ static int parse_reply_info_readdir(void **p, void *end,
}
done:
- if (*p != end)
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
bad:
@@ -406,12 +406,10 @@ static int parse_reply_info_filelock(void **p, void *end,
goto bad;
info->filelock_reply = *p;
- *p += sizeof(*info->filelock_reply);
- if (unlikely(*p != end))
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
-
bad:
return -EIO;
}
@@ -425,18 +423,21 @@ static int parse_reply_info_create(void **p, void *end,
{
if (features == (u64)-1 ||
(features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
+ /* Malformed reply? */
if (*p == end) {
info->has_create_ino = false;
} else {
info->has_create_ino = true;
- info->ino = ceph_decode_64(p);
+ ceph_decode_64_safe(p, end, info->ino, bad);
}
+ } else {
+ if (*p != end)
+ goto bad;
}
- if (unlikely(*p != end))
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
-
bad:
return -EIO;
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index edfd643a8205..b47f43fc2d68 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -268,6 +268,7 @@ static int parse_fsopt_token(char *c, void *private)
}
break;
case Opt_fscache_uniq:
+#ifdef CONFIG_CEPH_FSCACHE
kfree(fsopt->fscache_uniq);
fsopt->fscache_uniq = kstrndup(argstr[0].from,
argstr[0].to-argstr[0].from,
@@ -276,7 +277,10 @@ static int parse_fsopt_token(char *c, void *private)
return -ENOMEM;
fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
break;
- /* misc */
+#else
+ pr_err("fscache support is disabled\n");
+ return -EINVAL;
+#endif
case Opt_wsize:
if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE)
return -EINVAL;
@@ -353,10 +357,15 @@ static int parse_fsopt_token(char *c, void *private)
fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
break;
case Opt_fscache:
+#ifdef CONFIG_CEPH_FSCACHE
fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
kfree(fsopt->fscache_uniq);
fsopt->fscache_uniq = NULL;
break;
+#else
+ pr_err("fscache support is disabled\n");
+ return -EINVAL;
+#endif
case Opt_nofscache:
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
kfree(fsopt->fscache_uniq);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2e9c7f493f99..1a135d1b85bd 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -169,18 +169,32 @@ cifs_read_super(struct super_block *sb)
else
sb->s_maxbytes = MAX_NON_LFS;
- /* BB FIXME fix time_gran to be larger for LANMAN sessions */
- sb->s_time_gran = 100;
-
- if (tcon->unix_ext) {
- ts = cifs_NTtimeToUnix(0);
+ /*
+ * Some very old servers like DOS and OS/2 used 2 second granularity
+ * (while all current servers use 100ns granularity - see MS-DTYP)
+ * but 1 second is the maximum allowed granularity for the VFS
+ * so for old servers set time granularity to 1 second while for
+ * everything else (current servers) set it to 100ns.
+ */
+ if ((tcon->ses->server->vals->protocol_id == SMB10_PROT_ID) &&
+ ((tcon->ses->capabilities &
+ tcon->ses->server->vals->cap_nt_find) == 0) &&
+ !tcon->unix_ext) {
+ sb->s_time_gran = 1000000000; /* 1 second is max allowed gran */
+ ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0);
sb->s_time_min = ts.tv_sec;
- ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX));
+ ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX),
+ cpu_to_le16(SMB_TIME_MAX), 0);
sb->s_time_max = ts.tv_sec;
} else {
- ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0);
+ /*
+ * Almost every server, including all SMB2+, uses DCE TIME
+ * ie 100 nanosecond units, since 1601. See MS-DTYP and MS-FSCC
+ */
+ sb->s_time_gran = 100;
+ ts = cifs_NTtimeToUnix(0);
sb->s_time_min = ts.tv_sec;
- ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX), cpu_to_le16(SMB_TIME_MAX), 0);
+ ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX));
sb->s_time_max = ts.tv_sec;
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 2e960e1049db..d78bfcc19156 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1210,7 +1210,7 @@ struct cifs_search_info {
bool smallBuf:1; /* so we know which buf_release function to call */
};
-#define ACL_NO_MODE -1
+#define ACL_NO_MODE ((umode_t)(-1))
struct cifs_open_parms {
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
@@ -1391,6 +1391,11 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
struct cifsInodeInfo {
bool can_cache_brlcks;
struct list_head llist; /* locks helb by this inode */
+ /*
+ * NOTE: Some code paths call down_read(lock_sem) twice, so
+ * we must always use use cifs_down_write() instead of down_write()
+ * for this semaphore to avoid deadlocks.
+ */
struct rw_semaphore lock_sem; /* protect the fields above */
/* BB add in lists for dirty pages i.e. write caching info for oplock */
struct list_head openFileList;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e53e9f62b87b..fe597d3d5208 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -170,6 +170,7 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
+extern void cifs_down_write(struct rw_semaphore *sem);
extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid,
struct file *file,
struct tcon_link *tlink,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2850c3ce4391..ccaa8bad336f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -564,9 +564,11 @@ cifs_reconnect(struct TCP_Server_Info *server)
spin_lock(&GlobalMid_Lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+ kref_get(&mid_entry->refcount);
if (mid_entry->mid_state == MID_REQUEST_SUBMITTED)
mid_entry->mid_state = MID_RETRY_NEEDED;
list_move(&mid_entry->qhead, &retry_list);
+ mid_entry->mid_flags |= MID_DELETED;
}
spin_unlock(&GlobalMid_Lock);
mutex_unlock(&server->srv_mutex);
@@ -576,6 +578,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
+ cifs_mid_q_entry_release(mid_entry);
}
if (cifs_rdma_enabled(server)) {
@@ -895,8 +898,10 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
if (mid->mid_flags & MID_DELETED)
printk_once(KERN_WARNING
"trying to dequeue a deleted mid\n");
- else
+ else {
list_del_init(&mid->qhead);
+ mid->mid_flags |= MID_DELETED;
+ }
spin_unlock(&GlobalMid_Lock);
}
@@ -966,8 +971,10 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid);
+ kref_get(&mid_entry->refcount);
mid_entry->mid_state = MID_SHUTDOWN;
list_move(&mid_entry->qhead, &dispose_list);
+ mid_entry->mid_flags |= MID_DELETED;
}
spin_unlock(&GlobalMid_Lock);
@@ -977,6 +984,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
+ cifs_mid_q_entry_release(mid_entry);
}
/* 1/8th of sec is more than enough time for them to exit */
msleep(125);
@@ -3882,8 +3890,12 @@ generic_ip_connect(struct TCP_Server_Info *server)
rc = socket->ops->connect(socket, saddr, slen,
server->noblockcnt ? O_NONBLOCK : 0);
-
- if (rc == -EINPROGRESS)
+ /*
+ * When mounting SMB root file systems, we do not want to block in
+ * connect. Otherwise bail out and then let cifs_reconnect() perform
+ * reconnect failover - if possible.
+ */
+ if (server->noblockcnt && rc == -EINPROGRESS)
rc = 0;
if (rc < 0) {
cifs_dbg(FYI, "Error %d connecting to server\n", rc);
@@ -4264,7 +4276,7 @@ static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
server->ops->qfs_tcon(*xid, tcon);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) {
if (tcon->fsDevInfo.DeviceCharacteristics &
- FILE_READ_ONLY_DEVICE)
+ cpu_to_le32(FILE_READ_ONLY_DEVICE))
cifs_dbg(VFS, "mounted to read only share\n");
else if ((cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_RW_CACHE) == 0)
@@ -4445,7 +4457,7 @@ static int setup_dfs_tgt_conn(const char *path,
int rc;
struct dfs_info3_param ref = {0};
char *mdata = NULL, *fake_devname = NULL;
- struct smb_vol fake_vol = {0};
+ struct smb_vol fake_vol = {NULL};
cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index dd5ac841aefa..7ce689d31aa2 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -738,10 +738,16 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
static int
cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
+ struct inode *inode;
+
if (flags & LOOKUP_RCU)
return -ECHILD;
if (d_really_is_positive(direntry)) {
+ inode = d_inode(direntry);
+ if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode)))
+ CIFS_I(inode)->time = 0; /* force reval */
+
if (cifs_revalidate_dentry(direntry))
return 0;
else {
@@ -752,7 +758,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
* attributes will have been updated by
* cifs_revalidate_dentry().
*/
- if (IS_AUTOMOUNT(d_inode(direntry)) &&
+ if (IS_AUTOMOUNT(inode) &&
!(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) {
spin_lock(&direntry->d_lock);
direntry->d_flags |= DCACHE_NEED_AUTOMOUNT;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4b95700c507c..fa7b0fa72bb3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -253,6 +253,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
xid, fid);
+ if (rc) {
+ server->ops->close(xid, tcon, fid);
+ if (rc == -ESTALE)
+ rc = -EOPENSTALE;
+ }
+
out:
kfree(buf);
return rc;
@@ -275,6 +281,13 @@ cifs_has_mand_locks(struct cifsInodeInfo *cinode)
return has_locks;
}
+void
+cifs_down_write(struct rw_semaphore *sem)
+{
+ while (!down_write_trylock(sem))
+ msleep(10);
+}
+
struct cifsFileInfo *
cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock)
@@ -300,7 +313,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
INIT_LIST_HEAD(&fdlocks->locks);
fdlocks->cfile = cfile;
cfile->llist = fdlocks;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_add(&fdlocks->llist, &cinode->llist);
up_write(&cinode->lock_sem);
@@ -399,10 +412,11 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
bool oplock_break_cancelled;
spin_lock(&tcon->open_file_lock);
-
+ spin_lock(&cifsi->open_file_lock);
spin_lock(&cifs_file->file_info_lock);
if (--cifs_file->count > 0) {
spin_unlock(&cifs_file->file_info_lock);
+ spin_unlock(&cifsi->open_file_lock);
spin_unlock(&tcon->open_file_lock);
return;
}
@@ -415,9 +429,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
cifs_add_pending_open_locked(&fid, cifs_file->tlink, &open);
/* remove it from the lists */
- spin_lock(&cifsi->open_file_lock);
list_del(&cifs_file->flist);
- spin_unlock(&cifsi->open_file_lock);
list_del(&cifs_file->tlist);
atomic_dec(&tcon->num_local_opens);
@@ -434,6 +446,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
cifs_set_oplock_level(cifsi, 0);
}
+ spin_unlock(&cifsi->open_file_lock);
spin_unlock(&tcon->open_file_lock);
oplock_break_cancelled = wait_oplock_handler ?
@@ -458,7 +471,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
* Delete any outstanding lock records. We'll lose them when the file
* is closed anyway.
*/
- down_write(&cifsi->lock_sem);
+ cifs_down_write(&cifsi->lock_sem);
list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
list_del(&li->llist);
cifs_del_lock_waiters(li);
@@ -1021,7 +1034,7 @@ static void
cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
{
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_add_tail(&lock->llist, &cfile->llist->locks);
up_write(&cinode->lock_sem);
}
@@ -1043,7 +1056,7 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
try_again:
exist = false;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
lock->type, lock->flags, &conf_lock,
@@ -1066,7 +1079,7 @@ try_again:
(lock->blist.next == &lock->blist));
if (!rc)
goto try_again;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_del_init(&lock->blist);
}
@@ -1119,7 +1132,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock)
return rc;
try_again:
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
if (!cinode->can_cache_brlcks) {
up_write(&cinode->lock_sem);
return rc;
@@ -1325,7 +1338,7 @@ cifs_push_locks(struct cifsFileInfo *cfile)
int rc = 0;
/* we are going to update can_cache_brlcks here - need a write access */
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
if (!cinode->can_cache_brlcks) {
up_write(&cinode->lock_sem);
return rc;
@@ -1516,7 +1529,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
if (!buf)
return -ENOMEM;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
for (i = 0; i < 2; i++) {
cur = buf;
num = 0;
@@ -1840,13 +1853,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
{
struct cifsFileInfo *open_file = NULL;
struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
fsuid_only = false;
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
/* we could simply get the first_list_entry since write-only entries
are always at the end of the list but since the first entry might
have a close pending, we go through the whole list */
@@ -1858,7 +1870,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
/* found a good file */
/* lock it so it will not be closed on us */
cifsFileInfo_get(open_file);
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return open_file;
} /* else might as well continue, and look for
another, or simply have the caller reopen it
@@ -1866,7 +1878,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
} else /* write only file */
break; /* write only files are last so must be done */
}
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return NULL;
}
@@ -1877,7 +1889,6 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
{
struct cifsFileInfo *open_file, *inv_file = NULL;
struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *tcon;
bool any_available = false;
int rc = -EBADF;
unsigned int refind = 0;
@@ -1897,16 +1908,15 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
}
cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
- tcon = cifs_sb_master_tcon(cifs_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
fsuid_only = false;
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
refind_writable:
if (refind > MAX_REOPEN_ATT) {
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return rc;
}
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
@@ -1918,7 +1928,7 @@ refind_writable:
if (!open_file->invalidHandle) {
/* found a good writable file */
cifsFileInfo_get(open_file);
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
*ret_file = open_file;
return 0;
} else {
@@ -1938,7 +1948,7 @@ refind_writable:
cifsFileInfo_get(inv_file);
}
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
if (inv_file) {
rc = cifs_reopen_file(inv_file, false);
@@ -1953,7 +1963,7 @@ refind_writable:
cifsFileInfo_put(inv_file);
++refind;
inv_file = NULL;
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
goto refind_writable;
}
@@ -4461,17 +4471,15 @@ static int cifs_readpage(struct file *file, struct page *page)
static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
{
struct cifsFileInfo *open_file;
- struct cifs_tcon *tcon =
- cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb));
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return 1;
}
}
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return 0;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 3bae2e53f0b8..df9377828e2f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -414,6 +414,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
/* if uniqueid is different, return error */
if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) {
+ CIFS_I(*pinode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgiiu_exit;
}
@@ -421,6 +422,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
/* if filetype is different, return error */
if (unlikely(((*pinode)->i_mode & S_IFMT) !=
(fattr.cf_mode & S_IFMT))) {
+ CIFS_I(*pinode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgiiu_exit;
}
@@ -933,6 +935,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
/* if uniqueid is different, return error */
if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
+ CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgii_exit;
}
@@ -940,6 +943,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
/* if filetype is different, return error */
if (unlikely(((*inode)->i_mode & S_IFMT) !=
(fattr.cf_mode & S_IFMT))) {
+ CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgii_exit;
}
@@ -2471,9 +2475,9 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid);
cifsFileInfo_put(wfile);
if (rc)
- return rc;
+ goto cifs_setattr_exit;
} else if (rc != -EBADF)
- return rc;
+ goto cifs_setattr_exit;
else
rc = 0;
}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 49c17ee18254..9b41436fb8db 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -117,10 +117,6 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = {
{0, 0}
};
-static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
- {0, 0}
-};
-
/*
* Convert a string containing text IPv4 or IPv6 address to binary form.
*
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index b7421a096319..514810694c0f 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -171,6 +171,9 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
/* we do not want to loop forever */
last_mid = cur_mid;
cur_mid++;
+ /* avoid 0xFFFF MID */
+ if (cur_mid == 0xffff)
+ cur_mid++;
/*
* This nested loop looks more expensive than it is.
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index e6a1fc72018f..8b0b512c5792 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -145,7 +145,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
cur = buf;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
if (flock->fl_start > li->offset ||
(flock->fl_start + length) <
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 4c0922596467..cd55af9b7cc5 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -4084,6 +4084,7 @@ free_pages:
kfree(dw->ppages);
cifs_small_buf_release(dw->buf);
+ kfree(dw);
}
@@ -4157,7 +4158,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
dw->server = server;
dw->ppages = pages;
dw->len = len;
- queue_work(cifsiod_wq, &dw->decrypt);
+ queue_work(decrypt_wq, &dw->decrypt);
*num_mids = 0; /* worker thread takes care of finding mid */
return -1;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 85f9d614d968..05149862aea4 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -751,8 +751,8 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
unsigned int num = *num_iovec;
iov[num].iov_base = create_posix_buf(mode);
- if (mode == -1)
- cifs_dbg(VFS, "illegal mode\n"); /* BB REMOVEME */
+ if (mode == ACL_NO_MODE)
+ cifs_dbg(FYI, "illegal mode\n");
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_posix);
@@ -2521,11 +2521,8 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock,
return rc;
}
- /* TODO: add handling for the mode on create */
- if (oparms->disposition == FILE_CREATE)
- cifs_dbg(VFS, "mode is 0x%x\n", oparms->mode); /* BB REMOVEME */
-
- if ((oparms->disposition == FILE_CREATE) && (oparms->mode != -1)) {
+ if ((oparms->disposition == FILE_CREATE) &&
+ (oparms->mode != ACL_NO_MODE)) {
if (n_iov > 2) {
struct create_context *ccontext =
(struct create_context *)iov[n_iov-1].iov_base;
@@ -3217,7 +3214,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
- req->OutputBufferLength = SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE;
+ req->OutputBufferLength =
+ cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE);
req->CompletionFilter = cpu_to_le32(completion_filter);
if (watch_tree)
req->Flags = cpu_to_le16(SMB2_WATCH_TREE);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index ea735d59c36e..0abfde6d0b05 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -838,6 +838,7 @@ struct create_durable_handle_reconnect_v2 {
struct create_context ccontext;
__u8 Name[8];
struct durable_reconnect_context_v2 dcontext;
+ __u8 Pad[4];
} __packed;
/* See MS-SMB2 2.2.13.2.5 */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index da3a6d580808..71b2930b8e0b 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -150,6 +150,10 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
bool is_fsctl, char *in_data, u32 indatalen,
__u32 max_response_size);
extern void SMB2_ioctl_free(struct smb_rqst *rqst);
+extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, bool watch_tree,
+ u32 completion_filter);
+
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 308ad0f495e1..ca3de62688d6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -86,22 +86,8 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
static void _cifs_mid_q_entry_release(struct kref *refcount)
{
- struct mid_q_entry *mid = container_of(refcount, struct mid_q_entry,
- refcount);
-
- mempool_free(mid, cifs_mid_poolp);
-}
-
-void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
-{
- spin_lock(&GlobalMid_Lock);
- kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
- spin_unlock(&GlobalMid_Lock);
-}
-
-void
-DeleteMidQEntry(struct mid_q_entry *midEntry)
-{
+ struct mid_q_entry *midEntry =
+ container_of(refcount, struct mid_q_entry, refcount);
#ifdef CONFIG_CIFS_STATS2
__le16 command = midEntry->server->vals->lock_cmd;
__u16 smb_cmd = le16_to_cpu(midEntry->command);
@@ -166,6 +152,19 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
}
}
#endif
+
+ mempool_free(midEntry, cifs_mid_poolp);
+}
+
+void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
+{
+ spin_lock(&GlobalMid_Lock);
+ kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
+ spin_unlock(&GlobalMid_Lock);
+}
+
+void DeleteMidQEntry(struct mid_q_entry *midEntry)
+{
cifs_mid_q_entry_release(midEntry);
}
@@ -173,8 +172,10 @@ void
cifs_delete_mid(struct mid_q_entry *mid)
{
spin_lock(&GlobalMid_Lock);
- list_del_init(&mid->qhead);
- mid->mid_flags |= MID_DELETED;
+ if (!(mid->mid_flags & MID_DELETED)) {
+ list_del_init(&mid->qhead);
+ mid->mid_flags |= MID_DELETED;
+ }
spin_unlock(&GlobalMid_Lock);
DeleteMidQEntry(mid);
@@ -872,7 +873,10 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
rc = -EHOSTDOWN;
break;
default:
- list_del_init(&mid->qhead);
+ if (!(mid->mid_flags & MID_DELETED)) {
+ list_del_init(&mid->qhead);
+ mid->mid_flags |= MID_DELETED;
+ }
cifs_server_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
__func__, mid->mid, mid->mid_state);
rc = -EIO;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index dc5dbf6a81d7..cb61467478ca 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -101,7 +101,7 @@ static int create_link(struct config_item *parent_item,
}
target_sd->s_links++;
spin_unlock(&configfs_dirent_lock);
- ret = configfs_get_target_path(item, item, body);
+ ret = configfs_get_target_path(parent_item, item, body);
if (!ret)
ret = configfs_create_link(target_sd, parent_item->ci_dentry,
dentry, body);
diff --git a/fs/dax.c b/fs/dax.c
index 6bf81f931de3..2cc43cd914eb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -220,10 +220,11 @@ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
for (;;) {
entry = xas_find_conflict(xas);
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+ return entry;
if (dax_entry_order(entry) < order)
return XA_RETRY_ENTRY;
- if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
- !dax_is_locked(entry))
+ if (!dax_is_locked(entry))
return entry;
wq = dax_entry_waitqueue(xas, entry, &ewait.key);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ae196784f487..9329ced91f1d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -241,9 +241,8 @@ void dio_warn_stale_pagecache(struct file *filp)
}
}
-/**
+/*
* dio_complete() - called when all DIO BIO I/O has been completed
- * @offset: the byte offset in the file of the completed operation
*
* This drops i_dio_count, lets interested parties know that a DIO operation
* has completed, and calculates the resulting return code for the operation.
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 8a9fcbd0e8ac..fc3a8d8064f8 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -34,11 +34,15 @@ static void erofs_readendio(struct bio *bio)
struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr)
{
- struct inode *const bd_inode = sb->s_bdev->bd_inode;
- struct address_space *const mapping = bd_inode->i_mapping;
+ struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct page *page;
- return read_cache_page_gfp(mapping, blkaddr,
+ page = read_cache_page_gfp(mapping, blkaddr,
mapping_gfp_constraint(mapping, ~__GFP_FS));
+ /* should already be PageUptodate */
+ if (!IS_ERR(page))
+ lock_page(page);
+ return page;
}
static int erofs_map_blocks_flatmode(struct inode *inode,
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index caf9a95173b0..0e369494f2f2 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -105,9 +105,9 @@ static int erofs_read_superblock(struct super_block *sb)
int ret;
page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL);
- if (!page) {
+ if (IS_ERR(page)) {
erofs_err(sb, "cannot read erofs superblock");
- return -EIO;
+ return PTR_ERR(page);
}
sbi = EROFS_SB(sb);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 96e34c90f814..fad80c97d247 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -575,7 +575,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct erofs_map_blocks *const map = &fe->map;
struct z_erofs_collector *const clt = &fe->clt;
const loff_t offset = page_offset(page);
- bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED);
+ bool tight = true;
enum z_erofs_cache_alloctype cache_strategy;
enum z_erofs_page_type page_type;
@@ -628,8 +628,16 @@ restart_now:
preload_compressed_pages(clt, MNGD_MAPPING(sbi),
cache_strategy, pagepool);
- tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED);
hitted:
+ /*
+ * Ensure the current partial page belongs to this submit chain rather
+ * than other concurrent submit chains or the noio(bypass) chain since
+ * those chains are handled asynchronously thus the page cannot be used
+ * for inplace I/O or pagevec (should be processed in strict order.)
+ */
+ tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
+ clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
+
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
zero_user_segment(page, cur, end);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8aaa7eec7b74..335607b8c5c0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -164,8 +164,13 @@ static void finish_writeback_work(struct bdi_writeback *wb,
if (work->auto_free)
kfree(work);
- if (done && atomic_dec_and_test(&done->cnt))
- wake_up_all(done->waitq);
+ if (done) {
+ wait_queue_head_t *waitq = done->waitq;
+
+ /* @done can't be accessed after the following dec */
+ if (atomic_dec_and_test(&done->cnt))
+ wake_up_all(waitq);
+ }
}
static void wb_queue_work(struct bdi_writeback *wb,
@@ -571,10 +576,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
spin_unlock(&inode->i_lock);
/*
- * A dying wb indicates that the memcg-blkcg mapping has changed
- * and a new wb is already serving the memcg. Switch immediately.
+ * A dying wb indicates that either the blkcg associated with the
+ * memcg changed or the associated memcg is dying. In the first
+ * case, a replacement wb should already be available and we should
+ * refresh the wb immediately. In the second case, trying to
+ * refresh will keep failing.
*/
- if (unlikely(wb_dying(wbc->wb)))
+ if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
inode_switch_wbs(inode, wbc->wb_id);
}
EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
@@ -900,7 +908,7 @@ restart:
* cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
* @bdi_id: target bdi id
* @memcg_id: target memcg css id
- * @nr_pages: number of pages to write, 0 for best-effort dirty flushing
+ * @nr: number of pages to write, 0 for best-effort dirty flushing
* @reason: reason why some writeback work initiated
* @done: target wb_completion
*
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 6419a2b3510d..3e8cebfb59b7 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -5,6 +5,7 @@
obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
-obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o
+obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
+virtiofs-y += virtio_fs.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index dadd617d826c..ed1abc9e33cf 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -276,10 +276,12 @@ static void flush_bg_queue(struct fuse_conn *fc)
void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
{
struct fuse_iqueue *fiq = &fc->iq;
- bool async = req->args->end;
+ bool async;
if (test_and_set_bit(FR_FINISHED, &req->flags))
goto put_request;
+
+ async = req->args->end;
/*
* test_and_set_bit() implies smp_mb() between bit
* changing and below intr_entry check. Pairs with
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d572c900bb0f..54d638f9ba1c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -405,7 +405,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
else
fuse_invalidate_entry_cache(entry);
- fuse_advise_use_readdirplus(dir);
+ if (inode)
+ fuse_advise_use_readdirplus(dir);
return newent;
out_iput:
@@ -1521,6 +1522,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
is_truncate = true;
}
+ /* Flush dirty data/metadata before non-truncate SETATTR */
+ if (is_wb && S_ISREG(inode->i_mode) &&
+ attr->ia_valid &
+ (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET |
+ ATTR_TIMES_SET)) {
+ err = write_inode_now(inode, true);
+ if (err)
+ return err;
+
+ fuse_set_nowrite(inode);
+ fuse_release_nowrite(inode);
+ }
+
if (is_truncate) {
fuse_set_nowrite(inode);
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0f0225686aee..db48a5cf8620 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -217,7 +217,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
- bool lock_inode = (file->f_flags & O_TRUNC) &&
+ bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc &&
fc->writeback_cache;
@@ -225,16 +225,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (err)
return err;
- if (lock_inode)
+ if (is_wb_truncate) {
inode_lock(inode);
+ fuse_set_nowrite(inode);
+ }
err = fuse_do_open(fc, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
- if (lock_inode)
+ if (is_wb_truncate) {
+ fuse_release_nowrite(inode);
inode_unlock(inode);
+ }
return err;
}
@@ -1997,7 +2001,7 @@ static int fuse_writepages_fill(struct page *page,
if (!data->ff) {
err = -EIO;
- data->ff = fuse_write_file_get(fc, get_fuse_inode(inode));
+ data->ff = fuse_write_file_get(fc, fi);
if (!data->ff)
goto out_unlock;
}
@@ -2042,8 +2046,6 @@ static int fuse_writepages_fill(struct page *page,
* under writeback, so we can release the page lock.
*/
if (data->wpa == NULL) {
- struct fuse_inode *fi = get_fuse_inode(inode);
-
err = -ENOMEM;
wpa = fuse_writepage_args_alloc();
if (!wpa) {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 956aeaf961ae..d148188cfca4 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -479,6 +479,7 @@ struct fuse_fs_context {
bool destroy:1;
bool no_control:1;
bool no_force_umount:1;
+ bool no_mount_options:1;
unsigned int max_read;
unsigned int blksize;
const char *subtype;
@@ -713,6 +714,9 @@ struct fuse_conn {
/** Do not allow MNT_FORCE umount */
unsigned int no_force_umount:1;
+ /* Do not show mount options */
+ unsigned int no_mount_options:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e040e2a2b621..16aec32f7f3d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -558,6 +558,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
struct super_block *sb = root->d_sb;
struct fuse_conn *fc = get_fuse_conn_super(sb);
+ if (fc->no_mount_options)
+ return 0;
+
seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
if (fc->default_permissions)
@@ -1180,6 +1183,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
fc->destroy = ctx->destroy;
fc->no_control = ctx->no_control;
fc->no_force_umount = ctx->no_force_umount;
+ fc->no_mount_options = ctx->no_mount_options;
err = -ENOMEM;
root = fuse_get_root_inode(sb, ctx->rootmode);
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 6af3f131e468..a5c86048b96e 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -30,6 +30,7 @@ struct virtio_fs_vq {
struct virtqueue *vq; /* protected by ->lock */
struct work_struct done_work;
struct list_head queued_reqs;
+ struct list_head end_reqs; /* End these requests */
struct delayed_work dispatch_work;
struct fuse_dev *fud;
bool connected;
@@ -54,6 +55,9 @@ struct virtio_fs_forget {
struct list_head list;
};
+static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
+ struct fuse_req *req, bool in_flight);
+
static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
{
struct virtio_fs *fs = vq->vdev->priv;
@@ -66,6 +70,19 @@ static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq)
return &vq_to_fsvq(vq)->fud->pq;
}
+/* Should be called with fsvq->lock held. */
+static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq)
+{
+ fsvq->in_flight++;
+}
+
+/* Should be called with fsvq->lock held. */
+static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq)
+{
+ WARN_ON(fsvq->in_flight <= 0);
+ fsvq->in_flight--;
+}
+
static void release_virtio_fs_obj(struct kref *ref)
{
struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount);
@@ -109,22 +126,6 @@ static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
flush_delayed_work(&fsvq->dispatch_work);
}
-static inline void drain_hiprio_queued_reqs(struct virtio_fs_vq *fsvq)
-{
- struct virtio_fs_forget *forget;
-
- spin_lock(&fsvq->lock);
- while (1) {
- forget = list_first_entry_or_null(&fsvq->queued_reqs,
- struct virtio_fs_forget, list);
- if (!forget)
- break;
- list_del(&forget->list);
- kfree(forget);
- }
- spin_unlock(&fsvq->lock);
-}
-
static void virtio_fs_drain_all_queues(struct virtio_fs *fs)
{
struct virtio_fs_vq *fsvq;
@@ -132,9 +133,6 @@ static void virtio_fs_drain_all_queues(struct virtio_fs *fs)
for (i = 0; i < fs->nvqs; i++) {
fsvq = &fs->vqs[i];
- if (i == VQ_HIPRIO)
- drain_hiprio_queued_reqs(fsvq);
-
virtio_fs_drain_queue(fsvq);
}
}
@@ -253,14 +251,66 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work)
while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
kfree(req);
- fsvq->in_flight--;
+ dec_in_flight_req(fsvq);
}
} while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
spin_unlock(&fsvq->lock);
}
-static void virtio_fs_dummy_dispatch_work(struct work_struct *work)
+static void virtio_fs_request_dispatch_work(struct work_struct *work)
{
+ struct fuse_req *req;
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ dispatch_work.work);
+ struct fuse_conn *fc = fsvq->fud->fc;
+ int ret;
+
+ pr_debug("virtio-fs: worker %s called.\n", __func__);
+ while (1) {
+ spin_lock(&fsvq->lock);
+ req = list_first_entry_or_null(&fsvq->end_reqs, struct fuse_req,
+ list);
+ if (!req) {
+ spin_unlock(&fsvq->lock);
+ break;
+ }
+
+ list_del_init(&req->list);
+ spin_unlock(&fsvq->lock);
+ fuse_request_end(fc, req);
+ }
+
+ /* Dispatch pending requests */
+ while (1) {
+ spin_lock(&fsvq->lock);
+ req = list_first_entry_or_null(&fsvq->queued_reqs,
+ struct fuse_req, list);
+ if (!req) {
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+ list_del_init(&req->list);
+ spin_unlock(&fsvq->lock);
+
+ ret = virtio_fs_enqueue_req(fsvq, req, true);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOSPC) {
+ spin_lock(&fsvq->lock);
+ list_add_tail(&req->list, &fsvq->queued_reqs);
+ schedule_delayed_work(&fsvq->dispatch_work,
+ msecs_to_jiffies(1));
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+ req->out.h.error = ret;
+ spin_lock(&fsvq->lock);
+ dec_in_flight_req(fsvq);
+ spin_unlock(&fsvq->lock);
+ pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n",
+ ret);
+ fuse_request_end(fc, req);
+ }
+ }
}
static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
@@ -286,6 +336,7 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
list_del(&forget->list);
if (!fsvq->connected) {
+ dec_in_flight_req(fsvq);
spin_unlock(&fsvq->lock);
kfree(forget);
continue;
@@ -307,13 +358,13 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
} else {
pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
ret);
+ dec_in_flight_req(fsvq);
kfree(forget);
}
spin_unlock(&fsvq->lock);
return;
}
- fsvq->in_flight++;
notify = virtqueue_kick_prepare(vq);
spin_unlock(&fsvq->lock);
@@ -452,7 +503,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
fuse_request_end(fc, req);
spin_lock(&fsvq->lock);
- fsvq->in_flight--;
+ dec_in_flight_req(fsvq);
spin_unlock(&fsvq->lock);
}
}
@@ -502,6 +553,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
+ INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs);
INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
virtio_fs_hiprio_dispatch_work);
spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
@@ -511,8 +563,9 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
spin_lock_init(&fs->vqs[i].lock);
INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
- virtio_fs_dummy_dispatch_work);
+ virtio_fs_request_dispatch_work);
INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
+ INIT_LIST_HEAD(&fs->vqs[i].end_reqs);
snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
"requests.%u", i - VQ_REQUEST);
callbacks[i] = virtio_fs_vq_done;
@@ -708,6 +761,7 @@ __releases(fiq->lock)
list_add_tail(&forget->list, &fsvq->queued_reqs);
schedule_delayed_work(&fsvq->dispatch_work,
msecs_to_jiffies(1));
+ inc_in_flight_req(fsvq);
} else {
pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
ret);
@@ -717,7 +771,7 @@ __releases(fiq->lock)
goto out;
}
- fsvq->in_flight++;
+ inc_in_flight_req(fsvq);
notify = virtqueue_kick_prepare(vq);
spin_unlock(&fsvq->lock);
@@ -819,7 +873,7 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg,
/* Add a request to a virtqueue and kick the device */
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
- struct fuse_req *req)
+ struct fuse_req *req, bool in_flight)
{
/* requests need at least 4 elements */
struct scatterlist *stack_sgs[6];
@@ -835,6 +889,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
unsigned int i;
int ret;
bool notify;
+ struct fuse_pqueue *fpq;
/* Does the sglist fit on the stack? */
total_sgs = sg_count_fuse_req(req);
@@ -889,7 +944,17 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
goto out;
}
- fsvq->in_flight++;
+ /* Request successfully sent. */
+ fpq = &fsvq->fud->pq;
+ spin_lock(&fpq->lock);
+ list_add_tail(&req->list, fpq->processing);
+ spin_unlock(&fpq->lock);
+ set_bit(FR_SENT, &req->flags);
+ /* matches barrier in request_wait_answer() */
+ smp_mb__after_atomic();
+
+ if (!in_flight)
+ inc_in_flight_req(fsvq);
notify = virtqueue_kick_prepare(vq);
spin_unlock(&fsvq->lock);
@@ -915,9 +980,8 @@ __releases(fiq->lock)
{
unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
struct virtio_fs *fs;
- struct fuse_conn *fc;
struct fuse_req *req;
- struct fuse_pqueue *fpq;
+ struct virtio_fs_vq *fsvq;
int ret;
WARN_ON(list_empty(&fiq->pending));
@@ -928,44 +992,36 @@ __releases(fiq->lock)
spin_unlock(&fiq->lock);
fs = fiq->priv;
- fc = fs->vqs[queue_id].fud->fc;
pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
__func__, req->in.h.opcode, req->in.h.unique,
req->in.h.nodeid, req->in.h.len,
fuse_len_args(req->args->out_numargs, req->args->out_args));
- fpq = &fs->vqs[queue_id].fud->pq;
- spin_lock(&fpq->lock);
- if (!fpq->connected) {
- spin_unlock(&fpq->lock);
- req->out.h.error = -ENODEV;
- pr_err("virtio-fs: %s disconnected\n", __func__);
- fuse_request_end(fc, req);
- return;
- }
- list_add_tail(&req->list, fpq->processing);
- spin_unlock(&fpq->lock);
- set_bit(FR_SENT, &req->flags);
- /* matches barrier in request_wait_answer() */
- smp_mb__after_atomic();
-
-retry:
- ret = virtio_fs_enqueue_req(&fs->vqs[queue_id], req);
+ fsvq = &fs->vqs[queue_id];
+ ret = virtio_fs_enqueue_req(fsvq, req, false);
if (ret < 0) {
if (ret == -ENOMEM || ret == -ENOSPC) {
- /* Virtqueue full. Retry submission */
- /* TODO use completion instead of timeout */
- usleep_range(20, 30);
- goto retry;
+ /*
+ * Virtqueue full. Retry submission from worker
+ * context as we might be holding fc->bg_lock.
+ */
+ spin_lock(&fsvq->lock);
+ list_add_tail(&req->list, &fsvq->queued_reqs);
+ inc_in_flight_req(fsvq);
+ schedule_delayed_work(&fsvq->dispatch_work,
+ msecs_to_jiffies(1));
+ spin_unlock(&fsvq->lock);
+ return;
}
req->out.h.error = ret;
pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret);
- spin_lock(&fpq->lock);
- clear_bit(FR_SENT, &req->flags);
- list_del_init(&req->list);
- spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+
+ /* Can't end request in submission context. Use a worker */
+ spin_lock(&fsvq->lock);
+ list_add_tail(&req->list, &fsvq->end_reqs);
+ schedule_delayed_work(&fsvq->dispatch_work, 0);
+ spin_unlock(&fsvq->lock);
return;
}
}
@@ -992,6 +1048,7 @@ static int virtio_fs_fill_super(struct super_block *sb)
.destroy = true,
.no_control = true,
.no_force_umount = true,
+ .no_mount_options = true,
};
mutex_lock(&virtio_fs_mutex);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 681b44682b0d..18daf494abab 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1540,17 +1540,23 @@ static int gfs2_init_fs_context(struct fs_context *fc)
{
struct gfs2_args *args;
- args = kzalloc(sizeof(*args), GFP_KERNEL);
+ args = kmalloc(sizeof(*args), GFP_KERNEL);
if (args == NULL)
return -ENOMEM;
- args->ar_quota = GFS2_QUOTA_DEFAULT;
- args->ar_data = GFS2_DATA_DEFAULT;
- args->ar_commit = 30;
- args->ar_statfs_quantum = 30;
- args->ar_quota_quantum = 60;
- args->ar_errors = GFS2_ERRORS_DEFAULT;
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct gfs2_sbd *sdp = fc->root->d_sb->s_fs_info;
+ *args = sdp->sd_args;
+ } else {
+ memset(args, 0, sizeof(*args));
+ args->ar_quota = GFS2_QUOTA_DEFAULT;
+ args->ar_data = GFS2_DATA_DEFAULT;
+ args->ar_commit = 30;
+ args->ar_statfs_quantum = 30;
+ args->ar_quota_quantum = 60;
+ args->ar_errors = GFS2_ERRORS_DEFAULT;
+ }
fc->fs_private = args;
fc->ops = &gfs2_context_ops;
return 0;
@@ -1600,6 +1606,7 @@ static int gfs2_meta_get_tree(struct fs_context *fc)
}
static const struct fs_context_operations gfs2_meta_context_ops = {
+ .free = gfs2_fc_free,
.get_tree = gfs2_meta_get_tree,
};
diff --git a/fs/io_uring.c b/fs/io_uring.c
index aa8ac557493c..f9a38998f2fc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -197,6 +197,7 @@ struct io_ring_ctx {
unsigned sq_entries;
unsigned sq_mask;
unsigned sq_thread_idle;
+ unsigned cached_sq_dropped;
struct io_uring_sqe *sq_sqes;
struct list_head defer_list;
@@ -212,6 +213,7 @@ struct io_ring_ctx {
struct {
unsigned cached_cq_tail;
+ atomic_t cached_cq_overflow;
unsigned cq_entries;
unsigned cq_mask;
struct wait_queue_head cq_wait;
@@ -322,6 +324,8 @@ struct io_kiocb {
#define REQ_F_FAIL_LINK 256 /* fail rest of links */
#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */
#define REQ_F_TIMEOUT 1024 /* timeout request */
+#define REQ_F_ISREG 2048 /* regular file */
+#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
u64 user_data;
u32 result;
u32 sequence;
@@ -415,27 +419,28 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
return ctx;
}
+static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
+ + atomic_read(&ctx->cached_cq_overflow);
+}
+
static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
- /* timeout requests always honor sequence */
- if (!(req->flags & REQ_F_TIMEOUT) &&
- (req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
+ if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
return false;
- return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped;
+ return __io_sequence_defer(ctx, req);
}
-static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx,
- struct list_head *list)
+static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
- if (list_empty(list))
- return NULL;
-
- req = list_first_entry(list, struct io_kiocb, list);
- if (!io_sequence_defer(ctx, req)) {
+ req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
+ if (req && !io_sequence_defer(ctx, req)) {
list_del_init(&req->list);
return req;
}
@@ -443,14 +448,17 @@ static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx,
return NULL;
}
-static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
-{
- return __io_get_deferred_req(ctx, &ctx->defer_list);
-}
-
static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
{
- return __io_get_deferred_req(ctx, &ctx->timeout_list);
+ struct io_kiocb *req;
+
+ req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
+ if (req && !__io_sequence_defer(ctx, req)) {
+ list_del_init(&req->list);
+ return req;
+ }
+
+ return NULL;
}
static void __io_commit_cqring(struct io_ring_ctx *ctx)
@@ -562,9 +570,8 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, 0);
} else {
- unsigned overflow = READ_ONCE(ctx->rings->cq_overflow);
-
- WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1);
+ WRITE_ONCE(ctx->rings->cq_overflow,
+ atomic_inc_return(&ctx->cached_cq_overflow));
}
}
@@ -591,14 +598,6 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
io_cqring_ev_posted(ctx);
}
-static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
-{
- percpu_ref_put_many(&ctx->refs, refs);
-
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
-}
-
static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
@@ -646,7 +645,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
req->result = 0;
return req;
out:
- io_ring_drop_ctx_refs(ctx, 1);
+ percpu_ref_put(&ctx->refs);
return NULL;
}
@@ -654,7 +653,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
{
if (*nr) {
kmem_cache_free_bulk(req_cachep, *nr, reqs);
- io_ring_drop_ctx_refs(ctx, *nr);
+ percpu_ref_put_many(&ctx->refs, *nr);
*nr = 0;
}
}
@@ -663,7 +662,7 @@ static void __io_free_req(struct io_kiocb *req)
{
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
- io_ring_drop_ctx_refs(req->ctx, 1);
+ percpu_ref_put(&req->ctx->refs);
kmem_cache_free(req_cachep, req);
}
@@ -738,6 +737,14 @@ static unsigned io_cqring_events(struct io_rings *rings)
return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
}
+static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+{
+ struct io_rings *rings = ctx->rings;
+
+ /* make sure SQ entry isn't read before tail */
+ return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
+}
+
/*
* Find and free completed poll iocbs
*/
@@ -867,19 +874,11 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
- long min)
+static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+ long min)
{
- int iters, ret = 0;
+ int iters = 0, ret = 0;
- /*
- * We disallow the app entering submit/complete with polling, but we
- * still need to lock the ring to prevent racing with polled issue
- * that got punted to a workqueue.
- */
- mutex_lock(&ctx->uring_lock);
-
- iters = 0;
do {
int tmin = 0;
@@ -915,30 +914,45 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
ret = 0;
} while (min && !*nr_events && !need_resched());
+ return ret;
+}
+
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+ long min)
+{
+ int ret;
+
+ /*
+ * We disallow the app entering submit/complete with polling, but we
+ * still need to lock the ring to prevent racing with polled issue
+ * that got punted to a workqueue.
+ */
+ mutex_lock(&ctx->uring_lock);
+ ret = __io_iopoll_check(ctx, nr_events, min);
mutex_unlock(&ctx->uring_lock);
return ret;
}
-static void kiocb_end_write(struct kiocb *kiocb)
+static void kiocb_end_write(struct io_kiocb *req)
{
- if (kiocb->ki_flags & IOCB_WRITE) {
- struct inode *inode = file_inode(kiocb->ki_filp);
+ /*
+ * Tell lockdep we inherited freeze protection from submission
+ * thread.
+ */
+ if (req->flags & REQ_F_ISREG) {
+ struct inode *inode = file_inode(req->file);
- /*
- * Tell lockdep we inherited freeze protection from submission
- * thread.
- */
- if (S_ISREG(inode->i_mode))
- __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
- file_end_write(kiocb->ki_filp);
+ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
}
+ file_end_write(req->file);
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
- kiocb_end_write(kiocb);
+ if (kiocb->ki_flags & IOCB_WRITE)
+ kiocb_end_write(req);
if ((req->flags & REQ_F_LINK) && res != req->result)
req->flags |= REQ_F_FAIL_LINK;
@@ -950,7 +964,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
- kiocb_end_write(kiocb);
+ if (kiocb->ki_flags & IOCB_WRITE)
+ kiocb_end_write(req);
if ((req->flags & REQ_F_LINK) && res != req->result)
req->flags |= REQ_F_FAIL_LINK;
@@ -1064,8 +1079,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
if (!req->file)
return -EBADF;
- if (force_nonblock && !io_file_supports_async(req->file))
- force_nonblock = false;
+ if (S_ISREG(file_inode(req->file)->i_mode))
+ req->flags |= REQ_F_ISREG;
+
+ /*
+ * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
+ * we know to async punt it even if it was opened O_NONBLOCK
+ */
+ if (force_nonblock && !io_file_supports_async(req->file)) {
+ req->flags |= REQ_F_MUST_PUNT;
+ return -EAGAIN;
+ }
kiocb->ki_pos = READ_ONCE(sqe->off);
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
@@ -1086,7 +1110,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
return ret;
/* don't allow async punt if RWF_NOWAIT was requested */
- if (kiocb->ki_flags & IOCB_NOWAIT)
+ if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+ (req->file->f_flags & O_NONBLOCK))
req->flags |= REQ_F_NOWAIT;
if (force_nonblock)
@@ -1099,6 +1124,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
+ req->result = 0;
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
@@ -1387,7 +1413,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
* need async punt anyway, so it's more efficient to do it
* here.
*/
- if (force_nonblock && ret2 > 0 && ret2 < read_size)
+ if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
+ (req->flags & REQ_F_ISREG) &&
+ ret2 > 0 && ret2 < read_size)
ret2 = -EAGAIN;
/* Catch -EAGAIN return for forced non-blocking submission */
if (!force_nonblock || ret2 != -EAGAIN) {
@@ -1452,7 +1480,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
* released so that it doesn't complain about the held lock when
* we return to userspace.
*/
- if (S_ISREG(file_inode(file)->i_mode)) {
+ if (req->flags & REQ_F_ISREG) {
__sb_start_write(file_inode(file)->i_sb,
SB_FREEZE_WRITE, true);
__sb_writers_release(file_inode(file)->i_sb,
@@ -1867,7 +1895,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
struct io_ring_ctx *ctx;
- struct io_kiocb *req;
+ struct io_kiocb *req, *prev;
unsigned long flags;
req = container_of(timer, struct io_kiocb, timeout.timer);
@@ -1875,6 +1903,15 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
atomic_inc(&ctx->cq_timeouts);
spin_lock_irqsave(&ctx->completion_lock, flags);
+ /*
+ * Adjust the reqs sequence before the current one because it
+ * will consume a slot in the cq_ring and the the cq_tail pointer
+ * will be increased, otherwise other timeout reqs may return in
+ * advance without waiting for enough wait_nr.
+ */
+ prev = req;
+ list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
+ prev->sequence++;
list_del(&req->list);
io_cqring_fill_event(ctx, req->user_data, -ETIME);
@@ -1889,18 +1926,19 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- unsigned count, req_dist, tail_index;
+ unsigned count;
struct io_ring_ctx *ctx = req->ctx;
struct list_head *entry;
- struct timespec ts;
+ struct timespec64 ts;
+ unsigned span = 0;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
sqe->len != 1)
return -EINVAL;
- if (copy_from_user(&ts, (void __user *) (unsigned long) sqe->addr,
- sizeof(ts)))
+
+ if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
/*
@@ -1912,29 +1950,52 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
count = 1;
req->sequence = ctx->cached_sq_head + count - 1;
+ /* reuse it to store the count */
+ req->submit.sequence = count;
req->flags |= REQ_F_TIMEOUT;
/*
* Insertion sort, ensuring the first entry in the list is always
* the one we need first.
*/
- tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped;
- req_dist = req->sequence - tail_index;
spin_lock_irq(&ctx->completion_lock);
list_for_each_prev(entry, &ctx->timeout_list) {
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
- unsigned dist;
+ unsigned nxt_sq_head;
+ long long tmp, tmp_nxt;
- dist = nxt->sequence - tail_index;
- if (req_dist >= dist)
+ /*
+ * Since cached_sq_head + count - 1 can overflow, use type long
+ * long to store it.
+ */
+ tmp = (long long)ctx->cached_sq_head + count - 1;
+ nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
+ tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
+
+ /*
+ * cached_sq_head may overflow, and it will never overflow twice
+ * once there is some timeout req still be valid.
+ */
+ if (ctx->cached_sq_head < nxt_sq_head)
+ tmp += UINT_MAX;
+
+ if (tmp > tmp_nxt)
break;
+
+ /*
+ * Sequence of reqs after the insert one and itself should
+ * be adjusted because each timeout req consumes a slot.
+ */
+ span++;
+ nxt->sequence++;
}
+ req->sequence -= span;
list_add(&req->list, entry);
spin_unlock_irq(&ctx->completion_lock);
hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
req->timeout.timer.function = io_timeout_fn;
- hrtimer_start(&req->timeout.timer, timespec_to_ktime(ts),
+ hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
HRTIMER_MODE_REL);
return 0;
}
@@ -2267,12 +2328,18 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
}
static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- struct sqe_submit *s, bool force_nonblock)
+ struct sqe_submit *s)
{
int ret;
- ret = __io_submit_sqe(ctx, req, s, force_nonblock);
- if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
+ ret = __io_submit_sqe(ctx, req, s, true);
+
+ /*
+ * We async punt it if the file wasn't marked NOWAIT, or if the file
+ * doesn't support non-blocking read/write attempts
+ */
+ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
+ (req->flags & REQ_F_MUST_PUNT))) {
struct io_uring_sqe *sqe_copy;
sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
@@ -2312,7 +2379,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- struct sqe_submit *s, bool force_nonblock)
+ struct sqe_submit *s)
{
int ret;
@@ -2325,18 +2392,17 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
return 0;
}
- return __io_queue_sqe(ctx, req, s, force_nonblock);
+ return __io_queue_sqe(ctx, req, s);
}
static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
- struct sqe_submit *s, struct io_kiocb *shadow,
- bool force_nonblock)
+ struct sqe_submit *s, struct io_kiocb *shadow)
{
int ret;
int need_submit = false;
if (!shadow)
- return io_queue_sqe(ctx, req, s, force_nonblock);
+ return io_queue_sqe(ctx, req, s);
/*
* Mark the first IO in link list as DRAIN, let all the following
@@ -2348,6 +2414,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (ret) {
if (ret != -EIOCBQUEUED) {
io_free_req(req);
+ __io_free_req(shadow);
io_cqring_add_event(ctx, s->sqe->user_data, ret);
return 0;
}
@@ -2365,7 +2432,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
spin_unlock_irq(&ctx->completion_lock);
if (need_submit)
- return __io_queue_sqe(ctx, req, s, force_nonblock);
+ return __io_queue_sqe(ctx, req, s);
return 0;
}
@@ -2373,8 +2440,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
- struct io_submit_state *state, struct io_kiocb **link,
- bool force_nonblock)
+ struct io_submit_state *state, struct io_kiocb **link)
{
struct io_uring_sqe *sqe_copy;
struct io_kiocb *req;
@@ -2401,6 +2467,8 @@ err:
return;
}
+ req->user_data = s->sqe->user_data;
+
/*
* If we already have a head request, queue this one for async
* submittal once the head completes. If we don't have a head but
@@ -2427,7 +2495,7 @@ err:
INIT_LIST_HEAD(&req->link_list);
*link = req;
} else {
- io_queue_sqe(ctx, req, s, force_nonblock);
+ io_queue_sqe(ctx, req, s);
}
}
@@ -2507,12 +2575,13 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
/* drop invalid entries */
ctx->cached_sq_head++;
- rings->sq_dropped++;
+ ctx->cached_sq_dropped++;
+ WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
return false;
}
-static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
- unsigned int nr, bool has_user, bool mm_fault)
+static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
+ bool has_user, bool mm_fault)
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
@@ -2526,19 +2595,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
}
for (i = 0; i < nr; i++) {
+ struct sqe_submit s;
+
+ if (!io_get_sqring(ctx, &s))
+ break;
+
/*
* If previous wasn't linked and we have a linked command,
* that's the end of the chain. Submit the previous link.
*/
if (!prev_was_link && link) {
- io_queue_link_head(ctx, link, &link->submit, shadow_req,
- true);
+ io_queue_link_head(ctx, link, &link->submit, shadow_req);
link = NULL;
shadow_req = NULL;
}
- prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
+ prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
- if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) {
+ if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
if (!shadow_req) {
shadow_req = io_get_req(ctx, NULL);
if (unlikely(!shadow_req))
@@ -2546,24 +2619,24 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
refcount_dec(&shadow_req->refs);
}
- shadow_req->sequence = sqes[i].sequence;
+ shadow_req->sequence = s.sequence;
}
out:
if (unlikely(mm_fault)) {
- io_cqring_add_event(ctx, sqes[i].sqe->user_data,
+ io_cqring_add_event(ctx, s.sqe->user_data,
-EFAULT);
} else {
- sqes[i].has_user = has_user;
- sqes[i].needs_lock = true;
- sqes[i].needs_fixed_file = true;
- io_submit_sqe(ctx, &sqes[i], statep, &link, true);
+ s.has_user = has_user;
+ s.needs_lock = true;
+ s.needs_fixed_file = true;
+ io_submit_sqe(ctx, &s, statep, &link);
submitted++;
}
}
if (link)
- io_queue_link_head(ctx, link, &link->submit, shadow_req, true);
+ io_queue_link_head(ctx, link, &link->submit, shadow_req);
if (statep)
io_submit_state_end(&state);
@@ -2572,7 +2645,6 @@ out:
static int io_sq_thread(void *data)
{
- struct sqe_submit sqes[IO_IOPOLL_BATCH];
struct io_ring_ctx *ctx = data;
struct mm_struct *cur_mm = NULL;
mm_segment_t old_fs;
@@ -2587,14 +2659,27 @@ static int io_sq_thread(void *data)
timeout = inflight = 0;
while (!kthread_should_park()) {
- bool all_fixed, mm_fault = false;
- int i;
+ bool mm_fault = false;
+ unsigned int to_submit;
if (inflight) {
unsigned nr_events = 0;
if (ctx->flags & IORING_SETUP_IOPOLL) {
- io_iopoll_check(ctx, &nr_events, 0);
+ /*
+ * inflight is the count of the maximum possible
+ * entries we submitted, but it can be smaller
+ * if we dropped some of them. If we don't have
+ * poll entries available, then we know that we
+ * have nothing left to poll for. Reset the
+ * inflight count to zero in that case.
+ */
+ mutex_lock(&ctx->uring_lock);
+ if (!list_empty(&ctx->poll_list))
+ __io_iopoll_check(ctx, &nr_events, 0);
+ else
+ inflight = 0;
+ mutex_unlock(&ctx->uring_lock);
} else {
/*
* Normal IO, just pretend everything completed.
@@ -2608,7 +2693,8 @@ static int io_sq_thread(void *data)
timeout = jiffies + ctx->sq_thread_idle;
}
- if (!io_get_sqring(ctx, &sqes[0])) {
+ to_submit = io_sqring_entries(ctx);
+ if (!to_submit) {
/*
* We're polling. If we're within the defined idle
* period, then let us spin without work before going
@@ -2639,7 +2725,8 @@ static int io_sq_thread(void *data)
/* make sure to read SQ tail after writing flags */
smp_mb();
- if (!io_get_sqring(ctx, &sqes[0])) {
+ to_submit = io_sqring_entries(ctx);
+ if (!to_submit) {
if (kthread_should_park()) {
finish_wait(&ctx->sqo_wait, &wait);
break;
@@ -2657,19 +2744,8 @@ static int io_sq_thread(void *data)
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
}
- i = 0;
- all_fixed = true;
- do {
- if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
- all_fixed = false;
-
- i++;
- if (i == ARRAY_SIZE(sqes))
- break;
- } while (io_get_sqring(ctx, &sqes[i]));
-
/* Unless all new commands are FIXED regions, grab mm */
- if (!all_fixed && !cur_mm) {
+ if (!cur_mm) {
mm_fault = !mmget_not_zero(ctx->sqo_mm);
if (!mm_fault) {
use_mm(ctx->sqo_mm);
@@ -2677,8 +2753,9 @@ static int io_sq_thread(void *data)
}
}
- inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
- mm_fault);
+ to_submit = min(to_submit, ctx->sq_entries);
+ inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL,
+ mm_fault);
/* Commit SQ ring head once we've consumed all SQEs */
io_commit_sqring(ctx);
@@ -2695,8 +2772,7 @@ static int io_sq_thread(void *data)
return 0;
}
-static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
- bool block_for_last)
+static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
@@ -2710,7 +2786,6 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
}
for (i = 0; i < to_submit; i++) {
- bool force_nonblock = true;
struct sqe_submit s;
if (!io_get_sqring(ctx, &s))
@@ -2721,8 +2796,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
* that's the end of the chain. Submit the previous link.
*/
if (!prev_was_link && link) {
- io_queue_link_head(ctx, link, &link->submit, shadow_req,
- force_nonblock);
+ io_queue_link_head(ctx, link, &link->submit, shadow_req);
link = NULL;
shadow_req = NULL;
}
@@ -2744,27 +2818,16 @@ out:
s.needs_lock = false;
s.needs_fixed_file = false;
submit++;
-
- /*
- * The caller will block for events after submit, submit the
- * last IO non-blocking. This is either the only IO it's
- * submitting, or it already submitted the previous ones. This
- * improves performance by avoiding an async punt that we don't
- * need to do.
- */
- if (block_for_last && submit == to_submit)
- force_nonblock = false;
-
- io_submit_sqe(ctx, &s, statep, &link, force_nonblock);
+ io_submit_sqe(ctx, &s, statep, &link);
}
- io_commit_sqring(ctx);
if (link)
- io_queue_link_head(ctx, link, &link->submit, shadow_req,
- block_for_last);
+ io_queue_link_head(ctx, link, &link->submit, shadow_req);
if (statep)
io_submit_state_end(statep);
+ io_commit_sqring(ctx);
+
return submit;
}
@@ -2920,8 +2983,12 @@ static void io_finish_async(struct io_ring_ctx *ctx)
static void io_destruct_skb(struct sk_buff *skb)
{
struct io_ring_ctx *ctx = skb->sk->sk_user_data;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
+ if (ctx->sqo_wq[i])
+ flush_workqueue(ctx->sqo_wq[i]);
- io_finish_async(ctx);
unix_destruct_scm(skb);
}
@@ -3601,21 +3668,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
wake_up(&ctx->sqo_wait);
submitted = to_submit;
} else if (to_submit) {
- bool block_for_last = false;
-
to_submit = min(to_submit, ctx->sq_entries);
- /*
- * Allow last submission to block in a series, IFF the caller
- * asked to wait for events and we don't currently have
- * enough. This potentially avoids an async punt.
- */
- if (to_submit == min_complete &&
- io_cqring_events(ctx->rings) < min_complete)
- block_for_last = true;
-
mutex_lock(&ctx->uring_lock);
- submitted = io_ring_submit(ctx, to_submit, block_for_last);
+ submitted = io_ring_submit(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
}
if (flags & IORING_ENTER_GETEVENTS) {
@@ -3630,7 +3686,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
}
}
- io_ring_drop_ctx_refs(ctx, 1);
+ percpu_ref_put(&ctx->refs);
out_fput:
fdput(f);
return submitted ? submitted : ret;
@@ -3774,10 +3830,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
if (ret)
goto err;
- ret = io_uring_get_fd(ctx);
- if (ret < 0)
- goto err;
-
memset(&p->sq_off, 0, sizeof(p->sq_off));
p->sq_off.head = offsetof(struct io_rings, sq.head);
p->sq_off.tail = offsetof(struct io_rings, sq.tail);
@@ -3795,6 +3847,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
p->cq_off.cqes = offsetof(struct io_rings, cqes);
+ /*
+ * Install ring fd as the very last thing, so we don't risk someone
+ * having closed it before we finish setup
+ */
+ ret = io_uring_get_fd(ctx);
+ if (ret < 0)
+ goto err;
+
p->features = IORING_FEAT_SINGLE_MMAP;
return ret;
err:
diff --git a/fs/libfs.c b/fs/libfs.c
index c9b2850c0f7c..1463b038ffc4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -89,58 +89,45 @@ int dcache_dir_close(struct inode *inode, struct file *file)
EXPORT_SYMBOL(dcache_dir_close);
/* parent is locked at least shared */
-static struct dentry *next_positive(struct dentry *parent,
- struct list_head *from,
- int count)
+/*
+ * Returns an element of siblings' list.
+ * We are looking for <count>th positive after <p>; if
+ * found, dentry is grabbed and returned to caller.
+ * If no such element exists, NULL is returned.
+ */
+static struct dentry *scan_positives(struct dentry *cursor,
+ struct list_head *p,
+ loff_t count,
+ struct dentry *last)
{
- unsigned *seq = &parent->d_inode->i_dir_seq, n;
- struct dentry *res;
- struct list_head *p;
- bool skipped;
- int i;
+ struct dentry *dentry = cursor->d_parent, *found = NULL;
-retry:
- i = count;
- skipped = false;
- n = smp_load_acquire(seq) & ~1;
- res = NULL;
- rcu_read_lock();
- for (p = from->next; p != &parent->d_subdirs; p = p->next) {
+ spin_lock(&dentry->d_lock);
+ while ((p = p->next) != &dentry->d_subdirs) {
struct dentry *d = list_entry(p, struct dentry, d_child);
- if (!simple_positive(d)) {
- skipped = true;
- } else if (!--i) {
- res = d;
- break;
+ // we must at least skip cursors, to avoid livelocks
+ if (d->d_flags & DCACHE_DENTRY_CURSOR)
+ continue;
+ if (simple_positive(d) && !--count) {
+ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+ if (simple_positive(d))
+ found = dget_dlock(d);
+ spin_unlock(&d->d_lock);
+ if (likely(found))
+ break;
+ count = 1;
+ }
+ if (need_resched()) {
+ list_move(&cursor->d_child, p);
+ p = &cursor->d_child;
+ spin_unlock(&dentry->d_lock);
+ cond_resched();
+ spin_lock(&dentry->d_lock);
}
}
- rcu_read_unlock();
- if (skipped) {
- smp_rmb();
- if (unlikely(*seq != n))
- goto retry;
- }
- return res;
-}
-
-static void move_cursor(struct dentry *cursor, struct list_head *after)
-{
- struct dentry *parent = cursor->d_parent;
- unsigned n, *seq = &parent->d_inode->i_dir_seq;
- spin_lock(&parent->d_lock);
- for (;;) {
- n = *seq;
- if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
- break;
- cpu_relax();
- }
- __list_del(cursor->d_child.prev, cursor->d_child.next);
- if (after)
- list_add(&cursor->d_child, after);
- else
- list_add_tail(&cursor->d_child, &parent->d_subdirs);
- smp_store_release(seq, n + 2);
- spin_unlock(&parent->d_lock);
+ spin_unlock(&dentry->d_lock);
+ dput(last);
+ return found;
}
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
@@ -158,17 +145,25 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
if (offset != file->f_pos) {
+ struct dentry *cursor = file->private_data;
+ struct dentry *to = NULL;
+
+ inode_lock_shared(dentry->d_inode);
+
+ if (offset > 2)
+ to = scan_positives(cursor, &dentry->d_subdirs,
+ offset - 2, NULL);
+ spin_lock(&dentry->d_lock);
+ if (to)
+ list_move(&cursor->d_child, &to->d_child);
+ else
+ list_del_init(&cursor->d_child);
+ spin_unlock(&dentry->d_lock);
+ dput(to);
+
file->f_pos = offset;
- if (file->f_pos >= 2) {
- struct dentry *cursor = file->private_data;
- struct dentry *to;
- loff_t n = file->f_pos - 2;
-
- inode_lock_shared(dentry->d_inode);
- to = next_positive(dentry, &dentry->d_subdirs, n);
- move_cursor(cursor, to ? &to->d_child : NULL);
- inode_unlock_shared(dentry->d_inode);
- }
+
+ inode_unlock_shared(dentry->d_inode);
}
return offset;
}
@@ -190,25 +185,35 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct dentry *cursor = file->private_data;
- struct list_head *p = &cursor->d_child;
- struct dentry *next;
- bool moved = false;
+ struct list_head *anchor = &dentry->d_subdirs;
+ struct dentry *next = NULL;
+ struct list_head *p;
if (!dir_emit_dots(file, ctx))
return 0;
if (ctx->pos == 2)
- p = &dentry->d_subdirs;
- while ((next = next_positive(dentry, p, 1)) != NULL) {
+ p = anchor;
+ else if (!list_empty(&cursor->d_child))
+ p = &cursor->d_child;
+ else
+ return 0;
+
+ while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
d_inode(next)->i_ino, dt_type(d_inode(next))))
break;
- moved = true;
- p = &next->d_child;
ctx->pos++;
+ p = &next->d_child;
}
- if (moved)
- move_cursor(cursor, p);
+ spin_lock(&dentry->d_lock);
+ if (next)
+ list_move_tail(&cursor->d_child, &next->d_child);
+ else
+ list_del_init(&cursor->d_child);
+ spin_unlock(&dentry->d_lock);
+ dput(next);
+
return 0;
}
EXPORT_SYMBOL(dcache_readdir);
@@ -468,8 +473,7 @@ EXPORT_SYMBOL(simple_write_begin);
/**
* simple_write_end - .write_end helper for non-block-device FSes
- * @available: See .write_end of address_space_operations
- * @file: "
+ * @file: See .write_end of address_space_operations
* @mapping: "
* @pos: "
* @len: "
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 071b90a45933..af549d70ec50 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -53,6 +53,16 @@ nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
return false;
}
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (nfs4_is_valid_delegation(delegation, 0))
+ return delegation;
+ return NULL;
+}
+
static int
nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
{
@@ -1181,7 +1191,7 @@ bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
if (delegation != NULL &&
nfs4_stateid_match_other(dst, &delegation->stateid)) {
dst->seqid = delegation->stateid.seqid;
- return ret;
+ ret = true;
}
rcu_read_unlock();
out:
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 9eb87ae4c982..8b14d441e699 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -68,6 +68,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state,
bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred);
bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
int nfs4_check_delegation(struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 222d7115db71..040a50fd9bf3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -64,13 +64,6 @@
static struct kmem_cache *nfs_direct_cachep;
-/*
- * This represents a set of asynchronous requests that we're waiting on
- */
-struct nfs_direct_mirror {
- ssize_t count;
-};
-
struct nfs_direct_req {
struct kref kref; /* release manager */
@@ -84,9 +77,6 @@ struct nfs_direct_req {
atomic_t io_count; /* i/os we're waiting for */
spinlock_t lock; /* protect completion state */
- struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
- int mirror_count;
-
loff_t io_start; /* Start offset for I/O */
ssize_t count, /* bytes actually processed */
max_count, /* max expected count */
@@ -123,32 +113,42 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
}
static void
-nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
+nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
+ const struct nfs_pgio_header *hdr,
+ ssize_t dreq_len)
{
- int i;
- ssize_t count;
+ if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
+ test_bit(NFS_IOHDR_EOF, &hdr->flags)))
+ return;
+ if (dreq->max_count >= dreq_len) {
+ dreq->max_count = dreq_len;
+ if (dreq->count > dreq_len)
+ dreq->count = dreq_len;
+
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
+ dreq->error = hdr->error;
+ else /* Clear outstanding error if this is EOF */
+ dreq->error = 0;
+ }
+}
- WARN_ON_ONCE(dreq->count >= dreq->max_count);
+static void
+nfs_direct_count_bytes(struct nfs_direct_req *dreq,
+ const struct nfs_pgio_header *hdr)
+{
+ loff_t hdr_end = hdr->io_start + hdr->good_bytes;
+ ssize_t dreq_len = 0;
- if (dreq->mirror_count == 1) {
- dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
- dreq->count += hdr->good_bytes;
- } else {
- /* mirrored writes */
- count = dreq->mirrors[hdr->pgio_mirror_idx].count;
- if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
- count = hdr->io_start + hdr->good_bytes - dreq->io_start;
- dreq->mirrors[hdr->pgio_mirror_idx].count = count;
- }
- /* update the dreq->count by finding the minimum agreed count from all
- * mirrors */
- count = dreq->mirrors[0].count;
+ if (hdr_end > dreq->io_start)
+ dreq_len = hdr_end - dreq->io_start;
- for (i = 1; i < dreq->mirror_count; i++)
- count = min(count, dreq->mirrors[i].count);
+ nfs_direct_handle_truncated(dreq, hdr, dreq_len);
- dreq->count = count;
- }
+ if (dreq_len > dreq->max_count)
+ dreq_len = dreq->max_count;
+
+ if (dreq->count < dreq_len)
+ dreq->count = dreq_len;
}
/*
@@ -293,18 +293,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}
-static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
- struct nfs_pageio_descriptor *pgio,
- struct nfs_page *req)
-{
- int mirror_count = 1;
-
- if (pgio->pg_ops->pg_get_mirror_count)
- mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
-
- dreq->mirror_count = mirror_count;
-}
-
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
struct nfs_direct_req *dreq;
@@ -319,7 +307,6 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
INIT_LIST_HEAD(&dreq->mds_cinfo.list);
dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
- dreq->mirror_count = 1;
spin_lock_init(&dreq->lock);
return dreq;
@@ -402,20 +389,12 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
struct nfs_direct_req *dreq = hdr->dreq;
spin_lock(&dreq->lock);
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
- dreq->error = hdr->error;
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
spin_unlock(&dreq->lock);
goto out_put;
}
- if (hdr->good_bytes != 0)
- nfs_direct_good_bytes(dreq, hdr);
-
- if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
- dreq->error = 0;
-
+ nfs_direct_count_bytes(dreq, hdr);
spin_unlock(&dreq->lock);
while (!list_empty(&hdr->pages)) {
@@ -646,29 +625,22 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
LIST_HEAD(reqs);
struct nfs_commit_info cinfo;
LIST_HEAD(failed);
- int i;
nfs_init_cinfo_from_dreq(&cinfo, dreq);
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
dreq->count = 0;
+ dreq->max_count = 0;
+ list_for_each_entry(req, &reqs, wb_list)
+ dreq->max_count += req->wb_bytes;
dreq->verf.committed = NFS_INVALID_STABLE_HOW;
nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
- for (i = 0; i < dreq->mirror_count; i++)
- dreq->mirrors[i].count = 0;
get_dreq(dreq);
nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
- req = nfs_list_entry(reqs.next);
- nfs_direct_setup_mirroring(dreq, &desc, req);
- if (desc.pg_error < 0) {
- list_splice_init(&reqs, &failed);
- goto out_failed;
- }
-
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
/* Bump the transmission count */
req->wb_nio++;
@@ -686,7 +658,6 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
}
nfs_pageio_complete(&desc);
-out_failed:
while (!list_empty(&failed)) {
req = nfs_list_entry(failed.next);
nfs_list_remove_request(req);
@@ -791,17 +762,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
nfs_init_cinfo_from_dreq(&cinfo, dreq);
spin_lock(&dreq->lock);
-
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
- dreq->error = hdr->error;
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
spin_unlock(&dreq->lock);
goto out_put;
}
+ nfs_direct_count_bytes(dreq, hdr);
if (hdr->good_bytes != 0) {
- nfs_direct_good_bytes(dreq, hdr);
if (nfs_write_need_commit(hdr)) {
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
request_commit = true;
@@ -923,7 +890,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
break;
}
- nfs_direct_setup_mirroring(dreq, &desc, req);
if (desc.pg_error < 0) {
nfs_free_request(req);
result = desc.pg_error;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 11eafcfc490b..caacf5e7f5e1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1440,8 +1440,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
return 0;
if ((delegation->type & fmode) != fmode)
return 0;
- if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
- return 0;
switch (claim) {
case NFS4_OPEN_CLAIM_NULL:
case NFS4_OPEN_CLAIM_FH:
@@ -1810,7 +1808,6 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo
static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
{
struct nfs4_state *state = opendata->state;
- struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs_delegation *delegation;
int open_mode = opendata->o_arg.open_flags;
fmode_t fmode = opendata->o_arg.fmode;
@@ -1827,7 +1824,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
}
spin_unlock(&state->owner->so_lock);
rcu_read_lock();
- delegation = rcu_dereference(nfsi->delegation);
+ delegation = nfs4_get_valid_delegation(state->inode);
if (!can_open_delegated(delegation, fmode, claim)) {
rcu_read_unlock();
break;
@@ -2371,7 +2368,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
data->o_arg.open_flags, claim))
goto out_no_action;
rcu_read_lock();
- delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
+ delegation = nfs4_get_valid_delegation(data->state->inode);
if (can_open_delegated(delegation, data->o_arg.fmode, claim))
goto unlock_no_action;
rcu_read_unlock();
@@ -6106,6 +6103,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
status = nfs4_call_sync_custom(&task_setup_data);
if (setclientid.sc_cred) {
+ kfree(clp->cl_acceptor);
clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
put_rpccred(setclientid.sc_cred);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 85ca49549b39..52cab65f91cf 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -786,7 +786,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_page *head;
- atomic_long_dec(&nfsi->nrequests);
if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
head = req->wb_head;
@@ -799,8 +798,10 @@ static void nfs_inode_remove_request(struct nfs_page *req)
spin_unlock(&mapping->private_lock);
}
- if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
+ if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
nfs_release_request(req);
+ atomic_long_dec(&nfsi->nrequests);
+ }
}
static void
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8de1c9d644f6..9cd0a6815933 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2049,7 +2049,8 @@ out_write_size:
inode->i_mtime = inode->i_ctime = current_time(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ if (handle)
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
}
if (handle)
ocfs2_journal_dirty(handle, wc->w_di_bh);
@@ -2146,13 +2147,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_dio_write_ctxt *dwc = NULL;
struct buffer_head *di_bh = NULL;
u64 p_blkno;
- loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+ loff_t pos = iblock << i_blkbits;
+ sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
unsigned len, total_len = bh_result->b_size;
int ret = 0, first_get_block = 0;
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
len = min(total_len, len);
+ /*
+ * bh_result->b_size is count in get_more_blocks according to write
+ * "pos" and "end", we need map twice to return different buffer state:
+ * 1. area in file size, not set NEW;
+ * 2. area out file size, set NEW.
+ *
+ * iblock endblk
+ * |--------|---------|---------|---------
+ * |<-------area in file------->|
+ */
+
+ if ((iblock <= endblk) &&
+ ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+ len = (endblk - iblock + 1) << i_blkbits;
+
mlog(0, "get block of %lu at %llu:%u req %u\n",
inode->i_ino, pos, len, total_len);
@@ -2236,6 +2254,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
if (desc->c_needs_zero)
set_buffer_new(bh_result);
+ if (iblock > endblk)
+ set_buffer_new(bh_result);
+
/* May sleep in end_io. It should not happen in a irq context. So defer
* it to dio work queue. */
set_buffer_defer_completion(bh_result);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2e982db3e1ae..9876db52913a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1230,6 +1230,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
if (IS_ERR(transfer_to[USRQUOTA])) {
status = PTR_ERR(transfer_to[USRQUOTA]);
+ transfer_to[USRQUOTA] = NULL;
goto bail_unlock;
}
}
@@ -1239,6 +1240,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
if (IS_ERR(transfer_to[GRPQUOTA])) {
status = PTR_ERR(transfer_to[GRPQUOTA]);
+ transfer_to[GRPQUOTA] = NULL;
goto bail_unlock;
}
}
@@ -2096,53 +2098,89 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
return 0;
}
-static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
- struct file *file,
- loff_t pos, size_t count,
- int *meta_level)
+static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
+ struct buffer_head **di_bh,
+ int meta_level,
+ int overwrite_io,
+ int write_sem,
+ int wait)
{
- int ret;
- struct buffer_head *di_bh = NULL;
- u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
- u32 clusters =
- ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+ int ret = 0;
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret) {
- mlog_errno(ret);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, NULL, meta_level);
+ else
+ ret = ocfs2_try_inode_lock(inode,
+ overwrite_io ? NULL : di_bh, meta_level);
+ if (ret < 0)
goto out;
+
+ if (wait) {
+ if (write_sem)
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ } else {
+ if (write_sem)
+ ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (!ret) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
}
- *meta_level = 1;
+ return ret;
- ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
- if (ret)
- mlog_errno(ret);
+out_unlock:
+ brelse(*di_bh);
+ ocfs2_inode_unlock(inode, meta_level);
out:
- brelse(di_bh);
return ret;
}
+static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
+ struct buffer_head **di_bh,
+ int meta_level,
+ int write_sem)
+{
+ if (write_sem)
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ brelse(*di_bh);
+ *di_bh = NULL;
+
+ if (meta_level >= 0)
+ ocfs2_inode_unlock(inode, meta_level);
+}
+
static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t pos, size_t count, int wait)
{
int ret = 0, meta_level = 0, overwrite_io = 0;
+ int write_sem = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
struct buffer_head *di_bh = NULL;
+ u32 cpos;
+ u32 clusters;
/*
* We start with a read level meta lock and only jump to an ex
* if we need to make modifications here.
*/
for(;;) {
- if (wait)
- ret = ocfs2_inode_lock(inode, NULL, meta_level);
- else
- ret = ocfs2_try_inode_lock(inode,
- overwrite_io ? NULL : &di_bh, meta_level);
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ overwrite_io,
+ write_sem,
+ wait);
if (ret < 0) {
- meta_level = -1;
if (ret != -EAGAIN)
mlog_errno(ret);
goto out;
@@ -2154,15 +2192,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
*/
if (!wait && !overwrite_io) {
overwrite_io = 1;
- if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
- ret = -EAGAIN;
- goto out_unlock;
- }
ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
- brelse(di_bh);
- di_bh = NULL;
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret < 0) {
if (ret != -EAGAIN)
mlog_errno(ret);
@@ -2181,7 +2212,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* set inode->i_size at the end of a write. */
if (should_remove_suid(dentry)) {
if (meta_level == 0) {
- ocfs2_inode_unlock(inode, meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
meta_level = 1;
continue;
}
@@ -2195,18 +2229,32 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
ret = ocfs2_check_range_for_refcount(inode, pos, count);
if (ret == 1) {
- ocfs2_inode_unlock(inode, meta_level);
- meta_level = -1;
-
- ret = ocfs2_prepare_inode_for_refcount(inode,
- file,
- pos,
- count,
- &meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ overwrite_io,
+ 1,
+ wait);
+ write_sem = 1;
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ clusters =
+ ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+ ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
}
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto out_unlock;
}
@@ -2217,10 +2265,10 @@ out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
pos, count, wait);
- brelse(di_bh);
-
- if (meta_level >= 0)
- ocfs2_inode_unlock(inode, meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
out:
return ret;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index d6f7b299eb23..efeea208fdeb 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -283,7 +283,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
if (inode_alloc)
inode_lock(inode_alloc);
- if (o2info_coherent(&fi->ifi_req)) {
+ if (inode_alloc && o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 930e3d388579..699a560efbb0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -217,7 +217,8 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
/* At this point, we know that no more recovery threads can be
* launched, so wait for any recovery completion work to
* complete. */
- flush_workqueue(osb->ocfs2_wq);
+ if (osb->ocfs2_wq)
+ flush_workqueue(osb->ocfs2_wq);
/*
* Now that recovery is shut down, and the osb is about to be
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 158e5af767fd..720e9f94957e 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -377,7 +377,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc = NULL;
cancel_delayed_work(&osb->la_enable_wq);
- flush_workqueue(osb->ocfs2_wq);
+ if (osb->ocfs2_wq)
+ flush_workqueue(osb->ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 90c830e3758e..d8507972ee13 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1490,18 +1490,6 @@ static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
return loc->xl_ops->xlo_check_space(loc, xi);
}
-static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
-{
- loc->xl_ops->xlo_add_entry(loc, name_hash);
- loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
- /*
- * We can't leave the new entry's xe_name_offset at zero or
- * add_namevalue() will go nuts. We set it to the size of our
- * storage so that it can never be less than any other entry.
- */
- loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
-}
-
static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
struct ocfs2_xattr_info *xi)
{
@@ -2133,29 +2121,31 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
if (rc)
goto out;
- if (loc->xl_entry) {
- if (ocfs2_xa_can_reuse_entry(loc, xi)) {
- orig_value_size = loc->xl_entry->xe_value_size;
- rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
- if (rc)
- goto out;
- goto alloc_value;
- }
+ if (!loc->xl_entry) {
+ rc = -EINVAL;
+ goto out;
+ }
- if (!ocfs2_xattr_is_local(loc->xl_entry)) {
- orig_clusters = ocfs2_xa_value_clusters(loc);
- rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
- if (rc) {
- mlog_errno(rc);
- ocfs2_xa_cleanup_value_truncate(loc,
- "overwriting",
- orig_clusters);
- goto out;
- }
+ if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+ orig_value_size = loc->xl_entry->xe_value_size;
+ rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+ if (rc)
+ goto out;
+ goto alloc_value;
+ }
+
+ if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+ if (rc) {
+ mlog_errno(rc);
+ ocfs2_xa_cleanup_value_truncate(loc,
+ "overwriting",
+ orig_clusters);
+ goto out;
}
- ocfs2_xa_wipe_namevalue(loc);
- } else
- ocfs2_xa_add_entry(loc, name_hash);
+ }
+ ocfs2_xa_wipe_namevalue(loc);
/*
* If we get here, we have a blank entry. Fill it. We grow our
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index ac9247371871..8c1f1bb1a5ce 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -132,9 +132,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
show_val_kb(m, "ShmemPmdMapped: ",
global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
- show_val_kb(m, "FileHugePages: ",
+ show_val_kb(m, "FileHugePages: ",
global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
- show_val_kb(m, "FilePmdMapped: ",
+ show_val_kb(m, "FilePmdMapped: ",
global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
#endif
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 544d1ee15aee..7c952ee732e6 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -42,10 +42,12 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
return -EINVAL;
while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
+
if (!ppage || PageSlab(ppage) || page_has_type(ppage))
pcount = 0;
else
@@ -216,10 +218,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
return -EINVAL;
while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
if (put_user(stable_page_flags(ppage), out)) {
ret = -EFAULT;
@@ -261,10 +264,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
return -EINVAL;
while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
if (ppage)
ino = page_cgroup_ino(ppage);
diff --git a/fs/readdir.c b/fs/readdir.c
index 2f6a4534e0df..d26d5ea4de7b 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -20,9 +20,23 @@
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/compat.h>
-
#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+/*
+ * Note the "unsafe_put_user() semantics: we goto a
+ * label for errors.
+ */
+#define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \
+ char __user *dst = (_dst); \
+ const char *src = (_src); \
+ size_t len = (_len); \
+ unsafe_put_user(0, dst+len, label); \
+ unsafe_copy_to_user(dst, src, len, label); \
+} while (0)
+
+
int iterate_dir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
@@ -65,6 +79,40 @@ out:
EXPORT_SYMBOL(iterate_dir);
/*
+ * POSIX says that a dirent name cannot contain NULL or a '/'.
+ *
+ * It's not 100% clear what we should really do in this case.
+ * The filesystem is clearly corrupted, but returning a hard
+ * error means that you now don't see any of the other names
+ * either, so that isn't a perfect alternative.
+ *
+ * And if you return an error, what error do you use? Several
+ * filesystems seem to have decided on EUCLEAN being the error
+ * code for EFSCORRUPTED, and that may be the error to use. Or
+ * just EIO, which is perhaps more obvious to users.
+ *
+ * In order to see the other file names in the directory, the
+ * caller might want to make this a "soft" error: skip the
+ * entry, and return the error at the end instead.
+ *
+ * Note that this should likely do a "memchr(name, 0, len)"
+ * check too, since that would be filesystem corruption as
+ * well. However, that case can't actually confuse user space,
+ * which has to do a strlen() on the name anyway to find the
+ * filename length, and the above "soft error" worry means
+ * that it's probably better left alone until we have that
+ * issue clarified.
+ */
+static int verify_dirent_name(const char *name, int len)
+{
+ if (!len)
+ return -EIO;
+ if (memchr(name, '/', len))
+ return -EIO;
+ return 0;
+}
+
+/*
* Traditional linux readdir() handling..
*
* "count=1" is a special case, meaning that the buffer is one
@@ -173,6 +221,9 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
sizeof(long));
+ buf->error = verify_dirent_name(name, namlen);
+ if (unlikely(buf->error))
+ return buf->error;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
return -EINVAL;
@@ -182,28 +233,31 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
return -EOVERFLOW;
}
dirent = buf->previous;
- if (dirent) {
- if (signal_pending(current))
- return -EINTR;
- if (__put_user(offset, &dirent->d_off))
- goto efault;
- }
- dirent = buf->current_dir;
- if (__put_user(d_ino, &dirent->d_ino))
- goto efault;
- if (__put_user(reclen, &dirent->d_reclen))
- goto efault;
- if (copy_to_user(dirent->d_name, name, namlen))
- goto efault;
- if (__put_user(0, dirent->d_name + namlen))
- goto efault;
- if (__put_user(d_type, (char __user *) dirent + reclen - 1))
+ if (dirent && signal_pending(current))
+ return -EINTR;
+
+ /*
+ * Note! This range-checks 'previous' (which may be NULL).
+ * The real range was checked in getdents
+ */
+ if (!user_access_begin(dirent, sizeof(*dirent)))
goto efault;
+ if (dirent)
+ unsafe_put_user(offset, &dirent->d_off, efault_end);
+ dirent = buf->current_dir;
+ unsafe_put_user(d_ino, &dirent->d_ino, efault_end);
+ unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
+ unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end);
+ unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
+ user_access_end();
+
buf->previous = dirent;
dirent = (void __user *)dirent + reclen;
buf->current_dir = dirent;
buf->count -= reclen;
return 0;
+efault_end:
+ user_access_end();
efault:
buf->error = -EFAULT;
return -EFAULT;
@@ -259,34 +313,38 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
sizeof(u64));
+ buf->error = verify_dirent_name(name, namlen);
+ if (unlikely(buf->error))
+ return buf->error;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
return -EINVAL;
dirent = buf->previous;
- if (dirent) {
- if (signal_pending(current))
- return -EINTR;
- if (__put_user(offset, &dirent->d_off))
- goto efault;
- }
- dirent = buf->current_dir;
- if (__put_user(ino, &dirent->d_ino))
- goto efault;
- if (__put_user(0, &dirent->d_off))
- goto efault;
- if (__put_user(reclen, &dirent->d_reclen))
- goto efault;
- if (__put_user(d_type, &dirent->d_type))
- goto efault;
- if (copy_to_user(dirent->d_name, name, namlen))
- goto efault;
- if (__put_user(0, dirent->d_name + namlen))
+ if (dirent && signal_pending(current))
+ return -EINTR;
+
+ /*
+ * Note! This range-checks 'previous' (which may be NULL).
+ * The real range was checked in getdents
+ */
+ if (!user_access_begin(dirent, sizeof(*dirent)))
goto efault;
+ if (dirent)
+ unsafe_put_user(offset, &dirent->d_off, efault_end);
+ dirent = buf->current_dir;
+ unsafe_put_user(ino, &dirent->d_ino, efault_end);
+ unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
+ unsafe_put_user(d_type, &dirent->d_type, efault_end);
+ unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
+ user_access_end();
+
buf->previous = dirent;
dirent = (void __user *)dirent + reclen;
buf->current_dir = dirent;
buf->count -= reclen;
return 0;
+efault_end:
+ user_access_end();
efault:
buf->error = -EFAULT;
return -EFAULT;
diff --git a/fs/statfs.c b/fs/statfs.c
index eea7af6f2f22..2616424012ea 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -318,19 +318,10 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *,
static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
{
struct compat_statfs64 buf;
- if (sizeof(ubuf->f_bsize) == 4) {
- if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
- kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
- return -EOVERFLOW;
- /* f_files and f_ffree may be -1; it's okay
- * to stuff that into 32 bits */
- if (kbuf->f_files != 0xffffffffffffffffULL
- && (kbuf->f_files & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- if (kbuf->f_ffree != 0xffffffffffffffffULL
- && (kbuf->f_ffree & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- }
+
+ if ((kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
+ return -EOVERFLOW;
+
memset(&buf, 0, sizeof(struct compat_statfs64));
buf.f_type = kbuf->f_type;
buf.f_bsize = kbuf->f_bsize;
diff --git a/fs/super.c b/fs/super.c
index f627b7c53d2b..cfadab2cbf35 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1300,6 +1300,7 @@ int get_tree_bdev(struct fs_context *fc,
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (bdev->bd_fsfreeze_count > 0) {
mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ blkdev_put(bdev, mode);
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
return -EBUSY;
}
@@ -1308,8 +1309,10 @@ int get_tree_bdev(struct fs_context *fc,
fc->sget_key = bdev;
s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
- if (IS_ERR(s))
+ if (IS_ERR(s)) {
+ blkdev_put(bdev, mode);
return PTR_ERR(s);
+ }
if (s->s_root) {
/* Don't summarily change the RO/RW state. */
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 9fc14e38927f..0caa151cae4e 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -16,11 +16,11 @@
#include <linux/namei.h>
#include <linux/tracefs.h>
#include <linux/fsnotify.h>
+#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
-#include <linux/security.h>
#define TRACEFS_DEFAULT_MODE 0700
@@ -28,25 +28,6 @@ static struct vfsmount *tracefs_mount;
static int tracefs_mount_count;
static bool tracefs_registered;
-static int default_open_file(struct inode *inode, struct file *filp)
-{
- struct dentry *dentry = filp->f_path.dentry;
- struct file_operations *real_fops;
- int ret;
-
- if (!dentry)
- return -EINVAL;
-
- ret = security_locked_down(LOCKDOWN_TRACEFS);
- if (ret)
- return ret;
-
- real_fops = dentry->d_fsdata;
- if (!real_fops->open)
- return 0;
- return real_fops->open(inode, filp);
-}
-
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -241,12 +222,6 @@ static int tracefs_apply_options(struct super_block *sb)
return 0;
}
-static void tracefs_destroy_inode(struct inode *inode)
-{
- if (S_ISREG(inode->i_mode))
- kfree(inode->i_fop);
-}
-
static int tracefs_remount(struct super_block *sb, int *flags, char *data)
{
int err;
@@ -283,7 +258,6 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root)
static const struct super_operations tracefs_super_operations = {
.statfs = simple_statfs,
.remount_fs = tracefs_remount,
- .destroy_inode = tracefs_destroy_inode,
.show_options = tracefs_show_options,
};
@@ -414,10 +388,12 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
- struct file_operations *proxy_fops;
struct dentry *dentry;
struct inode *inode;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
if (!(mode & S_IFMT))
mode |= S_IFREG;
BUG_ON(!S_ISREG(mode));
@@ -430,20 +406,8 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
if (unlikely(!inode))
return failed_creating(dentry);
- proxy_fops = kzalloc(sizeof(struct file_operations), GFP_KERNEL);
- if (unlikely(!proxy_fops)) {
- iput(inode);
- return failed_creating(dentry);
- }
-
- if (!fops)
- fops = &tracefs_file_operations;
-
- dentry->d_fsdata = (void *)fops;
- memcpy(proxy_fops, fops, sizeof(*proxy_fops));
- proxy_fops->open = default_open_file;
inode->i_mode = mode;
- inode->i_fop = proxy_fops;
+ inode->i_fop = fops ? fops : &tracefs_file_operations;
inode->i_private = data;
d_instantiate(dentry, inode);
fsnotify_create(dentry->d_parent->d_inode, dentry);
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 5de296b34ab1..14fbdf22b7e7 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -28,12 +28,11 @@ xfs_get_aghdr_buf(
struct xfs_mount *mp,
xfs_daddr_t blkno,
size_t numblks,
- int flags,
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
- bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+ bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0);
if (!bp)
return NULL;
@@ -345,7 +344,7 @@ xfs_ag_init_hdr(
{
struct xfs_buf *bp;
- bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops);
+ bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops);
if (!bp)
return -ENOMEM;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index b9f019603d0b..f0089e862216 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -826,32 +826,17 @@ xfs_attr_shortform_to_leaf(
sf = (xfs_attr_shortform_t *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
- xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+ xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
bp = NULL;
error = xfs_da_grow_inode(args, &blkno);
- if (error) {
- /*
- * If we hit an IO error middle of the transaction inside
- * grow_inode(), we may have inconsistent data. Bail out.
- */
- if (error == -EIO)
- goto out;
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
- memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ if (error)
goto out;
- }
ASSERT(blkno == 0);
error = xfs_attr3_leaf_create(args, blkno, &bp);
- if (error) {
- /* xfs_attr3_leaf_create may not have instantiated a block */
- if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0))
- goto out;
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
- memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ if (error)
goto out;
- }
memset((char *)&nargs, 0, sizeof(nargs));
nargs.dp = dp;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 4edc25a2ba80..02469d59c787 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -792,6 +792,7 @@ out_root_realloc:
*/
void
xfs_bmap_local_to_extents_empty(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
int whichfork)
{
@@ -808,6 +809,7 @@ xfs_bmap_local_to_extents_empty(
ifp->if_u1.if_root = NULL;
ifp->if_height = 0;
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -840,7 +842,7 @@ xfs_bmap_local_to_extents(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
if (!ifp->if_bytes) {
- xfs_bmap_local_to_extents_empty(ip, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags = XFS_ILOG_CORE;
goto done;
}
@@ -887,7 +889,7 @@ xfs_bmap_local_to_extents(
/* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
- xfs_bmap_local_to_extents_empty(ip, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags |= XFS_ILOG_CORE;
ifp->if_u1.if_root = NULL;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 5bb446d80542..e2798c6f3a5f 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -182,7 +182,8 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
-void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork);
void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, const struct xfs_owner_info *oinfo,
bool skip_discard);
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 9595ced393dc..49e4bc39e7bb 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -1096,7 +1096,7 @@ xfs_dir2_sf_to_block(
memcpy(sfp, oldsfp, ifp->if_bytes);
xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
- xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
+ xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK);
dp->i_d.di_size = 0;
/*
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 39dd2b908106..e9371a8e0e26 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -366,11 +366,11 @@ struct xfs_bulkstat {
uint64_t bs_blocks; /* number of blocks */
uint64_t bs_xflags; /* extended flags */
- uint64_t bs_atime; /* access time, seconds */
- uint64_t bs_mtime; /* modify time, seconds */
+ int64_t bs_atime; /* access time, seconds */
+ int64_t bs_mtime; /* modify time, seconds */
- uint64_t bs_ctime; /* inode change time, seconds */
- uint64_t bs_btime; /* creation time, seconds */
+ int64_t bs_ctime; /* inode change time, seconds */
+ int64_t bs_btime; /* creation time, seconds */
uint32_t bs_gen; /* generation count */
uint32_t bs_uid; /* user id */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 93b3793bc5b3..0cab11a5d390 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -341,7 +341,6 @@ xchk_refcountbt_rec(
xfs_extlen_t len;
xfs_nlink_t refcount;
bool has_cowflag;
- int error = 0;
bno = be32_to_cpu(rec->refc.rc_startblock);
len = be32_to_cpu(rec->refc.rc_blockcount);
@@ -366,7 +365,7 @@ xchk_refcountbt_rec(
xchk_refcountbt_xref(bs->sc, bno, len, refcount);
- return error;
+ return 0;
}
/* Make sure we have as many refc blocks as the rmap says. */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0910cb75b65d..4f443703065e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -864,6 +864,7 @@ xfs_alloc_file_space(
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
xfs_fileoff_t startoffset_fsb;
+ xfs_fileoff_t endoffset_fsb;
int nimaps;
int quota_flag;
int rt;
@@ -891,7 +892,8 @@ xfs_alloc_file_space(
imapp = &imaps[0];
nimaps = 1;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
- allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+ endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
+ allocatesize_fsb = endoffset_fsb - startoffset_fsb;
/*
* Allocate file space until done or until there is an error
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 21c243622a79..0abba171aa89 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -345,6 +345,15 @@ xfs_buf_allocate_memory(
unsigned short page_count, i;
xfs_off_t start, end;
int error;
+ xfs_km_flags_t kmflag_mask = 0;
+
+ /*
+ * assure zeroed buffer for non-read cases.
+ */
+ if (!(flags & XBF_READ)) {
+ kmflag_mask |= KM_ZERO;
+ gfp_mask |= __GFP_ZERO;
+ }
/*
* for buffers that are contained within a single page, just allocate
@@ -354,7 +363,8 @@ xfs_buf_allocate_memory(
size = BBTOB(bp->b_length);
if (size < PAGE_SIZE) {
int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
- bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS);
+ bp->b_addr = kmem_alloc_io(size, align_mask,
+ KM_NOFS | kmflag_mask);
if (!bp->b_addr) {
/* low memory - use alloc_page loop instead */
goto use_alloc_page;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a2beee9f74da..641d07f30a27 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1443,7 +1443,7 @@ xlog_alloc_log(
prev_iclog = iclog;
iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
- KM_MAYFAIL);
+ KM_MAYFAIL | KM_ZERO);
if (!iclog->ic_data)
goto out_free_iclog;
#ifdef DEBUG
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 508319039dce..c1a514ffff55 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -127,7 +127,7 @@ xlog_alloc_buffer(
if (nbblks > 1 && log->l_sectBBsize > 1)
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL);
+ return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
}
/*