aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/callback.c1
-rw-r--r--fs/afs/dir.c7
-rw-r--r--fs/afs/rxrpc.c1
-rw-r--r--fs/afs/super.c1
-rw-r--r--fs/aio.c10
-rw-r--r--fs/autofs/expire.c5
-rw-r--r--fs/btrfs/block-group.c1
-rw-r--r--fs/btrfs/ctree.h5
-rw-r--r--fs/btrfs/delalloc-space.c7
-rw-r--r--fs/btrfs/disk-io.c6
-rw-r--r--fs/btrfs/file.c56
-rw-r--r--fs/btrfs/inode-map.c4
-rw-r--r--fs/btrfs/inode.c45
-rw-r--r--fs/btrfs/ioctl.c12
-rw-r--r--fs/btrfs/qgroup.c4
-rw-r--r--fs/btrfs/ref-verify.c2
-rw-r--r--fs/btrfs/relocation.c9
-rw-r--r--fs/btrfs/send.c2
-rw-r--r--fs/btrfs/space-info.c21
-rw-r--r--fs/btrfs/tree-checker.c8
-rw-r--r--fs/btrfs/tree-log.c36
-rw-r--r--fs/btrfs/volumes.c7
-rw-r--r--fs/ceph/caps.c10
-rw-r--r--fs/ceph/dir.c15
-rw-r--r--fs/ceph/file.c44
-rw-r--r--fs/ceph/inode.c1
-rw-r--r--fs/ceph/mds_client.c21
-rw-r--r--fs/ceph/super.c11
-rw-r--r--fs/cifs/cifsfs.c30
-rw-r--r--fs/cifs/cifsglob.h7
-rw-r--r--fs/cifs/cifsproto.h1
-rw-r--r--fs/cifs/connect.c22
-rw-r--r--fs/cifs/dir.c8
-rw-r--r--fs/cifs/file.c62
-rw-r--r--fs/cifs/inode.c8
-rw-r--r--fs/cifs/netmisc.c4
-rw-r--r--fs/cifs/smb1ops.c3
-rw-r--r--fs/cifs/smb2file.c2
-rw-r--r--fs/cifs/smb2ops.c3
-rw-r--r--fs/cifs/smb2pdu.c14
-rw-r--r--fs/cifs/smb2pdu.h1
-rw-r--r--fs/cifs/smb2proto.h4
-rw-r--r--fs/cifs/transport.c42
-rw-r--r--fs/configfs/symlink.c2
-rw-r--r--fs/cramfs/inode.c4
-rw-r--r--fs/dax.c5
-rw-r--r--fs/direct-io.c3
-rw-r--r--fs/ecryptfs/inode.c84
-rw-r--r--fs/exportfs/expfs.c31
-rw-r--r--fs/file.c2
-rw-r--r--fs/fs-writeback.c20
-rw-r--r--fs/fuse/Makefile3
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/dir.c16
-rw-r--r--fs/fuse/file.c14
-rw-r--r--fs/fuse/fuse_i.h4
-rw-r--r--fs/fuse/inode.c4
-rw-r--r--fs/fuse/virtio_fs.c169
-rw-r--r--fs/gfs2/ops_fstype.c21
-rw-r--r--fs/io-wq.c1065
-rw-r--r--fs/io-wq.h74
-rw-r--r--fs/io_uring.c2455
-rw-r--r--fs/libfs.c140
-rw-r--r--fs/namespace.c15
-rw-r--r--fs/nfs/delegation.c12
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/direct.c106
-rw-r--r--fs/nfs/nfs4proc.c8
-rw-r--r--fs/nfs/write.c5
-rw-r--r--fs/ocfs2/aops.c25
-rw-r--r--fs/ocfs2/file.c136
-rw-r--r--fs/ocfs2/ioctl.c2
-rw-r--r--fs/ocfs2/journal.c3
-rw-r--r--fs/ocfs2/localalloc.c3
-rw-r--r--fs/open.c6
-rw-r--r--fs/pipe.c6
-rw-r--r--fs/proc/meminfo.c4
-rw-r--r--fs/proc/page.c28
-rw-r--r--fs/readdir.c48
-rw-r--r--fs/super.c5
-rw-r--r--fs/tracefs/inode.c46
-rw-r--r--fs/xfs/libxfs/xfs_ag.c5
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c21
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c2
-rw-r--r--fs/xfs/libxfs/xfs_fs.h8
-rw-r--r--fs/xfs/scrub/refcount.c3
-rw-r--r--fs/xfs/xfs_bmap_util.c4
-rw-r--r--fs/xfs/xfs_buf.c12
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_log_recover.c2
94 files changed, 3753 insertions, 1461 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 2501e6f1f965..7b623e9fc1b0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -322,4 +322,7 @@ source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"
source "fs/unicode/Kconfig"
+config IO_WQ
+ bool
+
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 14231b4cf383..1148c555c4d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_IO_URING) += io_uring.o
+obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 6cdd7047c809..2dca8df1a18d 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -312,7 +312,6 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
_enter("%p,%zu,", server, count);
ASSERT(server != NULL);
- ASSERTCMP(count, <=, AFSCBMAX);
/* TODO: Sort the callback break list by volume ID */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index cc12772d0a4d..497f979018c2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -803,7 +803,12 @@ success:
continue;
if (cookie->inodes[i]) {
- afs_vnode_commit_status(&fc, AFS_FS_I(cookie->inodes[i]),
+ struct afs_vnode *iv = AFS_FS_I(cookie->inodes[i]);
+
+ if (test_bit(AFS_VNODE_UNSET, &iv->flags))
+ continue;
+
+ afs_vnode_commit_status(&fc, iv,
scb->cb_break, NULL, scb);
continue;
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 0e5269374ac1..61498d9f06ef 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -637,6 +637,7 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
call->need_attention = false;
__set_current_state(TASK_RUNNING);
afs_deliver_to_call(call);
+ timeout = rtt2;
continue;
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index f18911e8d770..488641b1a418 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -435,6 +435,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
/* fill in the superblock */
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op = &afs_super_ops;
if (!as->dyn_root)
diff --git a/fs/aio.c b/fs/aio.c
index 01e0fb9ae45a..0d9a559d488c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2179,7 +2179,7 @@ SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
#ifdef CONFIG_COMPAT
struct __compat_aio_sigset {
- compat_sigset_t __user *sigmask;
+ compat_uptr_t sigmask;
compat_size_t sigsetsize;
};
@@ -2193,7 +2193,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
struct old_timespec32 __user *, timeout,
const struct __compat_aio_sigset __user *, usig)
{
- struct __compat_aio_sigset ksig = { NULL, };
+ struct __compat_aio_sigset ksig = { 0, };
struct timespec64 t;
bool interrupted;
int ret;
@@ -2204,7 +2204,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
+ ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
if (ret)
return ret;
@@ -2228,7 +2228,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
struct __kernel_timespec __user *, timeout,
const struct __compat_aio_sigset __user *, usig)
{
- struct __compat_aio_sigset ksig = { NULL, };
+ struct __compat_aio_sigset ksig = { 0, };
struct timespec64 t;
bool interrupted;
int ret;
@@ -2239,7 +2239,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
+ ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
if (ret)
return ret;
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 2866fabf497f..91f5787dae7c 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -459,9 +459,10 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
*/
how &= ~AUTOFS_EXP_LEAVES;
found = should_expire(expired, mnt, timeout, how);
- if (!found || found != expired)
- /* Something has changed, continue */
+ if (found != expired) { // something has changed, continue
+ dput(found);
goto next;
+ }
if (expired != dentry)
dput(dentry);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index bf7e3f23bba7..670700cb1110 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1761,6 +1761,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_err(info,
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
cache->key.objectid);
+ btrfs_put_block_group(cache);
ret = -EINVAL;
goto error;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 19d669d12ca1..fe2b8765d9e6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -734,8 +734,6 @@ struct btrfs_fs_info {
struct btrfs_workqueue *fixup_workers;
struct btrfs_workqueue *delayed_workers;
- /* the extent workers do delayed refs on the extent allocation tree */
- struct btrfs_workqueue *extent_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
u32 thread_pool_size;
@@ -2489,8 +2487,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
int nitems, bool use_global_rsv);
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
- bool qgroup_free);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index d949d7d2abed..db9f2c58eb4a 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -381,7 +381,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
out_qgroup:
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
out_fail:
- btrfs_inode_rsv_release(inode, true);
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
return ret;
@@ -418,7 +417,6 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
* btrfs_delalloc_release_extents - release our outstanding_extents
* @inode: the inode to balance the reservation for.
* @num_bytes: the number of bytes we originally reserved with
- * @qgroup_free: do we need to free qgroup meta reservation or convert them.
*
* When we reserve space we increase outstanding_extents for the extents we may
* add. Once we've set the range as delalloc or created our ordered extents we
@@ -426,8 +424,7 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
* temporarily tracked outstanding_extents. This _must_ be used in conjunction
* with btrfs_delalloc_reserve_metadata.
*/
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
- bool qgroup_free)
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned num_extents;
@@ -441,7 +438,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
if (btrfs_is_testing(fs_info))
return;
- btrfs_inode_rsv_release(inode, qgroup_free);
+ btrfs_inode_rsv_release(inode, true);
}
/**
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 044981cf6df9..402b61bf345c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2008,7 +2008,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
- btrfs_destroy_workqueue(fs_info->extent_workers);
/*
* Now that all other work queues are destroyed, we can safely destroy
* the queues used for metadata I/O, since tasks from those other work
@@ -2214,10 +2213,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
- fs_info->extent_workers =
- btrfs_alloc_workqueue(fs_info, "extent-refs", flags,
- min_t(u64, fs_devices->num_devices,
- max_active), 8);
if (!(fs_info->workers && fs_info->delalloc_workers &&
fs_info->submit_workers && fs_info->flush_workers &&
@@ -2228,7 +2223,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->readahead_workers &&
fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->extent_workers &&
fs_info->qgroup_rescan_workers)) {
return -ENOMEM;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8fe4eb7e5045..435a502a3226 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1591,7 +1591,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
- struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
u64 lockstart;
@@ -1611,6 +1610,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
return -ENOMEM;
while (iov_iter_count(i) > 0) {
+ struct extent_state *cached_state = NULL;
size_t offset = offset_in_page(pos);
size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
@@ -1692,7 +1692,7 @@ again:
force_page_uptodate);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes, true);
+ reserve_bytes);
break;
}
@@ -1704,7 +1704,7 @@ again:
if (extents_locked == -EAGAIN)
goto again;
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes, true);
+ reserve_bytes);
ret = extents_locked;
break;
}
@@ -1758,11 +1758,21 @@ again:
if (copied > 0)
ret = btrfs_dirty_pages(inode, pages, dirty_pages,
pos, copied, &cached_state);
+
+ /*
+ * If we have not locked the extent range, because the range's
+ * start offset is >= i_size, we might still have a non-NULL
+ * cached extent state, acquired while marking the extent range
+ * as delalloc through btrfs_dirty_pages(). Therefore free any
+ * possible cached extent state to avoid a memory leak.
+ */
if (extents_locked)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state);
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
- true);
+ else
+ free_extent_state(cached_state);
+
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
btrfs_drop_pages(pages, num_pages);
break;
@@ -2057,25 +2067,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
- u64 len;
- /*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges, and assertion failures during
- * hole detection.
- */
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
-
- /*
- * The range length can be represented by u64, we have to do the typecasts
- * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
- */
- len = (u64)end - (u64)start + 1;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
@@ -2102,6 +2094,19 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
+ * If the inode needs a full sync, make sure we use a full range to
+ * avoid log tree corruption, due to hole detection racing with ordered
+ * extent completion for adjacent ranges, and assertion failures during
+ * hole detection. Do this while holding the inode lock, to avoid races
+ * with other tasks.
+ */
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ start = 0;
+ end = LLONG_MAX;
+ }
+
+ /*
* Before we acquired the inode's lock, someone may have dirtied more
* pages in the target range. We need to make sure that writeback for
* any such pages does not start while we are logging the inode, because
@@ -2128,8 +2133,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
/*
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
+ *
+ * Also, the range length can be represented by u64, we have to do the
+ * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
*/
- ret = btrfs_wait_ordered_range(inode, start, len);
+ ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
if (ret) {
up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 63cad7865d75..37345fb6191d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -501,13 +501,13 @@ again:
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true);
goto out_put;
}
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
out_put:
iput(inode);
out_release:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0546401bc0a..015910079e73 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -474,6 +474,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
u64 start = async_chunk->start;
u64 end = async_chunk->end;
u64 actual_end;
+ u64 i_size;
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
@@ -488,7 +489,19 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
- actual_end = min_t(u64, i_size_read(inode), end + 1);
+ /*
+ * We need to save i_size before now because it could change in between
+ * us evaluating the size and assigning it. This is because we lock and
+ * unlock the page in truncate and fallocate, and then modify the i_size
+ * later on.
+ *
+ * The barriers are to emulate READ_ONCE, remove that once i_size_read
+ * does that for us.
+ */
+ barrier();
+ i_size = i_size_read(inode);
+ barrier();
+ actual_end = min_t(u64, i_size, end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@ -2206,7 +2219,7 @@ again:
ClearPageChecked(page);
set_page_dirty(page);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
@@ -4951,7 +4964,7 @@ again:
if (!page) {
btrfs_delalloc_release_space(inode, data_reserved,
block_start, blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
ret = -ENOMEM;
goto out;
}
@@ -5018,7 +5031,7 @@ out_unlock:
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
unlock_page(page);
put_page(page);
out:
@@ -6305,13 +6318,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
u32 sizes[2];
int nitems = name ? 2 : 1;
unsigned long ptr;
+ unsigned int nofs_flag;
int ret;
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
+ nofs_flag = memalloc_nofs_save();
inode = new_inode(fs_info->sb);
+ memalloc_nofs_restore(nofs_flag);
if (!inode) {
btrfs_free_path(path);
return ERR_PTR(-ENOMEM);
@@ -8706,7 +8722,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, data_reserved,
offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), count);
}
out:
if (wakeup)
@@ -9056,7 +9072,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
if (!ret2) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
@@ -9065,7 +9081,7 @@ again:
out_unlock:
unlock_page(page);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0));
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
btrfs_delalloc_release_space(inode, data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
@@ -9728,6 +9744,18 @@ out_fail:
commit_transaction = true;
}
if (commit_transaction) {
+ /*
+ * We may have set commit_transaction when logging the new name
+ * in the destination root, in which case we left the source
+ * root context in the list of log contextes. So make sure we
+ * remove it to avoid invalid memory accesses, since the context
+ * was allocated in our stack frame.
+ */
+ if (sync_log_root) {
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx_root.list);
+ mutex_unlock(&root->log_mutex);
+ }
ret = btrfs_commit_transaction(trans);
} else {
int ret2;
@@ -9741,6 +9769,9 @@ out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
+ ASSERT(list_empty(&ctx_root.list));
+ ASSERT(list_empty(&ctx_dest.list));
+
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index de730e56d3f5..23272d9154f3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1360,8 +1360,7 @@ again:
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
- false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return i_done;
out:
@@ -1372,8 +1371,7 @@ out:
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
- true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return ret;
@@ -4197,9 +4195,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
u64 transid;
int ret;
- btrfs_warn(root->fs_info,
- "START_SYNC ioctl is deprecated and will be removed in kernel 5.7");
-
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT)
@@ -4227,9 +4222,6 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
{
u64 transid;
- btrfs_warn(fs_info,
- "WAIT_SYNC ioctl is deprecated and will be removed in kernel 5.7");
-
if (argp) {
if (copy_from_user(&transid, argp, sizeof(transid)))
return -EFAULT;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c4bb69941c77..3ad151655eb8 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3629,7 +3629,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, type, (s64)num_bytes);
+ trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
ret = qgroup_reserve(root, num_bytes, enforce, type);
if (ret < 0)
return ret;
@@ -3676,7 +3676,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
*/
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
+ trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
num_bytes, type);
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index e87cbdad02a3..b57f3618e58e 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -500,7 +500,7 @@ static int process_leaf(struct btrfs_root *root,
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
u32 count;
- int i = 0, tree_block_level = 0, ret;
+ int i = 0, tree_block_level = 0, ret = 0;
struct btrfs_key key;
int nritems = btrfs_header_nritems(leaf);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 00504657b602..5cd42b66818c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3277,6 +3277,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!page) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ PAGE_SIZE);
ret = -ENOMEM;
goto out;
}
@@ -3297,7 +3299,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE, true);
+ PAGE_SIZE);
ret = -EIO;
goto out;
}
@@ -3326,7 +3328,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE, true);
+ PAGE_SIZE);
clear_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end,
@@ -3342,8 +3344,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
put_page(page);
index++;
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE,
- false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f3215028235c..123ac54af071 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5085,7 +5085,7 @@ static int clone_range(struct send_ctx *sctx,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
- u64 clone_src_i_size;
+ u64 clone_src_i_size = 0;
/*
* Prevent cloning from a zero offset with a length matching the sector
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 98dc092a905e..e8a4b0ebe97f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -893,6 +893,15 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
while (ticket->bytes > 0 && ticket->error == 0) {
ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
if (ret) {
+ /*
+ * Delete us from the list. After we unlock the space
+ * info, we don't want the async reclaim job to reserve
+ * space for this ticket. If that would happen, then the
+ * ticket's task would not known that space was reserved
+ * despite getting an error, resulting in a space leak
+ * (bytes_may_use counter of our space_info).
+ */
+ list_del_init(&ticket->list);
ticket->error = -EINTR;
break;
}
@@ -945,12 +954,24 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
spin_lock(&space_info->lock);
ret = ticket->error;
if (ticket->bytes || ticket->error) {
+ /*
+ * Need to delete here for priority tickets. For regular tickets
+ * either the async reclaim job deletes the ticket from the list
+ * or we delete it ourselves at wait_reserve_ticket().
+ */
list_del_init(&ticket->list);
if (!ret)
ret = -ENOSPC;
}
spin_unlock(&space_info->lock);
ASSERT(list_empty(&ticket->list));
+ /*
+ * Check that we can't have an error set if the reservation succeeded,
+ * as that would confuse tasks and lead them to error out without
+ * releasing reserved space (if an error happens the expectation is that
+ * space wasn't reserved at all).
+ */
+ ASSERT(!(ticket->bytes == 0 && ticket->error));
return ret;
}
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 43e488f5d063..076d5b8014fb 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -686,9 +686,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot,
static int check_dev_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
- struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_dev_item *ditem;
- u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK);
if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) {
dev_item_err(leaf, slot,
@@ -696,12 +694,6 @@ static int check_dev_item(struct extent_buffer *leaf,
key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
return -EUCLEAN;
}
- if (key->offset > max_devid) {
- dev_item_err(leaf, slot,
- "invalid devid: has=%llu expect=[0, %llu]",
- key->offset, max_devid);
- return -EUCLEAN;
- }
ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
if (btrfs_device_id(leaf, ditem) != key->offset) {
dev_item_err(leaf, slot,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 29b82a795522..8a6cc600bf18 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2932,7 +2932,8 @@ out:
* in the tree of log roots
*/
static int update_log_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *log)
+ struct btrfs_root *log,
+ struct btrfs_root_item *root_item)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret;
@@ -2940,10 +2941,10 @@ static int update_log_root(struct btrfs_trans_handle *trans,
if (log->log_transid == 1) {
/* insert root item on the first sync */
ret = btrfs_insert_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
} else {
ret = btrfs_update_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
}
return ret;
}
@@ -3041,6 +3042,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ struct btrfs_root_item new_root_item;
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
@@ -3104,18 +3106,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * We _must_ update under the root->log_mutex in order to make sure we
+ * have a consistent view of the log root we are trying to commit at
+ * this moment.
+ *
+ * We _must_ copy this into a local copy, because we are not holding the
+ * log_root_tree->log_mutex yet. This is important because when we
+ * commit the log_root_tree we must have a consistent view of the
+ * log_root_tree when we update the super block to point at the
+ * log_root_tree bytenr. If we update the log_root_tree here we'll race
+ * with the commit and possibly point at the new block which we may not
+ * have written out.
+ */
btrfs_set_root_node(&log->root_item, log->node);
+ memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
- * Update or create log root item under the root's log_mutex to prevent
- * races with concurrent log syncs that can lead to failure to update
- * log root item because it was not created yet.
- */
- ret = update_log_root(trans, log);
- /*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
* new positions. so it's safe to allow log writers to go in.
@@ -3135,6 +3145,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
mutex_lock(&log_root_tree->log_mutex);
+
+ /*
+ * Now we are safe to update the log_root_tree because we're under the
+ * log_mutex, and we're a current writer so we're holding the commit
+ * open until we drop the log_mutex.
+ */
+ ret = update_log_root(trans, log, &new_root_item);
+
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
/* atomic_dec_and_test implies a barrier */
cond_wake_up_nomb(&log_root_tree->log_writer_wait);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cdd7af424033..e04409f85063 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3845,7 +3845,11 @@ static int alloc_profile_is_valid(u64 flags, int extended)
return !extended; /* "0" is valid for usual profiles */
/* true if exactly one bit set */
- return is_power_of_2(flags);
+ /*
+ * Don't use is_power_of_2(unsigned long) because it won't work
+ * for the single profile (1ULL << 48) on 32-bit CPUs.
+ */
+ return flags != 0 && (flags & (flags - 1)) == 0;
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
@@ -4963,6 +4967,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
+ devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
} else {
btrfs_err(info, "invalid chunk type 0x%llx requested",
type);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d3b9c9d5c1bd..f5a38910a82b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1058,6 +1058,11 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+ /* remove from inode's cap rbtree, and clear auth cap */
+ rb_erase(&cap->ci_node, &ci->i_caps);
+ if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
/* remove from session list */
spin_lock(&session->s_cap_lock);
if (session->s_cap_iterator == cap) {
@@ -1091,11 +1096,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
spin_unlock(&session->s_cap_lock);
- /* remove from inode list */
- rb_erase(&cap->ci_node, &ci->i_caps);
- if (ci->i_auth_cap == cap)
- ci->i_auth_cap = NULL;
-
if (removed)
ceph_put_cap(mdsc, cap);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4ca0b8ff9a72..d17a789fd856 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1553,36 +1553,37 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
int valid = 0;
struct dentry *parent;
- struct inode *dir;
+ struct inode *dir, *inode;
if (flags & LOOKUP_RCU) {
parent = READ_ONCE(dentry->d_parent);
dir = d_inode_rcu(parent);
if (!dir)
return -ECHILD;
+ inode = d_inode_rcu(dentry);
} else {
parent = dget_parent(dentry);
dir = d_inode(parent);
+ inode = d_inode(dentry);
}
dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
- dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
+ dentry, inode, ceph_dentry(dentry)->offset);
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
- dentry, d_inode(dentry));
+ dentry, inode);
valid = 1;
- } else if (d_really_is_positive(dentry) &&
- ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
+ } else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
valid = 1;
} else {
valid = dentry_lease_is_valid(dentry, flags);
if (valid == -ECHILD)
return valid;
if (valid || dir_lease_is_valid(dir, dentry)) {
- if (d_really_is_positive(dentry))
- valid = ceph_is_any_caps(d_inode(dentry));
+ if (inode)
+ valid = ceph_is_any_caps(inode);
else
valid = 1;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d277f71abe0b..8de633964dc3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -462,6 +462,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_security_init_secctx(dentry, mode, &as_ctx);
if (err < 0)
goto out_ctx;
+ } else if (!d_in_lookup(dentry)) {
+ /* If it's not being looked up, it's negative */
+ return -ENOENT;
}
/* do the open */
@@ -750,6 +753,9 @@ static void ceph_aio_complete(struct inode *inode,
if (!atomic_dec_and_test(&aio_req->pending_reqs))
return;
+ if (aio_req->iocb->ki_flags & IOCB_DIRECT)
+ inode_dio_end(inode);
+
ret = aio_req->error;
if (!ret)
ret = aio_req->total_len;
@@ -1088,6 +1094,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
CEPH_CAP_FILE_RD);
list_splice(&aio_req->osd_reqs, &osd_reqs);
+ inode_dio_begin(inode);
while (!list_empty(&osd_reqs)) {
req = list_first_entry(&osd_reqs,
struct ceph_osd_request,
@@ -1261,14 +1268,24 @@ again:
dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+ if (iocb->ki_flags & IOCB_DIRECT)
+ ceph_start_io_direct(inode);
+ else
+ ceph_start_io_read(inode);
+
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_CACHE;
ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
&got, &pinned_page);
- if (ret < 0)
+ if (ret < 0) {
+ if (iocb->ki_flags & IOCB_DIRECT)
+ ceph_end_io_direct(inode);
+ else
+ ceph_end_io_read(inode);
return ret;
+ }
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_flags & IOCB_DIRECT) ||
@@ -1280,16 +1297,12 @@ again:
if (ci->i_inline_version == CEPH_INLINE_NONE) {
if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
- ceph_start_io_direct(inode);
ret = ceph_direct_read_write(iocb, to,
NULL, NULL);
- ceph_end_io_direct(inode);
if (ret >= 0 && ret < len)
retry_op = CHECK_EOF;
} else {
- ceph_start_io_read(inode);
ret = ceph_sync_read(iocb, to, &retry_op);
- ceph_end_io_read(inode);
}
} else {
retry_op = READ_INLINE;
@@ -1300,11 +1313,10 @@ again:
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got));
ceph_add_rw_context(fi, &rw_ctx);
- ceph_start_io_read(inode);
ret = generic_file_read_iter(iocb, to);
- ceph_end_io_read(inode);
ceph_del_rw_context(fi, &rw_ctx);
}
+
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
if (pinned_page) {
@@ -1312,6 +1324,12 @@ again:
pinned_page = NULL;
}
ceph_put_cap_refs(ci, got);
+
+ if (iocb->ki_flags & IOCB_DIRECT)
+ ceph_end_io_direct(inode);
+ else
+ ceph_end_io_read(inode);
+
if (retry_op > HAVE_RETRIED && ret >= 0) {
int statret;
struct page *page = NULL;
@@ -1956,10 +1974,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
return -EOPNOTSUPP;
+ /*
+ * Striped file layouts require that we copy partial objects, but the
+ * OSD copy-from operation only supports full-object copies. Limit
+ * this to non-striped file layouts for now.
+ */
if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
- (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) ||
- (src_ci->i_layout.object_size != dst_ci->i_layout.object_size))
+ (src_ci->i_layout.stripe_count != 1) ||
+ (dst_ci->i_layout.stripe_count != 1) ||
+ (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
+ dout("Invalid src/dst files layout\n");
return -EOPNOTSUPP;
+ }
if (len < src_ci->i_layout.object_size)
return -EOPNOTSUPP; /* no remote copy will be done */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9f135624ae47..c07407586ce8 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1434,6 +1434,7 @@ retry_lookup:
dout(" final dn %p\n", dn);
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
req->r_op == CEPH_MDS_OP_MKSNAP) &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
struct inode *dir = req->r_parent;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a8a8f84f3bbf..a5163296d9d9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -384,8 +384,8 @@ static int parse_reply_info_readdir(void **p, void *end,
}
done:
- if (*p != end)
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
bad:
@@ -406,12 +406,10 @@ static int parse_reply_info_filelock(void **p, void *end,
goto bad;
info->filelock_reply = *p;
- *p += sizeof(*info->filelock_reply);
- if (unlikely(*p != end))
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
-
bad:
return -EIO;
}
@@ -425,18 +423,21 @@ static int parse_reply_info_create(void **p, void *end,
{
if (features == (u64)-1 ||
(features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
+ /* Malformed reply? */
if (*p == end) {
info->has_create_ino = false;
} else {
info->has_create_ino = true;
- info->ino = ceph_decode_64(p);
+ ceph_decode_64_safe(p, end, info->ino, bad);
}
+ } else {
+ if (*p != end)
+ goto bad;
}
- if (unlikely(*p != end))
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
-
bad:
return -EIO;
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index edfd643a8205..b47f43fc2d68 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -268,6 +268,7 @@ static int parse_fsopt_token(char *c, void *private)
}
break;
case Opt_fscache_uniq:
+#ifdef CONFIG_CEPH_FSCACHE
kfree(fsopt->fscache_uniq);
fsopt->fscache_uniq = kstrndup(argstr[0].from,
argstr[0].to-argstr[0].from,
@@ -276,7 +277,10 @@ static int parse_fsopt_token(char *c, void *private)
return -ENOMEM;
fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
break;
- /* misc */
+#else
+ pr_err("fscache support is disabled\n");
+ return -EINVAL;
+#endif
case Opt_wsize:
if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE)
return -EINVAL;
@@ -353,10 +357,15 @@ static int parse_fsopt_token(char *c, void *private)
fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
break;
case Opt_fscache:
+#ifdef CONFIG_CEPH_FSCACHE
fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
kfree(fsopt->fscache_uniq);
fsopt->fscache_uniq = NULL;
break;
+#else
+ pr_err("fscache support is disabled\n");
+ return -EINVAL;
+#endif
case Opt_nofscache:
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
kfree(fsopt->fscache_uniq);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2e9c7f493f99..1a135d1b85bd 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -169,18 +169,32 @@ cifs_read_super(struct super_block *sb)
else
sb->s_maxbytes = MAX_NON_LFS;
- /* BB FIXME fix time_gran to be larger for LANMAN sessions */
- sb->s_time_gran = 100;
-
- if (tcon->unix_ext) {
- ts = cifs_NTtimeToUnix(0);
+ /*
+ * Some very old servers like DOS and OS/2 used 2 second granularity
+ * (while all current servers use 100ns granularity - see MS-DTYP)
+ * but 1 second is the maximum allowed granularity for the VFS
+ * so for old servers set time granularity to 1 second while for
+ * everything else (current servers) set it to 100ns.
+ */
+ if ((tcon->ses->server->vals->protocol_id == SMB10_PROT_ID) &&
+ ((tcon->ses->capabilities &
+ tcon->ses->server->vals->cap_nt_find) == 0) &&
+ !tcon->unix_ext) {
+ sb->s_time_gran = 1000000000; /* 1 second is max allowed gran */
+ ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0);
sb->s_time_min = ts.tv_sec;
- ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX));
+ ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX),
+ cpu_to_le16(SMB_TIME_MAX), 0);
sb->s_time_max = ts.tv_sec;
} else {
- ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0);
+ /*
+ * Almost every server, including all SMB2+, uses DCE TIME
+ * ie 100 nanosecond units, since 1601. See MS-DTYP and MS-FSCC
+ */
+ sb->s_time_gran = 100;
+ ts = cifs_NTtimeToUnix(0);
sb->s_time_min = ts.tv_sec;
- ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX), cpu_to_le16(SMB_TIME_MAX), 0);
+ ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX));
sb->s_time_max = ts.tv_sec;
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 2e960e1049db..d78bfcc19156 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1210,7 +1210,7 @@ struct cifs_search_info {
bool smallBuf:1; /* so we know which buf_release function to call */
};
-#define ACL_NO_MODE -1
+#define ACL_NO_MODE ((umode_t)(-1))
struct cifs_open_parms {
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
@@ -1391,6 +1391,11 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
struct cifsInodeInfo {
bool can_cache_brlcks;
struct list_head llist; /* locks helb by this inode */
+ /*
+ * NOTE: Some code paths call down_read(lock_sem) twice, so
+ * we must always use use cifs_down_write() instead of down_write()
+ * for this semaphore to avoid deadlocks.
+ */
struct rw_semaphore lock_sem; /* protect the fields above */
/* BB add in lists for dirty pages i.e. write caching info for oplock */
struct list_head openFileList;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e53e9f62b87b..fe597d3d5208 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -170,6 +170,7 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
+extern void cifs_down_write(struct rw_semaphore *sem);
extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid,
struct file *file,
struct tcon_link *tlink,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2850c3ce4391..ccaa8bad336f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -564,9 +564,11 @@ cifs_reconnect(struct TCP_Server_Info *server)
spin_lock(&GlobalMid_Lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+ kref_get(&mid_entry->refcount);
if (mid_entry->mid_state == MID_REQUEST_SUBMITTED)
mid_entry->mid_state = MID_RETRY_NEEDED;
list_move(&mid_entry->qhead, &retry_list);
+ mid_entry->mid_flags |= MID_DELETED;
}
spin_unlock(&GlobalMid_Lock);
mutex_unlock(&server->srv_mutex);
@@ -576,6 +578,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
+ cifs_mid_q_entry_release(mid_entry);
}
if (cifs_rdma_enabled(server)) {
@@ -895,8 +898,10 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
if (mid->mid_flags & MID_DELETED)
printk_once(KERN_WARNING
"trying to dequeue a deleted mid\n");
- else
+ else {
list_del_init(&mid->qhead);
+ mid->mid_flags |= MID_DELETED;
+ }
spin_unlock(&GlobalMid_Lock);
}
@@ -966,8 +971,10 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid);
+ kref_get(&mid_entry->refcount);
mid_entry->mid_state = MID_SHUTDOWN;
list_move(&mid_entry->qhead, &dispose_list);
+ mid_entry->mid_flags |= MID_DELETED;
}
spin_unlock(&GlobalMid_Lock);
@@ -977,6 +984,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
+ cifs_mid_q_entry_release(mid_entry);
}
/* 1/8th of sec is more than enough time for them to exit */
msleep(125);
@@ -3882,8 +3890,12 @@ generic_ip_connect(struct TCP_Server_Info *server)
rc = socket->ops->connect(socket, saddr, slen,
server->noblockcnt ? O_NONBLOCK : 0);
-
- if (rc == -EINPROGRESS)
+ /*
+ * When mounting SMB root file systems, we do not want to block in
+ * connect. Otherwise bail out and then let cifs_reconnect() perform
+ * reconnect failover - if possible.
+ */
+ if (server->noblockcnt && rc == -EINPROGRESS)
rc = 0;
if (rc < 0) {
cifs_dbg(FYI, "Error %d connecting to server\n", rc);
@@ -4264,7 +4276,7 @@ static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
server->ops->qfs_tcon(*xid, tcon);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) {
if (tcon->fsDevInfo.DeviceCharacteristics &
- FILE_READ_ONLY_DEVICE)
+ cpu_to_le32(FILE_READ_ONLY_DEVICE))
cifs_dbg(VFS, "mounted to read only share\n");
else if ((cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_RW_CACHE) == 0)
@@ -4445,7 +4457,7 @@ static int setup_dfs_tgt_conn(const char *path,
int rc;
struct dfs_info3_param ref = {0};
char *mdata = NULL, *fake_devname = NULL;
- struct smb_vol fake_vol = {0};
+ struct smb_vol fake_vol = {NULL};
cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index dd5ac841aefa..7ce689d31aa2 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -738,10 +738,16 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
static int
cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
+ struct inode *inode;
+
if (flags & LOOKUP_RCU)
return -ECHILD;
if (d_really_is_positive(direntry)) {
+ inode = d_inode(direntry);
+ if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode)))
+ CIFS_I(inode)->time = 0; /* force reval */
+
if (cifs_revalidate_dentry(direntry))
return 0;
else {
@@ -752,7 +758,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
* attributes will have been updated by
* cifs_revalidate_dentry().
*/
- if (IS_AUTOMOUNT(d_inode(direntry)) &&
+ if (IS_AUTOMOUNT(inode) &&
!(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) {
spin_lock(&direntry->d_lock);
direntry->d_flags |= DCACHE_NEED_AUTOMOUNT;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4b95700c507c..fa7b0fa72bb3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -253,6 +253,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
xid, fid);
+ if (rc) {
+ server->ops->close(xid, tcon, fid);
+ if (rc == -ESTALE)
+ rc = -EOPENSTALE;
+ }
+
out:
kfree(buf);
return rc;
@@ -275,6 +281,13 @@ cifs_has_mand_locks(struct cifsInodeInfo *cinode)
return has_locks;
}
+void
+cifs_down_write(struct rw_semaphore *sem)
+{
+ while (!down_write_trylock(sem))
+ msleep(10);
+}
+
struct cifsFileInfo *
cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock)
@@ -300,7 +313,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
INIT_LIST_HEAD(&fdlocks->locks);
fdlocks->cfile = cfile;
cfile->llist = fdlocks;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_add(&fdlocks->llist, &cinode->llist);
up_write(&cinode->lock_sem);
@@ -399,10 +412,11 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
bool oplock_break_cancelled;
spin_lock(&tcon->open_file_lock);
-
+ spin_lock(&cifsi->open_file_lock);
spin_lock(&cifs_file->file_info_lock);
if (--cifs_file->count > 0) {
spin_unlock(&cifs_file->file_info_lock);
+ spin_unlock(&cifsi->open_file_lock);
spin_unlock(&tcon->open_file_lock);
return;
}
@@ -415,9 +429,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
cifs_add_pending_open_locked(&fid, cifs_file->tlink, &open);
/* remove it from the lists */
- spin_lock(&cifsi->open_file_lock);
list_del(&cifs_file->flist);
- spin_unlock(&cifsi->open_file_lock);
list_del(&cifs_file->tlist);
atomic_dec(&tcon->num_local_opens);
@@ -434,6 +446,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
cifs_set_oplock_level(cifsi, 0);
}
+ spin_unlock(&cifsi->open_file_lock);
spin_unlock(&tcon->open_file_lock);
oplock_break_cancelled = wait_oplock_handler ?
@@ -458,7 +471,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
* Delete any outstanding lock records. We'll lose them when the file
* is closed anyway.
*/
- down_write(&cifsi->lock_sem);
+ cifs_down_write(&cifsi->lock_sem);
list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
list_del(&li->llist);
cifs_del_lock_waiters(li);
@@ -1021,7 +1034,7 @@ static void
cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
{
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_add_tail(&lock->llist, &cfile->llist->locks);
up_write(&cinode->lock_sem);
}
@@ -1043,7 +1056,7 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
try_again:
exist = false;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
lock->type, lock->flags, &conf_lock,
@@ -1066,7 +1079,7 @@ try_again:
(lock->blist.next == &lock->blist));
if (!rc)
goto try_again;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_del_init(&lock->blist);
}
@@ -1119,7 +1132,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock)
return rc;
try_again:
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
if (!cinode->can_cache_brlcks) {
up_write(&cinode->lock_sem);
return rc;
@@ -1325,7 +1338,7 @@ cifs_push_locks(struct cifsFileInfo *cfile)
int rc = 0;
/* we are going to update can_cache_brlcks here - need a write access */
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
if (!cinode->can_cache_brlcks) {
up_write(&cinode->lock_sem);
return rc;
@@ -1516,7 +1529,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
if (!buf)
return -ENOMEM;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
for (i = 0; i < 2; i++) {
cur = buf;
num = 0;
@@ -1840,13 +1853,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
{
struct cifsFileInfo *open_file = NULL;
struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
fsuid_only = false;
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
/* we could simply get the first_list_entry since write-only entries
are always at the end of the list but since the first entry might
have a close pending, we go through the whole list */
@@ -1858,7 +1870,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
/* found a good file */
/* lock it so it will not be closed on us */
cifsFileInfo_get(open_file);
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return open_file;
} /* else might as well continue, and look for
another, or simply have the caller reopen it
@@ -1866,7 +1878,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
} else /* write only file */
break; /* write only files are last so must be done */
}
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return NULL;
}
@@ -1877,7 +1889,6 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
{
struct cifsFileInfo *open_file, *inv_file = NULL;
struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *tcon;
bool any_available = false;
int rc = -EBADF;
unsigned int refind = 0;
@@ -1897,16 +1908,15 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
}
cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
- tcon = cifs_sb_master_tcon(cifs_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
fsuid_only = false;
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
refind_writable:
if (refind > MAX_REOPEN_ATT) {
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return rc;
}
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
@@ -1918,7 +1928,7 @@ refind_writable:
if (!open_file->invalidHandle) {
/* found a good writable file */
cifsFileInfo_get(open_file);
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
*ret_file = open_file;
return 0;
} else {
@@ -1938,7 +1948,7 @@ refind_writable:
cifsFileInfo_get(inv_file);
}
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
if (inv_file) {
rc = cifs_reopen_file(inv_file, false);
@@ -1953,7 +1963,7 @@ refind_writable:
cifsFileInfo_put(inv_file);
++refind;
inv_file = NULL;
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
goto refind_writable;
}
@@ -4461,17 +4471,15 @@ static int cifs_readpage(struct file *file, struct page *page)
static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
{
struct cifsFileInfo *open_file;
- struct cifs_tcon *tcon =
- cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb));
- spin_lock(&tcon->open_file_lock);
+ spin_lock(&cifs_inode->open_file_lock);
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return 1;
}
}
- spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_inode->open_file_lock);
return 0;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 3bae2e53f0b8..df9377828e2f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -414,6 +414,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
/* if uniqueid is different, return error */
if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) {
+ CIFS_I(*pinode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgiiu_exit;
}
@@ -421,6 +422,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
/* if filetype is different, return error */
if (unlikely(((*pinode)->i_mode & S_IFMT) !=
(fattr.cf_mode & S_IFMT))) {
+ CIFS_I(*pinode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgiiu_exit;
}
@@ -933,6 +935,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
/* if uniqueid is different, return error */
if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
+ CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgii_exit;
}
@@ -940,6 +943,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
/* if filetype is different, return error */
if (unlikely(((*inode)->i_mode & S_IFMT) !=
(fattr.cf_mode & S_IFMT))) {
+ CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
goto cgii_exit;
}
@@ -2471,9 +2475,9 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid);
cifsFileInfo_put(wfile);
if (rc)
- return rc;
+ goto cifs_setattr_exit;
} else if (rc != -EBADF)
- return rc;
+ goto cifs_setattr_exit;
else
rc = 0;
}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 49c17ee18254..9b41436fb8db 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -117,10 +117,6 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = {
{0, 0}
};
-static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
- {0, 0}
-};
-
/*
* Convert a string containing text IPv4 or IPv6 address to binary form.
*
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index b7421a096319..514810694c0f 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -171,6 +171,9 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
/* we do not want to loop forever */
last_mid = cur_mid;
cur_mid++;
+ /* avoid 0xFFFF MID */
+ if (cur_mid == 0xffff)
+ cur_mid++;
/*
* This nested loop looks more expensive than it is.
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index e6a1fc72018f..8b0b512c5792 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -145,7 +145,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
cur = buf;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
if (flock->fl_start > li->offset ||
(flock->fl_start + length) <
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 4c0922596467..cd55af9b7cc5 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -4084,6 +4084,7 @@ free_pages:
kfree(dw->ppages);
cifs_small_buf_release(dw->buf);
+ kfree(dw);
}
@@ -4157,7 +4158,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
dw->server = server;
dw->ppages = pages;
dw->len = len;
- queue_work(cifsiod_wq, &dw->decrypt);
+ queue_work(decrypt_wq, &dw->decrypt);
*num_mids = 0; /* worker thread takes care of finding mid */
return -1;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 85f9d614d968..05149862aea4 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -751,8 +751,8 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
unsigned int num = *num_iovec;
iov[num].iov_base = create_posix_buf(mode);
- if (mode == -1)
- cifs_dbg(VFS, "illegal mode\n"); /* BB REMOVEME */
+ if (mode == ACL_NO_MODE)
+ cifs_dbg(FYI, "illegal mode\n");
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_posix);
@@ -2521,11 +2521,8 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock,
return rc;
}
- /* TODO: add handling for the mode on create */
- if (oparms->disposition == FILE_CREATE)
- cifs_dbg(VFS, "mode is 0x%x\n", oparms->mode); /* BB REMOVEME */
-
- if ((oparms->disposition == FILE_CREATE) && (oparms->mode != -1)) {
+ if ((oparms->disposition == FILE_CREATE) &&
+ (oparms->mode != ACL_NO_MODE)) {
if (n_iov > 2) {
struct create_context *ccontext =
(struct create_context *)iov[n_iov-1].iov_base;
@@ -3217,7 +3214,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
req->PersistentFileId = persistent_fid;
req->VolatileFileId = volatile_fid;
- req->OutputBufferLength = SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE;
+ req->OutputBufferLength =
+ cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE);
req->CompletionFilter = cpu_to_le32(completion_filter);
if (watch_tree)
req->Flags = cpu_to_le16(SMB2_WATCH_TREE);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index ea735d59c36e..0abfde6d0b05 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -838,6 +838,7 @@ struct create_durable_handle_reconnect_v2 {
struct create_context ccontext;
__u8 Name[8];
struct durable_reconnect_context_v2 dcontext;
+ __u8 Pad[4];
} __packed;
/* See MS-SMB2 2.2.13.2.5 */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index da3a6d580808..71b2930b8e0b 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -150,6 +150,10 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
bool is_fsctl, char *in_data, u32 indatalen,
__u32 max_response_size);
extern void SMB2_ioctl_free(struct smb_rqst *rqst);
+extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, bool watch_tree,
+ u32 completion_filter);
+
extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 308ad0f495e1..ca3de62688d6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -86,22 +86,8 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
static void _cifs_mid_q_entry_release(struct kref *refcount)
{
- struct mid_q_entry *mid = container_of(refcount, struct mid_q_entry,
- refcount);
-
- mempool_free(mid, cifs_mid_poolp);
-}
-
-void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
-{
- spin_lock(&GlobalMid_Lock);
- kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
- spin_unlock(&GlobalMid_Lock);
-}
-
-void
-DeleteMidQEntry(struct mid_q_entry *midEntry)
-{
+ struct mid_q_entry *midEntry =
+ container_of(refcount, struct mid_q_entry, refcount);
#ifdef CONFIG_CIFS_STATS2
__le16 command = midEntry->server->vals->lock_cmd;
__u16 smb_cmd = le16_to_cpu(midEntry->command);
@@ -166,6 +152,19 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
}
}
#endif
+
+ mempool_free(midEntry, cifs_mid_poolp);
+}
+
+void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
+{
+ spin_lock(&GlobalMid_Lock);
+ kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
+ spin_unlock(&GlobalMid_Lock);
+}
+
+void DeleteMidQEntry(struct mid_q_entry *midEntry)
+{
cifs_mid_q_entry_release(midEntry);
}
@@ -173,8 +172,10 @@ void
cifs_delete_mid(struct mid_q_entry *mid)
{
spin_lock(&GlobalMid_Lock);
- list_del_init(&mid->qhead);
- mid->mid_flags |= MID_DELETED;
+ if (!(mid->mid_flags & MID_DELETED)) {
+ list_del_init(&mid->qhead);
+ mid->mid_flags |= MID_DELETED;
+ }
spin_unlock(&GlobalMid_Lock);
DeleteMidQEntry(mid);
@@ -872,7 +873,10 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
rc = -EHOSTDOWN;
break;
default:
- list_del_init(&mid->qhead);
+ if (!(mid->mid_flags & MID_DELETED)) {
+ list_del_init(&mid->qhead);
+ mid->mid_flags |= MID_DELETED;
+ }
cifs_server_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
__func__, mid->mid, mid->mid_state);
rc = -EIO;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index dc5dbf6a81d7..cb61467478ca 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -101,7 +101,7 @@ static int create_link(struct config_item *parent_item,
}
target_sd->s_links++;
spin_unlock(&configfs_dirent_lock);
- ret = configfs_get_target_path(item, item, body);
+ ret = configfs_get_target_path(parent_item, item, body);
if (!ret)
ret = configfs_create_link(target_sd, parent_item->ci_dentry,
dentry, body);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index d12ea28836a5..2f04024c3588 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -958,8 +958,8 @@ static int cramfs_get_tree(struct fs_context *fc)
if (IS_ENABLED(CONFIG_CRAMFS_MTD)) {
ret = get_tree_mtd(fc, cramfs_mtd_fill_super);
- if (ret < 0)
- return ret;
+ if (!ret)
+ return 0;
}
if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV))
ret = get_tree_bdev(fc, cramfs_blkdev_fill_super);
diff --git a/fs/dax.c b/fs/dax.c
index 6bf81f931de3..2cc43cd914eb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -220,10 +220,11 @@ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
for (;;) {
entry = xas_find_conflict(xas);
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+ return entry;
if (dax_entry_order(entry) < order)
return XA_RETRY_ENTRY;
- if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
- !dax_is_locked(entry))
+ if (!dax_is_locked(entry))
return entry;
wq = dax_entry_waitqueue(xas, entry, &ewait.key);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ae196784f487..9329ced91f1d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -241,9 +241,8 @@ void dio_warn_stale_pagecache(struct file *filp)
}
}
-/**
+/*
* dio_complete() - called when all DIO BIO I/O has been completed
- * @offset: the byte offset in the file of the completed operation
*
* This drops i_dio_count, lets interested parties know that a DIO operation
* has completed, and calculates the resulting return code for the operation.
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 18426f4855f1..e23752d9a79f 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -128,13 +128,20 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
struct inode *inode)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
struct dentry *lower_dir_dentry;
+ struct inode *lower_dir_inode;
int rc;
- dget(lower_dentry);
- lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
+ lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+ lower_dir_inode = d_inode(lower_dir_dentry);
+ inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+ dget(lower_dentry); // don't even try to make the lower negative
+ if (lower_dentry->d_parent != lower_dir_dentry)
+ rc = -EINVAL;
+ else if (d_unhashed(lower_dentry))
+ rc = -EINVAL;
+ else
+ rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
if (rc) {
printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
goto out_unlock;
@@ -142,10 +149,11 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
fsstack_copy_attr_times(dir, lower_dir_inode);
set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
inode->i_ctime = dir->i_ctime;
- d_drop(dentry);
out_unlock:
- unlock_dir(lower_dir_dentry);
dput(lower_dentry);
+ inode_unlock(lower_dir_inode);
+ if (!rc)
+ d_drop(dentry);
return rc;
}
@@ -311,9 +319,9 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry)
{
- struct inode *inode, *lower_inode = d_inode(lower_dentry);
+ struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+ struct inode *inode, *lower_inode;
struct ecryptfs_dentry_info *dentry_info;
- struct vfsmount *lower_mnt;
int rc = 0;
dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
@@ -322,16 +330,23 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
return ERR_PTR(-ENOMEM);
}
- lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
fsstack_copy_attr_atime(d_inode(dentry->d_parent),
- d_inode(lower_dentry->d_parent));
+ d_inode(path->dentry));
BUG_ON(!d_count(lower_dentry));
ecryptfs_set_dentry_private(dentry, dentry_info);
- dentry_info->lower_path.mnt = lower_mnt;
+ dentry_info->lower_path.mnt = mntget(path->mnt);
dentry_info->lower_path.dentry = lower_dentry;
- if (d_really_is_negative(lower_dentry)) {
+ /*
+ * negative dentry can go positive under us here - its parent is not
+ * locked. That's OK and that could happen just as we return from
+ * ecryptfs_lookup() anyway. Just need to be careful and fetch
+ * ->d_inode only once - it's not stable here.
+ */
+ lower_inode = READ_ONCE(lower_dentry->d_inode);
+
+ if (!lower_inode) {
/* We want to add because we couldn't find in lower */
d_add(dentry, NULL);
return NULL;
@@ -512,22 +527,30 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct dentry *lower_dentry;
struct dentry *lower_dir_dentry;
+ struct inode *lower_dir_inode;
int rc;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- dget(dentry);
- lower_dir_dentry = lock_parent(lower_dentry);
- dget(lower_dentry);
- rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry);
- dput(lower_dentry);
- if (!rc && d_really_is_positive(dentry))
+ lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+ lower_dir_inode = d_inode(lower_dir_dentry);
+
+ inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+ dget(lower_dentry); // don't even try to make the lower negative
+ if (lower_dentry->d_parent != lower_dir_dentry)
+ rc = -EINVAL;
+ else if (d_unhashed(lower_dentry))
+ rc = -EINVAL;
+ else
+ rc = vfs_rmdir(lower_dir_inode, lower_dentry);
+ if (!rc) {
clear_nlink(d_inode(dentry));
- fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
- set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
- unlock_dir(lower_dir_dentry);
+ fsstack_copy_attr_times(dir, lower_dir_inode);
+ set_nlink(dir, lower_dir_inode->i_nlink);
+ }
+ dput(lower_dentry);
+ inode_unlock(lower_dir_inode);
if (!rc)
d_drop(dentry);
- dput(dentry);
return rc;
}
@@ -565,20 +588,22 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *lower_new_dentry;
struct dentry *lower_old_dir_dentry;
struct dentry *lower_new_dir_dentry;
- struct dentry *trap = NULL;
+ struct dentry *trap;
struct inode *target_inode;
if (flags)
return -EINVAL;
+ lower_old_dir_dentry = ecryptfs_dentry_to_lower(old_dentry->d_parent);
+ lower_new_dir_dentry = ecryptfs_dentry_to_lower(new_dentry->d_parent);
+
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
- dget(lower_old_dentry);
- dget(lower_new_dentry);
- lower_old_dir_dentry = dget_parent(lower_old_dentry);
- lower_new_dir_dentry = dget_parent(lower_new_dentry);
+
target_inode = d_inode(new_dentry);
+
trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ dget(lower_new_dentry);
rc = -EINVAL;
if (lower_old_dentry->d_parent != lower_old_dir_dentry)
goto out_lock;
@@ -606,11 +631,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_dir != old_dir)
fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry));
out_lock:
- unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
- dput(lower_new_dir_dentry);
- dput(lower_old_dir_dentry);
dput(lower_new_dentry);
- dput(lower_old_dentry);
+ unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
return rc;
}
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 09bc68708d28..2dd55b172d57 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -519,26 +519,33 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
* inode is actually connected to the parent.
*/
err = exportfs_get_name(mnt, target_dir, nbuf, result);
- if (!err) {
- inode_lock(target_dir->d_inode);
- nresult = lookup_one_len(nbuf, target_dir,
- strlen(nbuf));
- inode_unlock(target_dir->d_inode);
- if (!IS_ERR(nresult)) {
- if (nresult->d_inode) {
- dput(result);
- result = nresult;
- } else
- dput(nresult);
- }
+ if (err) {
+ dput(target_dir);
+ goto err_result;
}
+ inode_lock(target_dir->d_inode);
+ nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf));
+ if (!IS_ERR(nresult)) {
+ if (unlikely(nresult->d_inode != result->d_inode)) {
+ dput(nresult);
+ nresult = ERR_PTR(-ESTALE);
+ }
+ }
+ inode_unlock(target_dir->d_inode);
/*
* At this point we are done with the parent, but it's pinned
* by the child dentry anyway.
*/
dput(target_dir);
+ if (IS_ERR(nresult)) {
+ err = PTR_ERR(nresult);
+ goto err_result;
+ }
+ dput(result);
+ result = nresult;
+
/*
* And finally make sure the dentry is actually acceptable
* to NFSD.
diff --git a/fs/file.c b/fs/file.c
index 3da91a112bab..b241ea7f1aa4 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -795,7 +795,7 @@ unsigned long __fdget_pos(unsigned int fd)
unsigned long v = __fdget(fd);
struct file *file = (struct file *)(v & ~3);
- if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
+ if (file && !(file->f_mode & FMODE_STREAM)) {
if (file_count(file) > 1) {
v |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8aaa7eec7b74..335607b8c5c0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -164,8 +164,13 @@ static void finish_writeback_work(struct bdi_writeback *wb,
if (work->auto_free)
kfree(work);
- if (done && atomic_dec_and_test(&done->cnt))
- wake_up_all(done->waitq);
+ if (done) {
+ wait_queue_head_t *waitq = done->waitq;
+
+ /* @done can't be accessed after the following dec */
+ if (atomic_dec_and_test(&done->cnt))
+ wake_up_all(waitq);
+ }
}
static void wb_queue_work(struct bdi_writeback *wb,
@@ -571,10 +576,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
spin_unlock(&inode->i_lock);
/*
- * A dying wb indicates that the memcg-blkcg mapping has changed
- * and a new wb is already serving the memcg. Switch immediately.
+ * A dying wb indicates that either the blkcg associated with the
+ * memcg changed or the associated memcg is dying. In the first
+ * case, a replacement wb should already be available and we should
+ * refresh the wb immediately. In the second case, trying to
+ * refresh will keep failing.
*/
- if (unlikely(wb_dying(wbc->wb)))
+ if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
inode_switch_wbs(inode, wbc->wb_id);
}
EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
@@ -900,7 +908,7 @@ restart:
* cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
* @bdi_id: target bdi id
* @memcg_id: target memcg css id
- * @nr_pages: number of pages to write, 0 for best-effort dirty flushing
+ * @nr: number of pages to write, 0 for best-effort dirty flushing
* @reason: reason why some writeback work initiated
* @done: target wb_completion
*
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 6419a2b3510d..3e8cebfb59b7 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -5,6 +5,7 @@
obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
-obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o
+obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
+virtiofs-y += virtio_fs.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index dadd617d826c..ed1abc9e33cf 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -276,10 +276,12 @@ static void flush_bg_queue(struct fuse_conn *fc)
void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
{
struct fuse_iqueue *fiq = &fc->iq;
- bool async = req->args->end;
+ bool async;
if (test_and_set_bit(FR_FINISHED, &req->flags))
goto put_request;
+
+ async = req->args->end;
/*
* test_and_set_bit() implies smp_mb() between bit
* changing and below intr_entry check. Pairs with
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d572c900bb0f..54d638f9ba1c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -405,7 +405,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
else
fuse_invalidate_entry_cache(entry);
- fuse_advise_use_readdirplus(dir);
+ if (inode)
+ fuse_advise_use_readdirplus(dir);
return newent;
out_iput:
@@ -1521,6 +1522,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
is_truncate = true;
}
+ /* Flush dirty data/metadata before non-truncate SETATTR */
+ if (is_wb && S_ISREG(inode->i_mode) &&
+ attr->ia_valid &
+ (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET |
+ ATTR_TIMES_SET)) {
+ err = write_inode_now(inode, true);
+ if (err)
+ return err;
+
+ fuse_set_nowrite(inode);
+ fuse_release_nowrite(inode);
+ }
+
if (is_truncate) {
fuse_set_nowrite(inode);
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0f0225686aee..db48a5cf8620 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -217,7 +217,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
- bool lock_inode = (file->f_flags & O_TRUNC) &&
+ bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc &&
fc->writeback_cache;
@@ -225,16 +225,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (err)
return err;
- if (lock_inode)
+ if (is_wb_truncate) {
inode_lock(inode);
+ fuse_set_nowrite(inode);
+ }
err = fuse_do_open(fc, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
- if (lock_inode)
+ if (is_wb_truncate) {
+ fuse_release_nowrite(inode);
inode_unlock(inode);
+ }
return err;
}
@@ -1997,7 +2001,7 @@ static int fuse_writepages_fill(struct page *page,
if (!data->ff) {
err = -EIO;
- data->ff = fuse_write_file_get(fc, get_fuse_inode(inode));
+ data->ff = fuse_write_file_get(fc, fi);
if (!data->ff)
goto out_unlock;
}
@@ -2042,8 +2046,6 @@ static int fuse_writepages_fill(struct page *page,
* under writeback, so we can release the page lock.
*/
if (data->wpa == NULL) {
- struct fuse_inode *fi = get_fuse_inode(inode);
-
err = -ENOMEM;
wpa = fuse_writepage_args_alloc();
if (!wpa) {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 956aeaf961ae..d148188cfca4 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -479,6 +479,7 @@ struct fuse_fs_context {
bool destroy:1;
bool no_control:1;
bool no_force_umount:1;
+ bool no_mount_options:1;
unsigned int max_read;
unsigned int blksize;
const char *subtype;
@@ -713,6 +714,9 @@ struct fuse_conn {
/** Do not allow MNT_FORCE umount */
unsigned int no_force_umount:1;
+ /* Do not show mount options */
+ unsigned int no_mount_options:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e040e2a2b621..16aec32f7f3d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -558,6 +558,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
struct super_block *sb = root->d_sb;
struct fuse_conn *fc = get_fuse_conn_super(sb);
+ if (fc->no_mount_options)
+ return 0;
+
seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
if (fc->default_permissions)
@@ -1180,6 +1183,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
fc->destroy = ctx->destroy;
fc->no_control = ctx->no_control;
fc->no_force_umount = ctx->no_force_umount;
+ fc->no_mount_options = ctx->no_mount_options;
err = -ENOMEM;
root = fuse_get_root_inode(sb, ctx->rootmode);
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 6af3f131e468..a5c86048b96e 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -30,6 +30,7 @@ struct virtio_fs_vq {
struct virtqueue *vq; /* protected by ->lock */
struct work_struct done_work;
struct list_head queued_reqs;
+ struct list_head end_reqs; /* End these requests */
struct delayed_work dispatch_work;
struct fuse_dev *fud;
bool connected;
@@ -54,6 +55,9 @@ struct virtio_fs_forget {
struct list_head list;
};
+static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
+ struct fuse_req *req, bool in_flight);
+
static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
{
struct virtio_fs *fs = vq->vdev->priv;
@@ -66,6 +70,19 @@ static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq)
return &vq_to_fsvq(vq)->fud->pq;
}
+/* Should be called with fsvq->lock held. */
+static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq)
+{
+ fsvq->in_flight++;
+}
+
+/* Should be called with fsvq->lock held. */
+static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq)
+{
+ WARN_ON(fsvq->in_flight <= 0);
+ fsvq->in_flight--;
+}
+
static void release_virtio_fs_obj(struct kref *ref)
{
struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount);
@@ -109,22 +126,6 @@ static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
flush_delayed_work(&fsvq->dispatch_work);
}
-static inline void drain_hiprio_queued_reqs(struct virtio_fs_vq *fsvq)
-{
- struct virtio_fs_forget *forget;
-
- spin_lock(&fsvq->lock);
- while (1) {
- forget = list_first_entry_or_null(&fsvq->queued_reqs,
- struct virtio_fs_forget, list);
- if (!forget)
- break;
- list_del(&forget->list);
- kfree(forget);
- }
- spin_unlock(&fsvq->lock);
-}
-
static void virtio_fs_drain_all_queues(struct virtio_fs *fs)
{
struct virtio_fs_vq *fsvq;
@@ -132,9 +133,6 @@ static void virtio_fs_drain_all_queues(struct virtio_fs *fs)
for (i = 0; i < fs->nvqs; i++) {
fsvq = &fs->vqs[i];
- if (i == VQ_HIPRIO)
- drain_hiprio_queued_reqs(fsvq);
-
virtio_fs_drain_queue(fsvq);
}
}
@@ -253,14 +251,66 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work)
while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
kfree(req);
- fsvq->in_flight--;
+ dec_in_flight_req(fsvq);
}
} while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
spin_unlock(&fsvq->lock);
}
-static void virtio_fs_dummy_dispatch_work(struct work_struct *work)
+static void virtio_fs_request_dispatch_work(struct work_struct *work)
{
+ struct fuse_req *req;
+ struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+ dispatch_work.work);
+ struct fuse_conn *fc = fsvq->fud->fc;
+ int ret;
+
+ pr_debug("virtio-fs: worker %s called.\n", __func__);
+ while (1) {
+ spin_lock(&fsvq->lock);
+ req = list_first_entry_or_null(&fsvq->end_reqs, struct fuse_req,
+ list);
+ if (!req) {
+ spin_unlock(&fsvq->lock);
+ break;
+ }
+
+ list_del_init(&req->list);
+ spin_unlock(&fsvq->lock);
+ fuse_request_end(fc, req);
+ }
+
+ /* Dispatch pending requests */
+ while (1) {
+ spin_lock(&fsvq->lock);
+ req = list_first_entry_or_null(&fsvq->queued_reqs,
+ struct fuse_req, list);
+ if (!req) {
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+ list_del_init(&req->list);
+ spin_unlock(&fsvq->lock);
+
+ ret = virtio_fs_enqueue_req(fsvq, req, true);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOSPC) {
+ spin_lock(&fsvq->lock);
+ list_add_tail(&req->list, &fsvq->queued_reqs);
+ schedule_delayed_work(&fsvq->dispatch_work,
+ msecs_to_jiffies(1));
+ spin_unlock(&fsvq->lock);
+ return;
+ }
+ req->out.h.error = ret;
+ spin_lock(&fsvq->lock);
+ dec_in_flight_req(fsvq);
+ spin_unlock(&fsvq->lock);
+ pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n",
+ ret);
+ fuse_request_end(fc, req);
+ }
+ }
}
static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
@@ -286,6 +336,7 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
list_del(&forget->list);
if (!fsvq->connected) {
+ dec_in_flight_req(fsvq);
spin_unlock(&fsvq->lock);
kfree(forget);
continue;
@@ -307,13 +358,13 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
} else {
pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
ret);
+ dec_in_flight_req(fsvq);
kfree(forget);
}
spin_unlock(&fsvq->lock);
return;
}
- fsvq->in_flight++;
notify = virtqueue_kick_prepare(vq);
spin_unlock(&fsvq->lock);
@@ -452,7 +503,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
fuse_request_end(fc, req);
spin_lock(&fsvq->lock);
- fsvq->in_flight--;
+ dec_in_flight_req(fsvq);
spin_unlock(&fsvq->lock);
}
}
@@ -502,6 +553,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
+ INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs);
INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
virtio_fs_hiprio_dispatch_work);
spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
@@ -511,8 +563,9 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
spin_lock_init(&fs->vqs[i].lock);
INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
- virtio_fs_dummy_dispatch_work);
+ virtio_fs_request_dispatch_work);
INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
+ INIT_LIST_HEAD(&fs->vqs[i].end_reqs);
snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
"requests.%u", i - VQ_REQUEST);
callbacks[i] = virtio_fs_vq_done;
@@ -708,6 +761,7 @@ __releases(fiq->lock)
list_add_tail(&forget->list, &fsvq->queued_reqs);
schedule_delayed_work(&fsvq->dispatch_work,
msecs_to_jiffies(1));
+ inc_in_flight_req(fsvq);
} else {
pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
ret);
@@ -717,7 +771,7 @@ __releases(fiq->lock)
goto out;
}
- fsvq->in_flight++;
+ inc_in_flight_req(fsvq);
notify = virtqueue_kick_prepare(vq);
spin_unlock(&fsvq->lock);
@@ -819,7 +873,7 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg,
/* Add a request to a virtqueue and kick the device */
static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
- struct fuse_req *req)
+ struct fuse_req *req, bool in_flight)
{
/* requests need at least 4 elements */
struct scatterlist *stack_sgs[6];
@@ -835,6 +889,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
unsigned int i;
int ret;
bool notify;
+ struct fuse_pqueue *fpq;
/* Does the sglist fit on the stack? */
total_sgs = sg_count_fuse_req(req);
@@ -889,7 +944,17 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
goto out;
}
- fsvq->in_flight++;
+ /* Request successfully sent. */
+ fpq = &fsvq->fud->pq;
+ spin_lock(&fpq->lock);
+ list_add_tail(&req->list, fpq->processing);
+ spin_unlock(&fpq->lock);
+ set_bit(FR_SENT, &req->flags);
+ /* matches barrier in request_wait_answer() */
+ smp_mb__after_atomic();
+
+ if (!in_flight)
+ inc_in_flight_req(fsvq);
notify = virtqueue_kick_prepare(vq);
spin_unlock(&fsvq->lock);
@@ -915,9 +980,8 @@ __releases(fiq->lock)
{
unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
struct virtio_fs *fs;
- struct fuse_conn *fc;
struct fuse_req *req;
- struct fuse_pqueue *fpq;
+ struct virtio_fs_vq *fsvq;
int ret;
WARN_ON(list_empty(&fiq->pending));
@@ -928,44 +992,36 @@ __releases(fiq->lock)
spin_unlock(&fiq->lock);
fs = fiq->priv;
- fc = fs->vqs[queue_id].fud->fc;
pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
__func__, req->in.h.opcode, req->in.h.unique,
req->in.h.nodeid, req->in.h.len,
fuse_len_args(req->args->out_numargs, req->args->out_args));
- fpq = &fs->vqs[queue_id].fud->pq;
- spin_lock(&fpq->lock);
- if (!fpq->connected) {
- spin_unlock(&fpq->lock);
- req->out.h.error = -ENODEV;
- pr_err("virtio-fs: %s disconnected\n", __func__);
- fuse_request_end(fc, req);
- return;
- }
- list_add_tail(&req->list, fpq->processing);
- spin_unlock(&fpq->lock);
- set_bit(FR_SENT, &req->flags);
- /* matches barrier in request_wait_answer() */
- smp_mb__after_atomic();
-
-retry:
- ret = virtio_fs_enqueue_req(&fs->vqs[queue_id], req);
+ fsvq = &fs->vqs[queue_id];
+ ret = virtio_fs_enqueue_req(fsvq, req, false);
if (ret < 0) {
if (ret == -ENOMEM || ret == -ENOSPC) {
- /* Virtqueue full. Retry submission */
- /* TODO use completion instead of timeout */
- usleep_range(20, 30);
- goto retry;
+ /*
+ * Virtqueue full. Retry submission from worker
+ * context as we might be holding fc->bg_lock.
+ */
+ spin_lock(&fsvq->lock);
+ list_add_tail(&req->list, &fsvq->queued_reqs);
+ inc_in_flight_req(fsvq);
+ schedule_delayed_work(&fsvq->dispatch_work,
+ msecs_to_jiffies(1));
+ spin_unlock(&fsvq->lock);
+ return;
}
req->out.h.error = ret;
pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret);
- spin_lock(&fpq->lock);
- clear_bit(FR_SENT, &req->flags);
- list_del_init(&req->list);
- spin_unlock(&fpq->lock);
- fuse_request_end(fc, req);
+
+ /* Can't end request in submission context. Use a worker */
+ spin_lock(&fsvq->lock);
+ list_add_tail(&req->list, &fsvq->end_reqs);
+ schedule_delayed_work(&fsvq->dispatch_work, 0);
+ spin_unlock(&fsvq->lock);
return;
}
}
@@ -992,6 +1048,7 @@ static int virtio_fs_fill_super(struct super_block *sb)
.destroy = true,
.no_control = true,
.no_force_umount = true,
+ .no_mount_options = true,
};
mutex_lock(&virtio_fs_mutex);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 681b44682b0d..18daf494abab 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1540,17 +1540,23 @@ static int gfs2_init_fs_context(struct fs_context *fc)
{
struct gfs2_args *args;
- args = kzalloc(sizeof(*args), GFP_KERNEL);
+ args = kmalloc(sizeof(*args), GFP_KERNEL);
if (args == NULL)
return -ENOMEM;
- args->ar_quota = GFS2_QUOTA_DEFAULT;
- args->ar_data = GFS2_DATA_DEFAULT;
- args->ar_commit = 30;
- args->ar_statfs_quantum = 30;
- args->ar_quota_quantum = 60;
- args->ar_errors = GFS2_ERRORS_DEFAULT;
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct gfs2_sbd *sdp = fc->root->d_sb->s_fs_info;
+ *args = sdp->sd_args;
+ } else {
+ memset(args, 0, sizeof(*args));
+ args->ar_quota = GFS2_QUOTA_DEFAULT;
+ args->ar_data = GFS2_DATA_DEFAULT;
+ args->ar_commit = 30;
+ args->ar_statfs_quantum = 30;
+ args->ar_quota_quantum = 60;
+ args->ar_errors = GFS2_ERRORS_DEFAULT;
+ }
fc->fs_private = args;
fc->ops = &gfs2_context_ops;
return 0;
@@ -1600,6 +1606,7 @@ static int gfs2_meta_get_tree(struct fs_context *fc)
}
static const struct fs_context_operations gfs2_meta_context_ops = {
+ .free = gfs2_fc_free,
.get_tree = gfs2_meta_get_tree,
};
diff --git a/fs/io-wq.c b/fs/io-wq.c
new file mode 100644
index 000000000000..9174007ce107
--- /dev/null
+++ b/fs/io-wq.c
@@ -0,0 +1,1065 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Basic worker thread pool for io_uring
+ *
+ * Copyright (C) 2019 Jens Axboe
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/sched/signal.h>
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/sched/mm.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/rculist_nulls.h>
+
+#include "io-wq.h"
+
+#define WORKER_IDLE_TIMEOUT (5 * HZ)
+
+enum {
+ IO_WORKER_F_UP = 1, /* up and active */
+ IO_WORKER_F_RUNNING = 2, /* account as running */
+ IO_WORKER_F_FREE = 4, /* worker on free list */
+ IO_WORKER_F_EXITING = 8, /* worker exiting */
+ IO_WORKER_F_FIXED = 16, /* static idle worker */
+ IO_WORKER_F_BOUND = 32, /* is doing bounded work */
+};
+
+enum {
+ IO_WQ_BIT_EXIT = 0, /* wq exiting */
+ IO_WQ_BIT_CANCEL = 1, /* cancel work on list */
+};
+
+enum {
+ IO_WQE_FLAG_STALLED = 1, /* stalled on hash */
+};
+
+/*
+ * One for each thread in a wqe pool
+ */
+struct io_worker {
+ refcount_t ref;
+ unsigned flags;
+ struct hlist_nulls_node nulls_node;
+ struct list_head all_list;
+ struct task_struct *task;
+ wait_queue_head_t wait;
+ struct io_wqe *wqe;
+
+ struct io_wq_work *cur_work;
+ spinlock_t lock;
+
+ struct rcu_head rcu;
+ struct mm_struct *mm;
+ struct files_struct *restore_files;
+};
+
+#if BITS_PER_LONG == 64
+#define IO_WQ_HASH_ORDER 6
+#else
+#define IO_WQ_HASH_ORDER 5
+#endif
+
+struct io_wqe_acct {
+ unsigned nr_workers;
+ unsigned max_workers;
+ atomic_t nr_running;
+};
+
+enum {
+ IO_WQ_ACCT_BOUND,
+ IO_WQ_ACCT_UNBOUND,
+};
+
+/*
+ * Per-node worker thread pool
+ */
+struct io_wqe {
+ struct {
+ spinlock_t lock;
+ struct list_head work_list;
+ unsigned long hash_map;
+ unsigned flags;
+ } ____cacheline_aligned_in_smp;
+
+ int node;
+ struct io_wqe_acct acct[2];
+
+ struct hlist_nulls_head free_list;
+ struct hlist_nulls_head busy_list;
+ struct list_head all_list;
+
+ struct io_wq *wq;
+};
+
+/*
+ * Per io_wq state
+ */
+struct io_wq {
+ struct io_wqe **wqes;
+ unsigned long state;
+ unsigned nr_wqes;
+
+ get_work_fn *get_work;
+ put_work_fn *put_work;
+
+ struct task_struct *manager;
+ struct user_struct *user;
+ struct mm_struct *mm;
+ refcount_t refs;
+ struct completion done;
+};
+
+static bool io_worker_get(struct io_worker *worker)
+{
+ return refcount_inc_not_zero(&worker->ref);
+}
+
+static void io_worker_release(struct io_worker *worker)
+{
+ if (refcount_dec_and_test(&worker->ref))
+ wake_up_process(worker->task);
+}
+
+/*
+ * Note: drops the wqe->lock if returning true! The caller must re-acquire
+ * the lock in that case. Some callers need to restart handling if this
+ * happens, so we can't just re-acquire the lock on behalf of the caller.
+ */
+static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
+{
+ bool dropped_lock = false;
+
+ if (current->files != worker->restore_files) {
+ __acquire(&wqe->lock);
+ spin_unlock_irq(&wqe->lock);
+ dropped_lock = true;
+
+ task_lock(current);
+ current->files = worker->restore_files;
+ task_unlock(current);
+ }
+
+ /*
+ * If we have an active mm, we need to drop the wq lock before unusing
+ * it. If we do, return true and let the caller retry the idle loop.
+ */
+ if (worker->mm) {
+ if (!dropped_lock) {
+ __acquire(&wqe->lock);
+ spin_unlock_irq(&wqe->lock);
+ dropped_lock = true;
+ }
+ __set_current_state(TASK_RUNNING);
+ set_fs(KERNEL_DS);
+ unuse_mm(worker->mm);
+ mmput(worker->mm);
+ worker->mm = NULL;
+ }
+
+ return dropped_lock;
+}
+
+static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
+ struct io_wq_work *work)
+{
+ if (work->flags & IO_WQ_WORK_UNBOUND)
+ return &wqe->acct[IO_WQ_ACCT_UNBOUND];
+
+ return &wqe->acct[IO_WQ_ACCT_BOUND];
+}
+
+static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
+ struct io_worker *worker)
+{
+ if (worker->flags & IO_WORKER_F_BOUND)
+ return &wqe->acct[IO_WQ_ACCT_BOUND];
+
+ return &wqe->acct[IO_WQ_ACCT_UNBOUND];
+}
+
+static void io_worker_exit(struct io_worker *worker)
+{
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+ unsigned nr_workers;
+
+ /*
+ * If we're not at zero, someone else is holding a brief reference
+ * to the worker. Wait for that to go away.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!refcount_dec_and_test(&worker->ref))
+ schedule();
+ __set_current_state(TASK_RUNNING);
+
+ preempt_disable();
+ current->flags &= ~PF_IO_WORKER;
+ if (worker->flags & IO_WORKER_F_RUNNING)
+ atomic_dec(&acct->nr_running);
+ if (!(worker->flags & IO_WORKER_F_BOUND))
+ atomic_dec(&wqe->wq->user->processes);
+ worker->flags = 0;
+ preempt_enable();
+
+ spin_lock_irq(&wqe->lock);
+ hlist_nulls_del_rcu(&worker->nulls_node);
+ list_del_rcu(&worker->all_list);
+ if (__io_worker_unuse(wqe, worker)) {
+ __release(&wqe->lock);
+ spin_lock_irq(&wqe->lock);
+ }
+ acct->nr_workers--;
+ nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers +
+ wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers;
+ spin_unlock_irq(&wqe->lock);
+
+ /* all workers gone, wq exit can proceed */
+ if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs))
+ complete(&wqe->wq->done);
+
+ kfree_rcu(worker, rcu);
+}
+
+static inline bool io_wqe_run_queue(struct io_wqe *wqe)
+ __must_hold(wqe->lock)
+{
+ if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED))
+ return true;
+ return false;
+}
+
+/*
+ * Check head of free list for an available worker. If one isn't available,
+ * caller must wake up the wq manager to create one.
+ */
+static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
+ __must_hold(RCU)
+{
+ struct hlist_nulls_node *n;
+ struct io_worker *worker;
+
+ n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
+ if (is_a_nulls(n))
+ return false;
+
+ worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
+ if (io_worker_get(worker)) {
+ wake_up(&worker->wait);
+ io_worker_release(worker);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We need a worker. If we find a free one, we're good. If not, and we're
+ * below the max number of workers, wake up the manager to create one.
+ */
+static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
+{
+ bool ret;
+
+ /*
+ * Most likely an attempt to queue unbounded work on an io_wq that
+ * wasn't setup with any unbounded workers.
+ */
+ WARN_ON_ONCE(!acct->max_workers);
+
+ rcu_read_lock();
+ ret = io_wqe_activate_free_worker(wqe);
+ rcu_read_unlock();
+
+ if (!ret && acct->nr_workers < acct->max_workers)
+ wake_up_process(wqe->wq->manager);
+}
+
+static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker)
+{
+ struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+
+ atomic_inc(&acct->nr_running);
+}
+
+static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker)
+ __must_hold(wqe->lock)
+{
+ struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+
+ if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
+ io_wqe_wake_worker(wqe, acct);
+}
+
+static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
+{
+ allow_kernel_signal(SIGINT);
+
+ current->flags |= PF_IO_WORKER;
+
+ worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+ worker->restore_files = current->files;
+ io_wqe_inc_running(wqe, worker);
+}
+
+/*
+ * Worker will start processing some work. Move it to the busy list, if
+ * it's currently on the freelist
+ */
+static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
+ struct io_wq_work *work)
+ __must_hold(wqe->lock)
+{
+ bool worker_bound, work_bound;
+
+ if (worker->flags & IO_WORKER_F_FREE) {
+ worker->flags &= ~IO_WORKER_F_FREE;
+ hlist_nulls_del_init_rcu(&worker->nulls_node);
+ hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list);
+ }
+
+ /*
+ * If worker is moving from bound to unbound (or vice versa), then
+ * ensure we update the running accounting.
+ */
+ worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
+ work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
+ if (worker_bound != work_bound) {
+ io_wqe_dec_running(wqe, worker);
+ if (work_bound) {
+ worker->flags |= IO_WORKER_F_BOUND;
+ wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
+ wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
+ atomic_dec(&wqe->wq->user->processes);
+ } else {
+ worker->flags &= ~IO_WORKER_F_BOUND;
+ wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
+ wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
+ atomic_inc(&wqe->wq->user->processes);
+ }
+ io_wqe_inc_running(wqe, worker);
+ }
+}
+
+/*
+ * No work, worker going to sleep. Move to freelist, and unuse mm if we
+ * have one attached. Dropping the mm may potentially sleep, so we drop
+ * the lock in that case and return success. Since the caller has to
+ * retry the loop in that case (we changed task state), we don't regrab
+ * the lock if we return success.
+ */
+static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
+ __must_hold(wqe->lock)
+{
+ if (!(worker->flags & IO_WORKER_F_FREE)) {
+ worker->flags |= IO_WORKER_F_FREE;
+ hlist_nulls_del_init_rcu(&worker->nulls_node);
+ hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
+ }
+
+ return __io_worker_unuse(wqe, worker);
+}
+
+static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
+ __must_hold(wqe->lock)
+{
+ struct io_wq_work *work;
+
+ list_for_each_entry(work, &wqe->work_list, list) {
+ /* not hashed, can run anytime */
+ if (!(work->flags & IO_WQ_WORK_HASHED)) {
+ list_del(&work->list);
+ return work;
+ }
+
+ /* hashed, can run if not already running */
+ *hash = work->flags >> IO_WQ_HASH_SHIFT;
+ if (!(wqe->hash_map & BIT_ULL(*hash))) {
+ wqe->hash_map |= BIT_ULL(*hash);
+ list_del(&work->list);
+ return work;
+ }
+ }
+
+ return NULL;
+}
+
+static void io_worker_handle_work(struct io_worker *worker)
+ __releases(wqe->lock)
+{
+ struct io_wq_work *work, *old_work = NULL, *put_work = NULL;
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
+
+ do {
+ unsigned hash = -1U;
+
+ /*
+ * If we got some work, mark us as busy. If we didn't, but
+ * the list isn't empty, it means we stalled on hashed work.
+ * Mark us stalled so we don't keep looking for work when we
+ * can't make progress, any work completion or insertion will
+ * clear the stalled flag.
+ */
+ work = io_get_next_work(wqe, &hash);
+ if (work)
+ __io_worker_busy(wqe, worker, work);
+ else if (!list_empty(&wqe->work_list))
+ wqe->flags |= IO_WQE_FLAG_STALLED;
+
+ spin_unlock_irq(&wqe->lock);
+ if (put_work && wq->put_work)
+ wq->put_work(old_work);
+ if (!work)
+ break;
+next:
+ /* flush any pending signals before assigning new work */
+ if (signal_pending(current))
+ flush_signals(current);
+
+ spin_lock_irq(&worker->lock);
+ worker->cur_work = work;
+ spin_unlock_irq(&worker->lock);
+
+ if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
+ current->files != work->files) {
+ task_lock(current);
+ current->files = work->files;
+ task_unlock(current);
+ }
+ if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
+ wq->mm && mmget_not_zero(wq->mm)) {
+ use_mm(wq->mm);
+ set_fs(USER_DS);
+ worker->mm = wq->mm;
+ }
+ if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
+ work->flags |= IO_WQ_WORK_CANCEL;
+ if (worker->mm)
+ work->flags |= IO_WQ_WORK_HAS_MM;
+
+ if (wq->get_work && !(work->flags & IO_WQ_WORK_INTERNAL)) {
+ put_work = work;
+ wq->get_work(work);
+ }
+
+ old_work = work;
+ work->func(&work);
+
+ spin_lock_irq(&worker->lock);
+ worker->cur_work = NULL;
+ spin_unlock_irq(&worker->lock);
+
+ spin_lock_irq(&wqe->lock);
+
+ if (hash != -1U) {
+ wqe->hash_map &= ~BIT_ULL(hash);
+ wqe->flags &= ~IO_WQE_FLAG_STALLED;
+ }
+ if (work && work != old_work) {
+ spin_unlock_irq(&wqe->lock);
+
+ if (put_work && wq->put_work) {
+ wq->put_work(put_work);
+ put_work = NULL;
+ }
+
+ /* dependent work not hashed */
+ hash = -1U;
+ goto next;
+ }
+ } while (1);
+}
+
+static int io_wqe_worker(void *data)
+{
+ struct io_worker *worker = data;
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
+ DEFINE_WAIT(wait);
+
+ io_worker_start(wqe, worker);
+
+ while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
+ prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE);
+
+ spin_lock_irq(&wqe->lock);
+ if (io_wqe_run_queue(wqe)) {
+ __set_current_state(TASK_RUNNING);
+ io_worker_handle_work(worker);
+ continue;
+ }
+ /* drops the lock on success, retry */
+ if (__io_worker_idle(wqe, worker)) {
+ __release(&wqe->lock);
+ continue;
+ }
+ spin_unlock_irq(&wqe->lock);
+ if (signal_pending(current))
+ flush_signals(current);
+ if (schedule_timeout(WORKER_IDLE_TIMEOUT))
+ continue;
+ /* timed out, exit unless we're the fixed worker */
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
+ !(worker->flags & IO_WORKER_F_FIXED))
+ break;
+ }
+
+ finish_wait(&worker->wait, &wait);
+
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
+ spin_lock_irq(&wqe->lock);
+ if (!list_empty(&wqe->work_list))
+ io_worker_handle_work(worker);
+ else
+ spin_unlock_irq(&wqe->lock);
+ }
+
+ io_worker_exit(worker);
+ return 0;
+}
+
+/*
+ * Called when a worker is scheduled in. Mark us as currently running.
+ */
+void io_wq_worker_running(struct task_struct *tsk)
+{
+ struct io_worker *worker = kthread_data(tsk);
+ struct io_wqe *wqe = worker->wqe;
+
+ if (!(worker->flags & IO_WORKER_F_UP))
+ return;
+ if (worker->flags & IO_WORKER_F_RUNNING)
+ return;
+ worker->flags |= IO_WORKER_F_RUNNING;
+ io_wqe_inc_running(wqe, worker);
+}
+
+/*
+ * Called when worker is going to sleep. If there are no workers currently
+ * running and we have work pending, wake up a free one or have the manager
+ * set one up.
+ */
+void io_wq_worker_sleeping(struct task_struct *tsk)
+{
+ struct io_worker *worker = kthread_data(tsk);
+ struct io_wqe *wqe = worker->wqe;
+
+ if (!(worker->flags & IO_WORKER_F_UP))
+ return;
+ if (!(worker->flags & IO_WORKER_F_RUNNING))
+ return;
+
+ worker->flags &= ~IO_WORKER_F_RUNNING;
+
+ spin_lock_irq(&wqe->lock);
+ io_wqe_dec_running(wqe, worker);
+ spin_unlock_irq(&wqe->lock);
+}
+
+static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+{
+ struct io_wqe_acct *acct =&wqe->acct[index];
+ struct io_worker *worker;
+
+ worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node);
+ if (!worker)
+ return;
+
+ refcount_set(&worker->ref, 1);
+ worker->nulls_node.pprev = NULL;
+ init_waitqueue_head(&worker->wait);
+ worker->wqe = wqe;
+ spin_lock_init(&worker->lock);
+
+ worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
+ "io_wqe_worker-%d/%d", index, wqe->node);
+ if (IS_ERR(worker->task)) {
+ kfree(worker);
+ return;
+ }
+
+ spin_lock_irq(&wqe->lock);
+ hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
+ list_add_tail_rcu(&worker->all_list, &wqe->all_list);
+ worker->flags |= IO_WORKER_F_FREE;
+ if (index == IO_WQ_ACCT_BOUND)
+ worker->flags |= IO_WORKER_F_BOUND;
+ if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
+ worker->flags |= IO_WORKER_F_FIXED;
+ acct->nr_workers++;
+ spin_unlock_irq(&wqe->lock);
+
+ if (index == IO_WQ_ACCT_UNBOUND)
+ atomic_inc(&wq->user->processes);
+
+ wake_up_process(worker->task);
+}
+
+static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
+ __must_hold(wqe->lock)
+{
+ struct io_wqe_acct *acct = &wqe->acct[index];
+
+ /* always ensure we have one bounded worker */
+ if (index == IO_WQ_ACCT_BOUND && !acct->nr_workers)
+ return true;
+ /* if we have available workers or no work, no need */
+ if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
+ return false;
+ return acct->nr_workers < acct->max_workers;
+}
+
+/*
+ * Manager thread. Tasked with creating new workers, if we need them.
+ */
+static int io_wq_manager(void *data)
+{
+ struct io_wq *wq = data;
+
+ while (!kthread_should_stop()) {
+ int i;
+
+ for (i = 0; i < wq->nr_wqes; i++) {
+ struct io_wqe *wqe = wq->wqes[i];
+ bool fork_worker[2] = { false, false };
+
+ spin_lock_irq(&wqe->lock);
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
+ fork_worker[IO_WQ_ACCT_BOUND] = true;
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
+ fork_worker[IO_WQ_ACCT_UNBOUND] = true;
+ spin_unlock_irq(&wqe->lock);
+ if (fork_worker[IO_WQ_ACCT_BOUND])
+ create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
+ if (fork_worker[IO_WQ_ACCT_UNBOUND])
+ create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
+ }
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+ }
+
+ return 0;
+}
+
+static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
+ struct io_wq_work *work)
+{
+ bool free_worker;
+
+ if (!(work->flags & IO_WQ_WORK_UNBOUND))
+ return true;
+ if (atomic_read(&acct->nr_running))
+ return true;
+
+ rcu_read_lock();
+ free_worker = !hlist_nulls_empty(&wqe->free_list);
+ rcu_read_unlock();
+ if (free_worker)
+ return true;
+
+ if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
+ !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
+ return false;
+
+ return true;
+}
+
+static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
+{
+ struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+ unsigned long flags;
+
+ /*
+ * Do early check to see if we need a new unbound worker, and if we do,
+ * if we're allowed to do so. This isn't 100% accurate as there's a
+ * gap between this check and incrementing the value, but that's OK.
+ * It's close enough to not be an issue, fork() has the same delay.
+ */
+ if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
+ work->flags |= IO_WQ_WORK_CANCEL;
+ work->func(&work);
+ return;
+ }
+
+ spin_lock_irqsave(&wqe->lock, flags);
+ list_add_tail(&work->list, &wqe->work_list);
+ wqe->flags &= ~IO_WQE_FLAG_STALLED;
+ spin_unlock_irqrestore(&wqe->lock, flags);
+
+ if (!atomic_read(&acct->nr_running))
+ io_wqe_wake_worker(wqe, acct);
+}
+
+void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
+{
+ struct io_wqe *wqe = wq->wqes[numa_node_id()];
+
+ io_wqe_enqueue(wqe, work);
+}
+
+/*
+ * Enqueue work, hashed by some key. Work items that hash to the same value
+ * will not be done in parallel. Used to limit concurrent writes, generally
+ * hashed by inode.
+ */
+void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val)
+{
+ struct io_wqe *wqe = wq->wqes[numa_node_id()];
+ unsigned bit;
+
+
+ bit = hash_ptr(val, IO_WQ_HASH_ORDER);
+ work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
+ io_wqe_enqueue(wqe, work);
+}
+
+static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
+{
+ send_sig(SIGINT, worker->task, 1);
+ return false;
+}
+
+/*
+ * Iterate the passed in list and call the specific function for each
+ * worker that isn't exiting
+ */
+static bool io_wq_for_each_worker(struct io_wqe *wqe,
+ bool (*func)(struct io_worker *, void *),
+ void *data)
+{
+ struct io_worker *worker;
+ bool ret = false;
+
+ list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
+ if (io_worker_get(worker)) {
+ ret = func(worker, data);
+ io_worker_release(worker);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+void io_wq_cancel_all(struct io_wq *wq)
+{
+ int i;
+
+ set_bit(IO_WQ_BIT_CANCEL, &wq->state);
+
+ /*
+ * Browse both lists, as there's a gap between handing work off
+ * to a worker and the worker putting itself on the busy_list
+ */
+ rcu_read_lock();
+ for (i = 0; i < wq->nr_wqes; i++) {
+ struct io_wqe *wqe = wq->wqes[i];
+
+ io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
+ }
+ rcu_read_unlock();
+}
+
+struct io_cb_cancel_data {
+ struct io_wqe *wqe;
+ work_cancel_fn *cancel;
+ void *caller_data;
+};
+
+static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
+{
+ struct io_cb_cancel_data *data = cancel_data;
+ unsigned long flags;
+ bool ret = false;
+
+ /*
+ * Hold the lock to avoid ->cur_work going out of scope, caller
+ * may dereference the passed in work.
+ */
+ spin_lock_irqsave(&worker->lock, flags);
+ if (worker->cur_work &&
+ data->cancel(worker->cur_work, data->caller_data)) {
+ send_sig(SIGINT, worker->task, 1);
+ ret = true;
+ }
+ spin_unlock_irqrestore(&worker->lock, flags);
+
+ return ret;
+}
+
+static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
+ work_cancel_fn *cancel,
+ void *cancel_data)
+{
+ struct io_cb_cancel_data data = {
+ .wqe = wqe,
+ .cancel = cancel,
+ .caller_data = cancel_data,
+ };
+ struct io_wq_work *work;
+ unsigned long flags;
+ bool found = false;
+
+ spin_lock_irqsave(&wqe->lock, flags);
+ list_for_each_entry(work, &wqe->work_list, list) {
+ if (cancel(work, cancel_data)) {
+ list_del(&work->list);
+ found = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&wqe->lock, flags);
+
+ if (found) {
+ work->flags |= IO_WQ_WORK_CANCEL;
+ work->func(&work);
+ return IO_WQ_CANCEL_OK;
+ }
+
+ rcu_read_lock();
+ found = io_wq_for_each_worker(wqe, io_work_cancel, &data);
+ rcu_read_unlock();
+ return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
+}
+
+enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
+ void *data)
+{
+ enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
+ int i;
+
+ for (i = 0; i < wq->nr_wqes; i++) {
+ struct io_wqe *wqe = wq->wqes[i];
+
+ ret = io_wqe_cancel_cb_work(wqe, cancel, data);
+ if (ret != IO_WQ_CANCEL_NOTFOUND)
+ break;
+ }
+
+ return ret;
+}
+
+static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
+{
+ struct io_wq_work *work = data;
+ unsigned long flags;
+ bool ret = false;
+
+ if (worker->cur_work != work)
+ return false;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ if (worker->cur_work == work) {
+ send_sig(SIGINT, worker->task, 1);
+ ret = true;
+ }
+ spin_unlock_irqrestore(&worker->lock, flags);
+
+ return ret;
+}
+
+static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
+ struct io_wq_work *cwork)
+{
+ struct io_wq_work *work;
+ unsigned long flags;
+ bool found = false;
+
+ cwork->flags |= IO_WQ_WORK_CANCEL;
+
+ /*
+ * First check pending list, if we're lucky we can just remove it
+ * from there. CANCEL_OK means that the work is returned as-new,
+ * no completion will be posted for it.
+ */
+ spin_lock_irqsave(&wqe->lock, flags);
+ list_for_each_entry(work, &wqe->work_list, list) {
+ if (work == cwork) {
+ list_del(&work->list);
+ found = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&wqe->lock, flags);
+
+ if (found) {
+ work->flags |= IO_WQ_WORK_CANCEL;
+ work->func(&work);
+ return IO_WQ_CANCEL_OK;
+ }
+
+ /*
+ * Now check if a free (going busy) or busy worker has the work
+ * currently running. If we find it there, we'll return CANCEL_RUNNING
+ * as an indication that we attempte to signal cancellation. The
+ * completion will run normally in this case.
+ */
+ rcu_read_lock();
+ found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, cwork);
+ rcu_read_unlock();
+ return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
+}
+
+enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
+{
+ enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
+ int i;
+
+ for (i = 0; i < wq->nr_wqes; i++) {
+ struct io_wqe *wqe = wq->wqes[i];
+
+ ret = io_wqe_cancel_work(wqe, cwork);
+ if (ret != IO_WQ_CANCEL_NOTFOUND)
+ break;
+ }
+
+ return ret;
+}
+
+struct io_wq_flush_data {
+ struct io_wq_work work;
+ struct completion done;
+};
+
+static void io_wq_flush_func(struct io_wq_work **workptr)
+{
+ struct io_wq_work *work = *workptr;
+ struct io_wq_flush_data *data;
+
+ data = container_of(work, struct io_wq_flush_data, work);
+ complete(&data->done);
+}
+
+/*
+ * Doesn't wait for previously queued work to finish. When this completes,
+ * it just means that previously queued work was started.
+ */
+void io_wq_flush(struct io_wq *wq)
+{
+ struct io_wq_flush_data data;
+ int i;
+
+ for (i = 0; i < wq->nr_wqes; i++) {
+ struct io_wqe *wqe = wq->wqes[i];
+
+ init_completion(&data.done);
+ INIT_IO_WORK(&data.work, io_wq_flush_func);
+ data.work.flags |= IO_WQ_WORK_INTERNAL;
+ io_wqe_enqueue(wqe, &data.work);
+ wait_for_completion(&data.done);
+ }
+}
+
+struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
+ struct user_struct *user, get_work_fn *get_work,
+ put_work_fn *put_work)
+{
+ int ret = -ENOMEM, i, node;
+ struct io_wq *wq;
+
+ wq = kcalloc(1, sizeof(*wq), GFP_KERNEL);
+ if (!wq)
+ return ERR_PTR(-ENOMEM);
+
+ wq->nr_wqes = num_online_nodes();
+ wq->wqes = kcalloc(wq->nr_wqes, sizeof(struct io_wqe *), GFP_KERNEL);
+ if (!wq->wqes) {
+ kfree(wq);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ wq->get_work = get_work;
+ wq->put_work = put_work;
+
+ /* caller must already hold a reference to this */
+ wq->user = user;
+
+ i = 0;
+ refcount_set(&wq->refs, wq->nr_wqes);
+ for_each_online_node(node) {
+ struct io_wqe *wqe;
+
+ wqe = kcalloc_node(1, sizeof(struct io_wqe), GFP_KERNEL, node);
+ if (!wqe)
+ break;
+ wq->wqes[i] = wqe;
+ wqe->node = node;
+ wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
+ atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
+ if (user) {
+ wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
+ task_rlimit(current, RLIMIT_NPROC);
+ }
+ atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
+ wqe->node = node;
+ wqe->wq = wq;
+ spin_lock_init(&wqe->lock);
+ INIT_LIST_HEAD(&wqe->work_list);
+ INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
+ INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1);
+ INIT_LIST_HEAD(&wqe->all_list);
+
+ i++;
+ }
+
+ init_completion(&wq->done);
+
+ if (i != wq->nr_wqes)
+ goto err;
+
+ /* caller must have already done mmgrab() on this mm */
+ wq->mm = mm;
+
+ wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
+ if (!IS_ERR(wq->manager)) {
+ wake_up_process(wq->manager);
+ return wq;
+ }
+
+ ret = PTR_ERR(wq->manager);
+ wq->manager = NULL;
+err:
+ complete(&wq->done);
+ io_wq_destroy(wq);
+ return ERR_PTR(ret);
+}
+
+static bool io_wq_worker_wake(struct io_worker *worker, void *data)
+{
+ wake_up_process(worker->task);
+ return false;
+}
+
+void io_wq_destroy(struct io_wq *wq)
+{
+ int i;
+
+ if (wq->manager) {
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ kthread_stop(wq->manager);
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < wq->nr_wqes; i++) {
+ struct io_wqe *wqe = wq->wqes[i];
+
+ if (!wqe)
+ continue;
+ io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
+ }
+ rcu_read_unlock();
+
+ wait_for_completion(&wq->done);
+
+ for (i = 0; i < wq->nr_wqes; i++)
+ kfree(wq->wqes[i]);
+ kfree(wq->wqes);
+ kfree(wq);
+}
diff --git a/fs/io-wq.h b/fs/io-wq.h
new file mode 100644
index 000000000000..4b29f922f80c
--- /dev/null
+++ b/fs/io-wq.h
@@ -0,0 +1,74 @@
+#ifndef INTERNAL_IO_WQ_H
+#define INTERNAL_IO_WQ_H
+
+struct io_wq;
+
+enum {
+ IO_WQ_WORK_CANCEL = 1,
+ IO_WQ_WORK_HAS_MM = 2,
+ IO_WQ_WORK_HASHED = 4,
+ IO_WQ_WORK_NEEDS_USER = 8,
+ IO_WQ_WORK_NEEDS_FILES = 16,
+ IO_WQ_WORK_UNBOUND = 32,
+ IO_WQ_WORK_INTERNAL = 64,
+
+ IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
+};
+
+enum io_wq_cancel {
+ IO_WQ_CANCEL_OK, /* cancelled before started */
+ IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */
+ IO_WQ_CANCEL_NOTFOUND, /* work not found */
+};
+
+struct io_wq_work {
+ struct list_head list;
+ void (*func)(struct io_wq_work **);
+ unsigned flags;
+ struct files_struct *files;
+};
+
+#define INIT_IO_WORK(work, _func) \
+ do { \
+ (work)->func = _func; \
+ (work)->flags = 0; \
+ (work)->files = NULL; \
+ } while (0) \
+
+typedef void (get_work_fn)(struct io_wq_work *);
+typedef void (put_work_fn)(struct io_wq_work *);
+
+struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
+ struct user_struct *user,
+ get_work_fn *get_work, put_work_fn *put_work);
+void io_wq_destroy(struct io_wq *wq);
+
+void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
+void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
+void io_wq_flush(struct io_wq *wq);
+
+void io_wq_cancel_all(struct io_wq *wq);
+enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
+
+typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
+
+enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
+ void *data);
+
+#if defined(CONFIG_IO_WQ)
+extern void io_wq_worker_sleeping(struct task_struct *);
+extern void io_wq_worker_running(struct task_struct *);
+#else
+static inline void io_wq_worker_sleeping(struct task_struct *tsk)
+{
+}
+static inline void io_wq_worker_running(struct task_struct *tsk)
+{
+}
+#endif
+
+static inline bool io_wq_current_is_worker(void)
+{
+ return in_task() && (current->flags & PF_IO_WORKER);
+}
+#endif
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8a0381f1a43b..4c030a92de79 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -56,7 +56,6 @@
#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
-#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
@@ -71,12 +70,24 @@
#include <linux/sizes.h>
#include <linux/hugetlb.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/io_uring.h>
+
#include <uapi/linux/io_uring.h>
#include "internal.h"
+#include "io-wq.h"
#define IORING_MAX_ENTRIES 32768
-#define IORING_MAX_FIXED_FILES 1024
+#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
+
+/*
+ * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
+ */
+#define IORING_FILE_TABLE_SHIFT 9
+#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
+#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
+#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
@@ -161,14 +172,8 @@ struct io_mapped_ubuf {
unsigned int nr_bvecs;
};
-struct async_list {
- spinlock_t lock;
- atomic_t cnt;
- struct list_head list;
-
- struct file *file;
- off_t io_start;
- size_t io_len;
+struct fixed_file_table {
+ struct file **files;
};
struct io_ring_ctx {
@@ -180,6 +185,7 @@ struct io_ring_ctx {
unsigned int flags;
bool compat;
bool account_mem;
+ bool cq_overflow_flushed;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -197,37 +203,31 @@ struct io_ring_ctx {
unsigned sq_entries;
unsigned sq_mask;
unsigned sq_thread_idle;
+ unsigned cached_sq_dropped;
+ atomic_t cached_cq_overflow;
struct io_uring_sqe *sq_sqes;
struct list_head defer_list;
struct list_head timeout_list;
+ struct list_head cq_overflow_list;
+
+ wait_queue_head_t inflight_wait;
} ____cacheline_aligned_in_smp;
+ struct io_rings *rings;
+
/* IO offload */
- struct workqueue_struct *sqo_wq[2];
+ struct io_wq *io_wq;
struct task_struct *sqo_thread; /* if using sq thread polling */
struct mm_struct *sqo_mm;
wait_queue_head_t sqo_wait;
- struct completion sqo_thread_started;
-
- struct {
- unsigned cached_cq_tail;
- unsigned cq_entries;
- unsigned cq_mask;
- struct wait_queue_head cq_wait;
- struct fasync_struct *cq_fasync;
- struct eventfd_ctx *cq_ev_fd;
- atomic_t cq_timeouts;
- } ____cacheline_aligned_in_smp;
-
- struct io_rings *rings;
/*
* If used, fixed file set. Writers must ensure that ->refs is dead,
* readers must ensure that ->refs is alive as long as the file* is
* used. Only updated through io_uring_register(2).
*/
- struct file **user_files;
+ struct fixed_file_table *file_table;
unsigned nr_user_files;
/* if used, fixed mapped user buffers */
@@ -236,7 +236,25 @@ struct io_ring_ctx {
struct user_struct *user;
- struct completion ctx_done;
+ /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
+ struct completion *completions;
+
+ /* if all else fails... */
+ struct io_kiocb *fallback_req;
+
+#if defined(CONFIG_UNIX)
+ struct socket *ring_sock;
+#endif
+
+ struct {
+ unsigned cached_cq_tail;
+ unsigned cq_entries;
+ unsigned cq_mask;
+ atomic_t cq_timeouts;
+ struct wait_queue_head cq_wait;
+ struct fasync_struct *cq_fasync;
+ struct eventfd_ctx *cq_ev_fd;
+ } ____cacheline_aligned_in_smp;
struct {
struct mutex uring_lock;
@@ -253,22 +271,20 @@ struct io_ring_ctx {
* manipulate the list, hence no extra locking is needed there.
*/
struct list_head poll_list;
- struct list_head cancel_list;
- } ____cacheline_aligned_in_smp;
+ struct rb_root cancel_tree;
- struct async_list pending_async[2];
-
-#if defined(CONFIG_UNIX)
- struct socket *ring_sock;
-#endif
+ spinlock_t inflight_lock;
+ struct list_head inflight_list;
+ } ____cacheline_aligned_in_smp;
};
struct sqe_submit {
const struct io_uring_sqe *sqe;
- unsigned short index;
+ struct file *ring_file;
+ int ring_fd;
u32 sequence;
bool has_user;
- bool needs_lock;
+ bool in_async;
bool needs_fixed_file;
};
@@ -307,7 +323,10 @@ struct io_kiocb {
struct sqe_submit submit;
struct io_ring_ctx *ctx;
- struct list_head list;
+ union {
+ struct list_head list;
+ struct rb_node rb_node;
+ };
struct list_head link_list;
unsigned int flags;
refcount_t refs;
@@ -318,15 +337,22 @@ struct io_kiocb {
#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
#define REQ_F_IO_DRAINED 32 /* drain done */
#define REQ_F_LINK 64 /* linked sqes */
-#define REQ_F_LINK_DONE 128 /* linked sqes done */
+#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */
#define REQ_F_FAIL_LINK 256 /* fail rest of links */
#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */
#define REQ_F_TIMEOUT 1024 /* timeout request */
+#define REQ_F_ISREG 2048 /* regular file */
+#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
+#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
+#define REQ_F_INFLIGHT 16384 /* on inflight list */
+#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
u64 user_data;
u32 result;
u32 sequence;
- struct work_struct work;
+ struct list_head inflight_entry;
+
+ struct io_wq_work work;
};
#define IO_PLUG_THRESHOLD 2
@@ -352,10 +378,11 @@ struct io_submit_state {
unsigned int ios_left;
};
-static void io_sq_wq_submit_work(struct work_struct *work);
-static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
- long res);
+static void io_wq_submit_work(struct io_wq_work **workptr);
+static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void __io_free_req(struct io_kiocb *req);
+static void io_put_req(struct io_kiocb *req);
+static void io_double_put_req(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
@@ -378,64 +405,75 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
- complete(&ctx->ctx_done);
+ complete(&ctx->completions[0]);
}
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int i;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return NULL;
+ ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
+ if (!ctx->fallback_req)
+ goto err;
+
+ ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
+ if (!ctx->completions)
+ goto err;
+
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
- kfree(ctx);
- return NULL;
- }
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+ goto err;
ctx->flags = p->flags;
init_waitqueue_head(&ctx->cq_wait);
- init_completion(&ctx->ctx_done);
- init_completion(&ctx->sqo_thread_started);
+ INIT_LIST_HEAD(&ctx->cq_overflow_list);
+ init_completion(&ctx->completions[0]);
+ init_completion(&ctx->completions[1]);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
- for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
- spin_lock_init(&ctx->pending_async[i].lock);
- INIT_LIST_HEAD(&ctx->pending_async[i].list);
- atomic_set(&ctx->pending_async[i].cnt, 0);
- }
spin_lock_init(&ctx->completion_lock);
INIT_LIST_HEAD(&ctx->poll_list);
- INIT_LIST_HEAD(&ctx->cancel_list);
+ ctx->cancel_tree = RB_ROOT;
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
+ init_waitqueue_head(&ctx->inflight_wait);
+ spin_lock_init(&ctx->inflight_lock);
+ INIT_LIST_HEAD(&ctx->inflight_list);
return ctx;
+err:
+ if (ctx->fallback_req)
+ kmem_cache_free(req_cachep, ctx->fallback_req);
+ kfree(ctx->completions);
+ kfree(ctx);
+ return NULL;
}
-static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static inline bool __req_need_defer(struct io_kiocb *req)
{
- /* timeout requests always honor sequence */
- if (!(req->flags & REQ_F_TIMEOUT) &&
- (req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
- return false;
+ struct io_ring_ctx *ctx = req->ctx;
- return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped;
+ return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
+ + atomic_read(&ctx->cached_cq_overflow);
}
-static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx,
- struct list_head *list)
+static inline bool req_need_defer(struct io_kiocb *req)
{
- struct io_kiocb *req;
+ if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
+ return __req_need_defer(req);
- if (list_empty(list))
- return NULL;
+ return false;
+}
+
+static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
+{
+ struct io_kiocb *req;
- req = list_first_entry(list, struct io_kiocb, list);
- if (!io_sequence_defer(ctx, req)) {
+ req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
+ if (req && !req_need_defer(req)) {
list_del_init(&req->list);
return req;
}
@@ -443,14 +481,21 @@ static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx,
return NULL;
}
-static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
-{
- return __io_get_deferred_req(ctx, &ctx->defer_list);
-}
-
static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
{
- return __io_get_deferred_req(ctx, &ctx->timeout_list);
+ struct io_kiocb *req;
+
+ req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
+ if (req) {
+ if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+ return NULL;
+ if (!__req_need_defer(req)) {
+ list_del_init(&req->list);
+ return req;
+ }
+ }
+
+ return NULL;
}
static void __io_commit_cqring(struct io_ring_ctx *ctx)
@@ -468,21 +513,59 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-static inline void io_queue_async_work(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
{
- int rw = 0;
+ u8 opcode = READ_ONCE(sqe->opcode);
+
+ return !(opcode == IORING_OP_READ_FIXED ||
+ opcode == IORING_OP_WRITE_FIXED);
+}
+
+static inline bool io_prep_async_work(struct io_kiocb *req)
+{
+ bool do_hashed = false;
if (req->submit.sqe) {
switch (req->submit.sqe->opcode) {
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
- rw = !(req->rw.ki_flags & IOCB_DIRECT);
+ do_hashed = true;
+ /* fall-through */
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_SENDMSG:
+ case IORING_OP_RECVMSG:
+ case IORING_OP_ACCEPT:
+ case IORING_OP_POLL_ADD:
+ /*
+ * We know REQ_F_ISREG is not set on some of these
+ * opcodes, but this enables us to keep the check in
+ * just one place.
+ */
+ if (!(req->flags & REQ_F_ISREG))
+ req->work.flags |= IO_WQ_WORK_UNBOUND;
break;
}
+ if (io_sqe_needs_user(req->submit.sqe))
+ req->work.flags |= IO_WQ_WORK_NEEDS_USER;
}
- queue_work(ctx->sqo_wq[rw], &req->work);
+ return do_hashed;
+}
+
+static inline void io_queue_async_work(struct io_kiocb *req)
+{
+ bool do_hashed = io_prep_async_work(req);
+ struct io_ring_ctx *ctx = req->ctx;
+
+ trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
+ req->flags);
+ if (!do_hashed) {
+ io_wq_enqueue(ctx->io_wq, &req->work);
+ } else {
+ io_wq_enqueue_hashed(ctx->io_wq, &req->work,
+ file_inode(req->file));
+ }
}
static void io_kill_timeout(struct io_kiocb *req)
@@ -492,9 +575,9 @@ static void io_kill_timeout(struct io_kiocb *req)
ret = hrtimer_try_to_cancel(&req->timeout.timer);
if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts);
- list_del(&req->list);
- io_cqring_fill_event(req->ctx, req->user_data, 0);
- __io_free_req(req);
+ list_del_init(&req->list);
+ io_cqring_fill_event(req, 0);
+ io_put_req(req);
}
}
@@ -524,7 +607,7 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
continue;
}
req->flags |= REQ_F_IO_DRAINED;
- io_queue_async_work(ctx, req);
+ io_queue_async_work(req);
}
}
@@ -546,57 +629,122 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
return &rings->cqes[tail & ctx->cq_mask];
}
-static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
- long res)
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->sqo_wait))
+ wake_up(&ctx->sqo_wait);
+ if (ctx->cq_ev_fd)
+ eventfd_signal(ctx->cq_ev_fd, 1);
+}
+
+static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+{
+ struct io_rings *rings = ctx->rings;
+ struct io_uring_cqe *cqe;
+ struct io_kiocb *req;
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ if (!force) {
+ if (list_empty_careful(&ctx->cq_overflow_list))
+ return;
+ if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
+ rings->cq_ring_entries))
+ return;
+ }
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+
+ /* if force is set, the ring is going away. always drop after that */
+ if (force)
+ ctx->cq_overflow_flushed = true;
+
+ while (!list_empty(&ctx->cq_overflow_list)) {
+ cqe = io_get_cqring(ctx);
+ if (!cqe && !force)
+ break;
+
+ req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
+ list);
+ list_move(&req->list, &list);
+ if (cqe) {
+ WRITE_ONCE(cqe->user_data, req->user_data);
+ WRITE_ONCE(cqe->res, req->result);
+ WRITE_ONCE(cqe->flags, 0);
+ } else {
+ WRITE_ONCE(ctx->rings->cq_overflow,
+ atomic_inc_return(&ctx->cached_cq_overflow));
+ }
+ }
+
+ io_commit_cqring(ctx);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ io_cqring_ev_posted(ctx);
+
+ while (!list_empty(&list)) {
+ req = list_first_entry(&list, struct io_kiocb, list);
+ list_del(&req->list);
+ io_put_req(req);
+ }
+}
+
+static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
+ struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cqe *cqe;
+ trace_io_uring_complete(ctx, req->user_data, res);
+
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
cqe = io_get_cqring(ctx);
- if (cqe) {
- WRITE_ONCE(cqe->user_data, ki_user_data);
+ if (likely(cqe)) {
+ WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, 0);
+ } else if (ctx->cq_overflow_flushed) {
+ WRITE_ONCE(ctx->rings->cq_overflow,
+ atomic_inc_return(&ctx->cached_cq_overflow));
} else {
- unsigned overflow = READ_ONCE(ctx->rings->cq_overflow);
-
- WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1);
+ refcount_inc(&req->refs);
+ req->result = res;
+ list_add_tail(&req->list, &ctx->cq_overflow_list);
}
}
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
-{
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
- if (waitqueue_active(&ctx->sqo_wait))
- wake_up(&ctx->sqo_wait);
- if (ctx->cq_ev_fd)
- eventfd_signal(ctx->cq_ev_fd, 1);
-}
-
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
- long res)
+static void io_cqring_add_event(struct io_kiocb *req, long res)
{
+ struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- io_cqring_fill_event(ctx, user_data, res);
+ io_cqring_fill_event(req, res);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
}
-static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
+static inline bool io_is_fallback_req(struct io_kiocb *req)
{
- percpu_ref_put_many(&ctx->refs, refs);
+ return req == (struct io_kiocb *)
+ ((unsigned long) req->ctx->fallback_req & ~1UL);
+}
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
+{
+ struct io_kiocb *req;
+
+ req = ctx->fallback_req;
+ if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
+ return req;
+
+ return NULL;
}
static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
@@ -611,7 +759,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
if (!state) {
req = kmem_cache_alloc(req_cachep, gfp);
if (unlikely(!req))
- goto out;
+ goto fallback;
} else if (!state->free_reqs) {
size_t sz;
int ret;
@@ -626,7 +774,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
if (unlikely(ret <= 0)) {
state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
if (!state->reqs[0])
- goto out;
+ goto fallback;
ret = 1;
}
state->free_reqs = ret - 1;
@@ -638,15 +786,20 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
state->cur_req++;
}
+got_it:
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
req->result = 0;
+ INIT_IO_WORK(&req->work, io_wq_submit_work);
return req;
-out:
- io_ring_drop_ctx_refs(ctx, 1);
+fallback:
+ req = io_get_fallback_req(ctx);
+ if (req)
+ goto got_it;
+ percpu_ref_put(&ctx->refs);
return NULL;
}
@@ -654,22 +807,55 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
{
if (*nr) {
kmem_cache_free_bulk(req_cachep, *nr, reqs);
- io_ring_drop_ctx_refs(ctx, *nr);
+ percpu_ref_put_many(&ctx->refs, *nr);
*nr = 0;
}
}
static void __io_free_req(struct io_kiocb *req)
{
+ struct io_ring_ctx *ctx = req->ctx;
+
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
- io_ring_drop_ctx_refs(req->ctx, 1);
- kmem_cache_free(req_cachep, req);
+ if (req->flags & REQ_F_INFLIGHT) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->inflight_lock, flags);
+ list_del(&req->inflight_entry);
+ if (waitqueue_active(&ctx->inflight_wait))
+ wake_up(&ctx->inflight_wait);
+ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ }
+ percpu_ref_put(&ctx->refs);
+ if (likely(!io_is_fallback_req(req)))
+ kmem_cache_free(req_cachep, req);
+ else
+ clear_bit_unlock(0, (unsigned long *) ctx->fallback_req);
+}
+
+static bool io_link_cancel_timeout(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ ret = hrtimer_try_to_cancel(&req->timeout.timer);
+ if (ret != -1) {
+ io_cqring_fill_event(req, -ECANCELED);
+ io_commit_cqring(ctx);
+ req->flags &= ~REQ_F_LINK;
+ io_put_req(req);
+ return true;
+ }
+
+ return false;
}
-static void io_req_link_next(struct io_kiocb *req)
+static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
+ struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt;
+ bool wake_ev = false;
/*
* The list should never be empty when we are called here. But could
@@ -677,18 +863,35 @@ static void io_req_link_next(struct io_kiocb *req)
* safe side.
*/
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
- if (nxt) {
- list_del(&nxt->list);
+ while (nxt) {
+ list_del_init(&nxt->list);
if (!list_empty(&req->link_list)) {
INIT_LIST_HEAD(&nxt->link_list);
list_splice(&req->link_list, &nxt->link_list);
nxt->flags |= REQ_F_LINK;
}
- nxt->flags |= REQ_F_LINK_DONE;
- INIT_WORK(&nxt->work, io_sq_wq_submit_work);
- io_queue_async_work(req->ctx, nxt);
+ /*
+ * If we're in async work, we can continue processing the chain
+ * in this context instead of having to queue up new async work.
+ */
+ if (req->flags & REQ_F_LINK_TIMEOUT) {
+ wake_ev = io_link_cancel_timeout(nxt);
+
+ /* we dropped this link, get next */
+ nxt = list_first_entry_or_null(&req->link_list,
+ struct io_kiocb, list);
+ } else if (nxtptr && io_wq_current_is_worker()) {
+ *nxtptr = nxt;
+ break;
+ } else {
+ io_queue_async_work(nxt);
+ break;
+ }
}
+
+ if (wake_ev)
+ io_cqring_ev_posted(ctx);
}
/*
@@ -696,48 +899,131 @@ static void io_req_link_next(struct io_kiocb *req)
*/
static void io_fail_links(struct io_kiocb *req)
{
+ struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&req->link_list)) {
link = list_first_entry(&req->link_list, struct io_kiocb, list);
- list_del(&link->list);
+ list_del_init(&link->list);
+
+ trace_io_uring_fail_link(req, link);
- io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
- __io_free_req(link);
+ if ((req->flags & REQ_F_LINK_TIMEOUT) &&
+ link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
+ io_link_cancel_timeout(link);
+ } else {
+ io_cqring_fill_event(link, -ECANCELED);
+ io_double_put_req(link);
+ }
}
+
+ io_commit_cqring(ctx);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ io_cqring_ev_posted(ctx);
}
-static void io_free_req(struct io_kiocb *req)
+static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
{
+ if (likely(!(req->flags & REQ_F_LINK))) {
+ __io_free_req(req);
+ return;
+ }
+
/*
* If LINK is set, we have dependent requests in this chain. If we
* didn't fail this request, queue the first one up, moving any other
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & REQ_F_LINK) {
- if (req->flags & REQ_F_FAIL_LINK)
- io_fail_links(req);
- else
- io_req_link_next(req);
+ if (req->flags & REQ_F_FAIL_LINK) {
+ io_fail_links(req);
+ } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
+ REQ_F_LINK_TIMEOUT) {
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
+
+ /*
+ * If this is a timeout link, we could be racing with the
+ * timeout timer. Grab the completion lock for this case to
+ * protect against that.
+ */
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ io_req_link_next(req, nxt);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ io_req_link_next(req, nxt);
}
__io_free_req(req);
}
+static void io_free_req(struct io_kiocb *req)
+{
+ io_free_req_find_next(req, NULL);
+}
+
+/*
+ * Drop reference to request, return next in chain (if there is one) if this
+ * was the last reference to this request.
+ */
+static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+{
+ struct io_kiocb *nxt = NULL;
+
+ if (refcount_dec_and_test(&req->refs))
+ io_free_req_find_next(req, &nxt);
+
+ if (nxt) {
+ if (nxtptr)
+ *nxtptr = nxt;
+ else
+ io_queue_async_work(nxt);
+ }
+}
+
static void io_put_req(struct io_kiocb *req)
{
if (refcount_dec_and_test(&req->refs))
io_free_req(req);
}
-static unsigned io_cqring_events(struct io_rings *rings)
+static void io_double_put_req(struct io_kiocb *req)
+{
+ /* drop both submit and complete references */
+ if (refcount_sub_and_test(2, &req->refs))
+ __io_free_req(req);
+}
+
+static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
{
+ struct io_rings *rings = ctx->rings;
+
+ /*
+ * noflush == true is from the waitqueue handler, just ensure we wake
+ * up the task, and the next invocation will flush the entries. We
+ * cannot safely to it from here.
+ */
+ if (noflush && !list_empty(&ctx->cq_overflow_list))
+ return -1U;
+
+ io_cqring_overflow_flush(ctx, false);
+
/* See comment at the top of this file */
smp_rmb();
return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
}
+static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+{
+ struct io_rings *rings = ctx->rings;
+
+ /* make sure SQ entry isn't read before tail */
+ return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
+}
+
/*
* Find and free completed poll iocbs
*/
@@ -753,7 +1039,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
req = list_first_entry(done, struct io_kiocb, list);
list_del(&req->list);
- io_cqring_fill_event(ctx, req->user_data, req->result);
+ io_cqring_fill_event(req, req->result);
(*nr_events)++;
if (refcount_dec_and_test(&req->refs)) {
@@ -762,8 +1048,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
* completions for those, only batch free for fixed
* file and non-linked commands.
*/
- if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
- REQ_F_FIXED_FILE) {
+ if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+ REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) {
reqs[to_free++] = req;
if (to_free == ARRAY_SIZE(reqs))
io_free_req_many(ctx, reqs, &to_free);
@@ -867,19 +1153,11 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
- long min)
+static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+ long min)
{
- int iters, ret = 0;
+ int iters = 0, ret = 0;
- /*
- * We disallow the app entering submit/complete with polling, but we
- * still need to lock the ring to prevent racing with polled issue
- * that got punted to a workqueue.
- */
- mutex_lock(&ctx->uring_lock);
-
- iters = 0;
do {
int tmin = 0;
@@ -888,7 +1166,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
* If we do, we can potentially be spinning for commands that
* already triggered a CQE (eg in error).
*/
- if (io_cqring_events(ctx->rings))
+ if (io_cqring_events(ctx, false))
break;
/*
@@ -915,42 +1193,76 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
ret = 0;
} while (min && !*nr_events && !need_resched());
+ return ret;
+}
+
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+ long min)
+{
+ int ret;
+
+ /*
+ * We disallow the app entering submit/complete with polling, but we
+ * still need to lock the ring to prevent racing with polled issue
+ * that got punted to a workqueue.
+ */
+ mutex_lock(&ctx->uring_lock);
+ ret = __io_iopoll_check(ctx, nr_events, min);
mutex_unlock(&ctx->uring_lock);
return ret;
}
-static void kiocb_end_write(struct kiocb *kiocb)
+static void kiocb_end_write(struct io_kiocb *req)
{
- if (kiocb->ki_flags & IOCB_WRITE) {
- struct inode *inode = file_inode(kiocb->ki_filp);
+ /*
+ * Tell lockdep we inherited freeze protection from submission
+ * thread.
+ */
+ if (req->flags & REQ_F_ISREG) {
+ struct inode *inode = file_inode(req->file);
- /*
- * Tell lockdep we inherited freeze protection from submission
- * thread.
- */
- if (S_ISREG(inode->i_mode))
- __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
- file_end_write(kiocb->ki_filp);
+ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
}
+ file_end_write(req->file);
}
-static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw_common(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
- kiocb_end_write(kiocb);
+ if (kiocb->ki_flags & IOCB_WRITE)
+ kiocb_end_write(req);
if ((req->flags & REQ_F_LINK) && res != req->result)
req->flags |= REQ_F_FAIL_LINK;
- io_cqring_add_event(req->ctx, req->user_data, res);
+ io_cqring_add_event(req, res);
+}
+
+static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+ io_complete_rw_common(kiocb, res);
io_put_req(req);
}
+static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
+{
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+ struct io_kiocb *nxt = NULL;
+
+ io_complete_rw_common(kiocb, res);
+ io_put_req_find_next(req, &nxt);
+
+ return nxt;
+}
+
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
- kiocb_end_write(kiocb);
+ if (kiocb->ki_flags & IOCB_WRITE)
+ kiocb_end_write(req);
if ((req->flags & REQ_F_LINK) && res != req->result)
req->flags |= REQ_F_FAIL_LINK;
@@ -1052,10 +1364,9 @@ static bool io_file_supports_async(struct file *file)
return false;
}
-static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
- bool force_nonblock)
+static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
{
- const struct io_uring_sqe *sqe = s->sqe;
+ const struct io_uring_sqe *sqe = req->submit.sqe;
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw;
unsigned ioprio;
@@ -1064,8 +1375,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
if (!req->file)
return -EBADF;
- if (force_nonblock && !io_file_supports_async(req->file))
- force_nonblock = false;
+ if (S_ISREG(file_inode(req->file)->i_mode))
+ req->flags |= REQ_F_ISREG;
+
+ /*
+ * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
+ * we know to async punt it even if it was opened O_NONBLOCK
+ */
+ if (force_nonblock && !io_file_supports_async(req->file)) {
+ req->flags |= REQ_F_MUST_PUNT;
+ return -EAGAIN;
+ }
kiocb->ki_pos = READ_ONCE(sqe->off);
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
@@ -1086,7 +1406,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
return ret;
/* don't allow async punt if RWF_NOWAIT was requested */
- if (kiocb->ki_flags & IOCB_NOWAIT)
+ if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+ (req->file->f_flags & O_NONBLOCK))
req->flags |= REQ_F_NOWAIT;
if (force_nonblock)
@@ -1099,6 +1420,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
+ req->result = 0;
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
@@ -1128,6 +1450,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
+static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
+ bool in_async)
+{
+ if (in_async && ret >= 0 && nxt && kiocb->ki_complete == io_complete_rw)
+ *nxt = __io_complete_rw(kiocb, ret);
+ else
+ io_rw_done(kiocb, ret);
+}
+
static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
const struct io_uring_sqe *sqe,
struct iov_iter *iter)
@@ -1199,7 +1530,7 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
}
}
- return 0;
+ return len;
}
static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
@@ -1239,65 +1570,6 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
}
-static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb)
-{
- if (al->file == kiocb->ki_filp) {
- off_t start, end;
-
- /*
- * Allow merging if we're anywhere in the range of the same
- * page. Generally this happens for sub-page reads or writes,
- * and it's beneficial to allow the first worker to bring the
- * page in and the piggy backed work can then work on the
- * cached page.
- */
- start = al->io_start & PAGE_MASK;
- end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK;
- if (kiocb->ki_pos >= start && kiocb->ki_pos <= end)
- return true;
- }
-
- al->file = NULL;
- return false;
-}
-
-/*
- * Make a note of the last file/offset/direction we punted to async
- * context. We'll use this information to see if we can piggy back a
- * sequential request onto the previous one, if it's still hasn't been
- * completed by the async worker.
- */
-static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
-{
- struct async_list *async_list = &req->ctx->pending_async[rw];
- struct kiocb *kiocb = &req->rw;
- struct file *filp = kiocb->ki_filp;
-
- if (io_should_merge(async_list, kiocb)) {
- unsigned long max_bytes;
-
- /* Use 8x RA size as a decent limiter for both reads/writes */
- max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
- if (!max_bytes)
- max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
-
- /* If max len are exceeded, reset the state */
- if (async_list->io_len + len <= max_bytes) {
- req->flags |= REQ_F_SEQ_PREV;
- async_list->io_len += len;
- } else {
- async_list->file = NULL;
- }
- }
-
- /* New file? Reset state. */
- if (async_list->file != filp) {
- async_list->io_start = kiocb->ki_pos;
- async_list->io_len = len;
- async_list->file = filp;
- }
-}
-
/*
* For files that don't have ->read_iter() and ->write_iter(), handle them
* by looping over ->read() or ->write() manually.
@@ -1343,7 +1615,7 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return ret;
}
-static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
+static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -1353,7 +1625,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
size_t iov_count;
ssize_t read_size, ret;
- ret = io_prep_rw(req, s, force_nonblock);
+ ret = io_prep_rw(req, force_nonblock);
if (ret)
return ret;
file = kiocb->ki_filp;
@@ -1361,7 +1633,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
if (unlikely(!(file->f_mode & FMODE_READ)))
return -EBADF;
- ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
+ ret = io_import_iovec(req->ctx, READ, &req->submit, &iovec, &iter);
if (ret < 0)
return ret;
@@ -1387,26 +1659,21 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
* need async punt anyway, so it's more efficient to do it
* here.
*/
- if (force_nonblock && ret2 > 0 && ret2 < read_size)
+ if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
+ (req->flags & REQ_F_ISREG) &&
+ ret2 > 0 && ret2 < read_size)
ret2 = -EAGAIN;
/* Catch -EAGAIN return for forced non-blocking submission */
- if (!force_nonblock || ret2 != -EAGAIN) {
- io_rw_done(kiocb, ret2);
- } else {
- /*
- * If ->needs_lock is true, we're already in async
- * context.
- */
- if (!s->needs_lock)
- io_async_list_note(READ, req, iov_count);
+ if (!force_nonblock || ret2 != -EAGAIN)
+ kiocb_done(kiocb, ret2, nxt, req->submit.in_async);
+ else
ret = -EAGAIN;
- }
}
kfree(iovec);
return ret;
}
-static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
+static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -1416,7 +1683,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
size_t iov_count;
ssize_t ret;
- ret = io_prep_rw(req, s, force_nonblock);
+ ret = io_prep_rw(req, force_nonblock);
if (ret)
return ret;
@@ -1424,7 +1691,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
if (unlikely(!(file->f_mode & FMODE_WRITE)))
return -EBADF;
- ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
+ ret = io_import_iovec(req->ctx, WRITE, &req->submit, &iovec, &iter);
if (ret < 0)
return ret;
@@ -1434,12 +1701,8 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
iov_count = iov_iter_count(&iter);
ret = -EAGAIN;
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
- /* If ->needs_lock is true, we're already in async context. */
- if (!s->needs_lock)
- io_async_list_note(WRITE, req, iov_count);
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
goto out_free;
- }
ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
if (!ret) {
@@ -1452,7 +1715,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
* released so that it doesn't complain about the held lock when
* we return to userspace.
*/
- if (S_ISREG(file_inode(file)->i_mode)) {
+ if (req->flags & REQ_F_ISREG) {
__sb_start_write(file_inode(file)->i_sb,
SB_FREEZE_WRITE, true);
__sb_writers_release(file_inode(file)->i_sb,
@@ -1464,17 +1727,10 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
ret2 = call_write_iter(file, kiocb, &iter);
else
ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
- if (!force_nonblock || ret2 != -EAGAIN) {
- io_rw_done(kiocb, ret2);
- } else {
- /*
- * If ->needs_lock is true, we're already in async
- * context.
- */
- if (!s->needs_lock)
- io_async_list_note(WRITE, req, iov_count);
+ if (!force_nonblock || ret2 != -EAGAIN)
+ kiocb_done(kiocb, ret2, nxt, req->submit.in_async);
+ else
ret = -EAGAIN;
- }
}
out_free:
kfree(iovec);
@@ -1484,15 +1740,14 @@ out_free:
/*
* IORING_OP_NOP just posts a completion event, nothing else.
*/
-static int io_nop(struct io_kiocb *req, u64 user_data)
+static int io_nop(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- long err = 0;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- io_cqring_add_event(ctx, user_data, err);
+ io_cqring_add_event(req, 0);
io_put_req(req);
return 0;
}
@@ -1513,7 +1768,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+ struct io_kiocb **nxt, bool force_nonblock)
{
loff_t sqe_off = READ_ONCE(sqe->off);
loff_t sqe_len = READ_ONCE(sqe->len);
@@ -1539,8 +1794,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0 && (req->flags & REQ_F_LINK))
req->flags |= REQ_F_FAIL_LINK;
- io_cqring_add_event(req->ctx, sqe->user_data, ret);
- io_put_req(req);
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
return 0;
}
@@ -1562,6 +1817,7 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static int io_sync_file_range(struct io_kiocb *req,
const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt,
bool force_nonblock)
{
loff_t sqe_off;
@@ -1585,14 +1841,14 @@ static int io_sync_file_range(struct io_kiocb *req,
if (ret < 0 && (req->flags & REQ_F_LINK))
req->flags |= REQ_F_FAIL_LINK;
- io_cqring_add_event(req->ctx, sqe->user_data, ret);
- io_put_req(req);
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
return 0;
}
#if defined(CONFIG_NET)
static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock,
+ struct io_kiocb **nxt, bool force_nonblock,
long (*fn)(struct socket *, struct user_msghdr __user *,
unsigned int))
{
@@ -1621,32 +1877,80 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return ret;
}
- io_cqring_add_event(req->ctx, sqe->user_data, ret);
- io_put_req(req);
+ io_cqring_add_event(req, ret);
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
+ io_put_req_find_next(req, nxt);
return 0;
}
#endif
static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+ struct io_kiocb **nxt, bool force_nonblock)
{
#if defined(CONFIG_NET)
- return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
+ return io_send_recvmsg(req, sqe, nxt, force_nonblock,
+ __sys_sendmsg_sock);
#else
return -EOPNOTSUPP;
#endif
}
static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+ struct io_kiocb **nxt, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ return io_send_recvmsg(req, sqe, nxt, force_nonblock,
+ __sys_recvmsg_sock);
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt, bool force_nonblock)
{
#if defined(CONFIG_NET)
- return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
+ struct sockaddr __user *addr;
+ int __user *addr_len;
+ unsigned file_flags;
+ int flags, ret;
+
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ return -EINVAL;
+
+ addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
+ addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2);
+ flags = READ_ONCE(sqe->accept_flags);
+ file_flags = force_nonblock ? O_NONBLOCK : 0;
+
+ ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags);
+ if (ret == -EAGAIN && force_nonblock) {
+ req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+ return -EAGAIN;
+ }
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
+ return 0;
#else
return -EOPNOTSUPP;
#endif
}
+static inline void io_poll_remove_req(struct io_kiocb *req)
+{
+ if (!RB_EMPTY_NODE(&req->rb_node)) {
+ rb_erase(&req->rb_node, &req->ctx->cancel_tree);
+ RB_CLEAR_NODE(&req->rb_node);
+ }
+}
+
static void io_poll_remove_one(struct io_kiocb *req)
{
struct io_poll_iocb *poll = &req->poll;
@@ -1655,25 +1959,47 @@ static void io_poll_remove_one(struct io_kiocb *req)
WRITE_ONCE(poll->canceled, true);
if (!list_empty(&poll->wait.entry)) {
list_del_init(&poll->wait.entry);
- io_queue_async_work(req->ctx, req);
+ io_queue_async_work(req);
}
spin_unlock(&poll->head->lock);
-
- list_del_init(&req->list);
+ io_poll_remove_req(req);
}
static void io_poll_remove_all(struct io_ring_ctx *ctx)
{
+ struct rb_node *node;
struct io_kiocb *req;
spin_lock_irq(&ctx->completion_lock);
- while (!list_empty(&ctx->cancel_list)) {
- req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
+ while ((node = rb_first(&ctx->cancel_tree)) != NULL) {
+ req = rb_entry(node, struct io_kiocb, rb_node);
io_poll_remove_one(req);
}
spin_unlock_irq(&ctx->completion_lock);
}
+static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
+{
+ struct rb_node *p, *parent = NULL;
+ struct io_kiocb *req;
+
+ p = ctx->cancel_tree.rb_node;
+ while (p) {
+ parent = p;
+ req = rb_entry(parent, struct io_kiocb, rb_node);
+ if (sqe_addr < req->user_data) {
+ p = p->rb_left;
+ } else if (sqe_addr > req->user_data) {
+ p = p->rb_right;
+ } else {
+ io_poll_remove_one(req);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
/*
* Find a running poll command that matches one specified in sqe->addr,
* and remove it if found.
@@ -1681,8 +2007,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *poll_req, *next;
- int ret = -ENOENT;
+ int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -1691,36 +2016,38 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
- if (READ_ONCE(sqe->addr) == poll_req->user_data) {
- io_poll_remove_one(poll_req);
- ret = 0;
- break;
- }
- }
+ ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr));
spin_unlock_irq(&ctx->completion_lock);
- io_cqring_add_event(req->ctx, sqe->user_data, ret);
+ io_cqring_add_event(req, ret);
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
io_put_req(req);
return 0;
}
-static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
- __poll_t mask)
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
{
+ struct io_ring_ctx *ctx = req->ctx;
+
req->poll.done = true;
- io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
+ io_cqring_fill_event(req, mangle_poll(mask));
io_commit_cqring(ctx);
}
-static void io_poll_complete_work(struct work_struct *work)
+static void io_poll_complete_work(struct io_wq_work **workptr)
{
+ struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_poll_iocb *poll = &req->poll;
struct poll_table_struct pt = { ._key = poll->events };
struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *nxt = NULL;
__poll_t mask = 0;
+ if (work->flags & IO_WQ_WORK_CANCEL)
+ WRITE_ONCE(poll->canceled, true);
+
if (!READ_ONCE(poll->canceled))
mask = vfs_poll(poll->file, &pt) & poll->events;
@@ -1737,12 +2064,15 @@ static void io_poll_complete_work(struct work_struct *work)
spin_unlock_irq(&ctx->completion_lock);
return;
}
- list_del_init(&req->list);
- io_poll_complete(ctx, req, mask);
+ io_poll_remove_req(req);
+ io_poll_complete(req, mask);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- io_put_req(req);
+
+ io_put_req_find_next(req, &nxt);
+ if (nxt)
+ *workptr = &nxt->work;
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1761,15 +2091,22 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
list_del_init(&poll->wait.entry);
+ /*
+ * Run completion inline if we can. We're using trylock here because
+ * we are violating the completion_lock -> poll wq lock ordering.
+ * If we have a link timeout we're going to need the completion_lock
+ * for finalizing the request, mark us as having grabbed that already.
+ */
if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
- list_del(&req->list);
- io_poll_complete(ctx, req, mask);
+ io_poll_remove_req(req);
+ io_poll_complete(req, mask);
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(req);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
- io_put_req(req);
} else {
- io_queue_async_work(ctx, req);
+ io_queue_async_work(req);
}
return 1;
@@ -1796,7 +2133,27 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
add_wait_queue(head, &pt->req->poll.wait);
}
-static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static void io_poll_req_insert(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct rb_node **p = &ctx->cancel_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct io_kiocb *tmp;
+
+ while (*p) {
+ parent = *p;
+ tmp = rb_entry(parent, struct io_kiocb, rb_node);
+ if (req->user_data < tmp->user_data)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+ rb_link_node(&req->rb_node, parent, p);
+ rb_insert_color(&req->rb_node, &ctx->cancel_tree);
+}
+
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt)
{
struct io_poll_iocb *poll = &req->poll;
struct io_ring_ctx *ctx = req->ctx;
@@ -1813,9 +2170,10 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EBADF;
req->submit.sqe = NULL;
- INIT_WORK(&req->work, io_poll_complete_work);
+ INIT_IO_WORK(&req->work, io_poll_complete_work);
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+ RB_CLEAR_NODE(&req->rb_node);
poll->head = NULL;
poll->done = false;
@@ -1848,18 +2206,18 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
else if (cancel)
WRITE_ONCE(poll->canceled, true);
else if (!poll->done) /* actually waiting for an event */
- list_add_tail(&req->list, &ctx->cancel_list);
+ io_poll_req_insert(req);
spin_unlock(&poll->head->lock);
}
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
- io_poll_complete(ctx, req, mask);
+ io_poll_complete(req, mask);
}
spin_unlock_irq(&ctx->completion_lock);
if (mask) {
io_cqring_ev_posted(ctx);
- io_put_req(req);
+ io_put_req_find_next(req, nxt);
}
return ipt.error;
}
@@ -1875,76 +2233,265 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
atomic_inc(&ctx->cq_timeouts);
spin_lock_irqsave(&ctx->completion_lock, flags);
- list_del(&req->list);
+ /*
+ * We could be racing with timeout deletion. If the list is empty,
+ * then timeout lookup already found it and will be handling it.
+ */
+ if (!list_empty(&req->list)) {
+ struct io_kiocb *prev;
- io_cqring_fill_event(ctx, req->user_data, -ETIME);
+ /*
+ * Adjust the reqs sequence before the current one because it
+ * will consume a slot in the cq_ring and the the cq_tail
+ * pointer will be increased, otherwise other timeout reqs may
+ * return in advance without waiting for enough wait_nr.
+ */
+ prev = req;
+ list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
+ prev->sequence++;
+ list_del_init(&req->list);
+ }
+
+ io_cqring_fill_event(req, -ETIME);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
-
+ if (req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
io_put_req(req);
return HRTIMER_NORESTART;
}
+static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+{
+ struct io_kiocb *req;
+ int ret = -ENOENT;
+
+ list_for_each_entry(req, &ctx->timeout_list, list) {
+ if (user_data == req->user_data) {
+ list_del_init(&req->list);
+ ret = 0;
+ break;
+ }
+ }
+
+ if (ret == -ENOENT)
+ return ret;
+
+ ret = hrtimer_try_to_cancel(&req->timeout.timer);
+ if (ret == -1)
+ return -EALREADY;
+
+ io_cqring_fill_event(req, -ECANCELED);
+ io_put_req(req);
+ return 0;
+}
+
+/*
+ * Remove or update an existing timeout command
+ */
+static int io_timeout_remove(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned flags;
+ int ret;
+
+ if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
+ return -EINVAL;
+ flags = READ_ONCE(sqe->timeout_flags);
+ if (flags)
+ return -EINVAL;
+
+ spin_lock_irq(&ctx->completion_lock);
+ ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr));
+
+ io_cqring_fill_event(req, ret);
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
+ if (ret < 0 && req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+ io_put_req(req);
+ return 0;
+}
+
static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- unsigned count, req_dist, tail_index;
+ unsigned count;
struct io_ring_ctx *ctx = req->ctx;
struct list_head *entry;
+ enum hrtimer_mode mode;
struct timespec64 ts;
+ unsigned span = 0;
+ unsigned flags;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
- sqe->len != 1)
+ if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len != 1)
+ return -EINVAL;
+ flags = READ_ONCE(sqe->timeout_flags);
+ if (flags & ~IORING_TIMEOUT_ABS)
return -EINVAL;
if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
+ if (flags & IORING_TIMEOUT_ABS)
+ mode = HRTIMER_MODE_ABS;
+ else
+ mode = HRTIMER_MODE_REL;
+
+ hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode);
+ req->flags |= REQ_F_TIMEOUT;
+
/*
* sqe->off holds how many events that need to occur for this
- * timeout event to be satisfied.
+ * timeout event to be satisfied. If it isn't set, then this is
+ * a pure timeout request, sequence isn't used.
*/
count = READ_ONCE(sqe->off);
- if (!count)
- count = 1;
+ if (!count) {
+ req->flags |= REQ_F_TIMEOUT_NOSEQ;
+ spin_lock_irq(&ctx->completion_lock);
+ entry = ctx->timeout_list.prev;
+ goto add;
+ }
req->sequence = ctx->cached_sq_head + count - 1;
- req->flags |= REQ_F_TIMEOUT;
+ /* reuse it to store the count */
+ req->submit.sequence = count;
/*
* Insertion sort, ensuring the first entry in the list is always
* the one we need first.
*/
- tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped;
- req_dist = req->sequence - tail_index;
spin_lock_irq(&ctx->completion_lock);
list_for_each_prev(entry, &ctx->timeout_list) {
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
- unsigned dist;
+ unsigned nxt_sq_head;
+ long long tmp, tmp_nxt;
+
+ if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
+ continue;
+
+ /*
+ * Since cached_sq_head + count - 1 can overflow, use type long
+ * long to store it.
+ */
+ tmp = (long long)ctx->cached_sq_head + count - 1;
+ nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
+ tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
+
+ /*
+ * cached_sq_head may overflow, and it will never overflow twice
+ * once there is some timeout req still be valid.
+ */
+ if (ctx->cached_sq_head < nxt_sq_head)
+ tmp += UINT_MAX;
- dist = nxt->sequence - tail_index;
- if (req_dist >= dist)
+ if (tmp > tmp_nxt)
break;
+
+ /*
+ * Sequence of reqs after the insert one and itself should
+ * be adjusted because each timeout req consumes a slot.
+ */
+ span++;
+ nxt->sequence++;
}
+ req->sequence -= span;
+add:
list_add(&req->list, entry);
+ req->timeout.timer.function = io_timeout_fn;
+ hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode);
spin_unlock_irq(&ctx->completion_lock);
+ return 0;
+}
- hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- req->timeout.timer.function = io_timeout_fn;
- hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
- HRTIMER_MODE_REL);
+static bool io_cancel_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+ return req->user_data == (unsigned long) data;
+}
+
+static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
+{
+ enum io_wq_cancel cancel_ret;
+ int ret = 0;
+
+ cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
+ switch (cancel_ret) {
+ case IO_WQ_CANCEL_OK:
+ ret = 0;
+ break;
+ case IO_WQ_CANCEL_RUNNING:
+ ret = -EALREADY;
+ break;
+ case IO_WQ_CANCEL_NOTFOUND:
+ ret = -ENOENT;
+ break;
+ }
+
+ return ret;
+}
+
+static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
+ struct io_kiocb *req, __u64 sqe_addr,
+ struct io_kiocb **nxt)
+{
+ unsigned long flags;
+ int ret;
+
+ ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
+ if (ret != -ENOENT) {
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ goto done;
+ }
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ ret = io_timeout_cancel(ctx, sqe_addr);
+ if (ret != -ENOENT)
+ goto done;
+ ret = io_poll_cancel(ctx, sqe_addr);
+done:
+ io_cqring_fill_event(req, ret);
+ io_commit_cqring(ctx);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ io_cqring_ev_posted(ctx);
+
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
+ io_put_req_find_next(req, nxt);
+}
+
+static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
+ sqe->cancel_flags)
+ return -EINVAL;
+
+ io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), NULL);
return 0;
}
-static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_req_defer(struct io_kiocb *req)
{
+ const struct io_uring_sqe *sqe = req->submit.sqe;
struct io_uring_sqe *sqe_copy;
+ struct io_ring_ctx *ctx = req->ctx;
- if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
+ /* Still need defer if there is pending req in defer list. */
+ if (!req_need_defer(req) && list_empty(&ctx->defer_list))
return 0;
sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
@@ -1952,7 +2499,7 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
return -EAGAIN;
spin_lock_irq(&ctx->completion_lock);
- if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
+ if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
spin_unlock_irq(&ctx->completion_lock);
kfree(sqe_copy);
return 0;
@@ -1961,64 +2508,70 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
req->submit.sqe = sqe_copy;
- INIT_WORK(&req->work, io_sq_wq_submit_work);
+ trace_io_uring_defer(ctx, req, false);
list_add_tail(&req->list, &ctx->defer_list);
spin_unlock_irq(&ctx->completion_lock);
return -EIOCBQUEUED;
}
-static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct sqe_submit *s, bool force_nonblock)
+static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
{
int ret, opcode;
-
- req->user_data = READ_ONCE(s->sqe->user_data);
-
- if (unlikely(s->index >= ctx->sq_entries))
- return -EINVAL;
+ struct sqe_submit *s = &req->submit;
+ struct io_ring_ctx *ctx = req->ctx;
opcode = READ_ONCE(s->sqe->opcode);
switch (opcode) {
case IORING_OP_NOP:
- ret = io_nop(req, req->user_data);
+ ret = io_nop(req);
break;
case IORING_OP_READV:
if (unlikely(s->sqe->buf_index))
return -EINVAL;
- ret = io_read(req, s, force_nonblock);
+ ret = io_read(req, nxt, force_nonblock);
break;
case IORING_OP_WRITEV:
if (unlikely(s->sqe->buf_index))
return -EINVAL;
- ret = io_write(req, s, force_nonblock);
+ ret = io_write(req, nxt, force_nonblock);
break;
case IORING_OP_READ_FIXED:
- ret = io_read(req, s, force_nonblock);
+ ret = io_read(req, nxt, force_nonblock);
break;
case IORING_OP_WRITE_FIXED:
- ret = io_write(req, s, force_nonblock);
+ ret = io_write(req, nxt, force_nonblock);
break;
case IORING_OP_FSYNC:
- ret = io_fsync(req, s->sqe, force_nonblock);
+ ret = io_fsync(req, s->sqe, nxt, force_nonblock);
break;
case IORING_OP_POLL_ADD:
- ret = io_poll_add(req, s->sqe);
+ ret = io_poll_add(req, s->sqe, nxt);
break;
case IORING_OP_POLL_REMOVE:
ret = io_poll_remove(req, s->sqe);
break;
case IORING_OP_SYNC_FILE_RANGE:
- ret = io_sync_file_range(req, s->sqe, force_nonblock);
+ ret = io_sync_file_range(req, s->sqe, nxt, force_nonblock);
break;
case IORING_OP_SENDMSG:
- ret = io_sendmsg(req, s->sqe, force_nonblock);
+ ret = io_sendmsg(req, s->sqe, nxt, force_nonblock);
break;
case IORING_OP_RECVMSG:
- ret = io_recvmsg(req, s->sqe, force_nonblock);
+ ret = io_recvmsg(req, s->sqe, nxt, force_nonblock);
break;
case IORING_OP_TIMEOUT:
ret = io_timeout(req, s->sqe);
break;
+ case IORING_OP_TIMEOUT_REMOVE:
+ ret = io_timeout_remove(req, s->sqe);
+ break;
+ case IORING_OP_ACCEPT:
+ ret = io_accept(req, s->sqe, nxt, force_nonblock);
+ break;
+ case IORING_OP_ASYNC_CANCEL:
+ ret = io_async_cancel(req, s->sqe, nxt);
+ break;
default:
ret = -EINVAL;
break;
@@ -2032,187 +2585,65 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
return -EAGAIN;
/* workqueue context doesn't hold uring_lock, grab it now */
- if (s->needs_lock)
+ if (s->in_async)
mutex_lock(&ctx->uring_lock);
io_iopoll_req_issued(req);
- if (s->needs_lock)
+ if (s->in_async)
mutex_unlock(&ctx->uring_lock);
}
return 0;
}
-static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
- const struct io_uring_sqe *sqe)
-{
- switch (sqe->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- return &ctx->pending_async[READ];
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- return &ctx->pending_async[WRITE];
- default:
- return NULL;
- }
-}
-
-static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
-{
- u8 opcode = READ_ONCE(sqe->opcode);
-
- return !(opcode == IORING_OP_READ_FIXED ||
- opcode == IORING_OP_WRITE_FIXED);
-}
-
-static void io_sq_wq_submit_work(struct work_struct *work)
+static void io_wq_submit_work(struct io_wq_work **workptr)
{
+ struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct io_ring_ctx *ctx = req->ctx;
- struct mm_struct *cur_mm = NULL;
- struct async_list *async_list;
- LIST_HEAD(req_list);
- mm_segment_t old_fs;
- int ret;
+ struct sqe_submit *s = &req->submit;
+ const struct io_uring_sqe *sqe = s->sqe;
+ struct io_kiocb *nxt = NULL;
+ int ret = 0;
- async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
-restart:
- do {
- struct sqe_submit *s = &req->submit;
- const struct io_uring_sqe *sqe = s->sqe;
- unsigned int flags = req->flags;
+ /* Ensure we clear previously set non-block flag */
+ req->rw.ki_flags &= ~IOCB_NOWAIT;
- /* Ensure we clear previously set non-block flag */
- req->rw.ki_flags &= ~IOCB_NOWAIT;
+ if (work->flags & IO_WQ_WORK_CANCEL)
+ ret = -ECANCELED;
- ret = 0;
- if (io_sqe_needs_user(sqe) && !cur_mm) {
- if (!mmget_not_zero(ctx->sqo_mm)) {
- ret = -EFAULT;
- } else {
- cur_mm = ctx->sqo_mm;
- use_mm(cur_mm);
- old_fs = get_fs();
- set_fs(USER_DS);
- }
- }
+ if (!ret) {
+ s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
+ s->in_async = true;
+ do {
+ ret = __io_submit_sqe(req, &nxt, false);
+ /*
+ * We can get EAGAIN for polled IO even though we're
+ * forcing a sync submission from here, since we can't
+ * wait for request slots on the block side.
+ */
+ if (ret != -EAGAIN)
+ break;
+ cond_resched();
+ } while (1);
+ }
- if (!ret) {
- s->has_user = cur_mm != NULL;
- s->needs_lock = true;
- do {
- ret = __io_submit_sqe(ctx, req, s, false);
- /*
- * We can get EAGAIN for polled IO even though
- * we're forcing a sync submission from here,
- * since we can't wait for request slots on the
- * block side.
- */
- if (ret != -EAGAIN)
- break;
- cond_resched();
- } while (1);
- }
+ /* drop submission reference */
+ io_put_req(req);
- /* drop submission reference */
+ if (ret) {
+ if (req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+ io_cqring_add_event(req, ret);
io_put_req(req);
-
- if (ret) {
- io_cqring_add_event(ctx, sqe->user_data, ret);
- io_put_req(req);
- }
-
- /* async context always use a copy of the sqe */
- kfree(sqe);
-
- /* req from defer and link list needn't decrease async cnt */
- if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
- goto out;
-
- if (!async_list)
- break;
- if (!list_empty(&req_list)) {
- req = list_first_entry(&req_list, struct io_kiocb,
- list);
- list_del(&req->list);
- continue;
- }
- if (list_empty(&async_list->list))
- break;
-
- req = NULL;
- spin_lock(&async_list->lock);
- if (list_empty(&async_list->list)) {
- spin_unlock(&async_list->lock);
- break;
- }
- list_splice_init(&async_list->list, &req_list);
- spin_unlock(&async_list->lock);
-
- req = list_first_entry(&req_list, struct io_kiocb, list);
- list_del(&req->list);
- } while (req);
-
- /*
- * Rare case of racing with a submitter. If we find the count has
- * dropped to zero AND we have pending work items, then restart
- * the processing. This is a tiny race window.
- */
- if (async_list) {
- ret = atomic_dec_return(&async_list->cnt);
- while (!ret && !list_empty(&async_list->list)) {
- spin_lock(&async_list->lock);
- atomic_inc(&async_list->cnt);
- list_splice_init(&async_list->list, &req_list);
- spin_unlock(&async_list->lock);
-
- if (!list_empty(&req_list)) {
- req = list_first_entry(&req_list,
- struct io_kiocb, list);
- list_del(&req->list);
- goto restart;
- }
- ret = atomic_dec_return(&async_list->cnt);
- }
}
-out:
- if (cur_mm) {
- set_fs(old_fs);
- unuse_mm(cur_mm);
- mmput(cur_mm);
- }
-}
-
-/*
- * See if we can piggy back onto previously submitted work, that is still
- * running. We currently only allow this if the new request is sequential
- * to the previous one we punted.
- */
-static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
-{
- bool ret;
-
- if (!list)
- return false;
- if (!(req->flags & REQ_F_SEQ_PREV))
- return false;
- if (!atomic_read(&list->cnt))
- return false;
+ /* async context always use a copy of the sqe */
+ kfree(sqe);
- ret = true;
- spin_lock(&list->lock);
- list_add_tail(&req->list, &list->list);
- /*
- * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
- */
- smp_mb();
- if (!atomic_read(&list->cnt)) {
- list_del_init(&req->list);
- ret = false;
+ /* if a dependent link is ready, pass it back */
+ if (!ret && nxt) {
+ io_prep_async_work(nxt);
+ *workptr = &nxt->work;
}
- spin_unlock(&list->lock);
- return ret;
}
static bool io_op_needs_file(const struct io_uring_sqe *sqe)
@@ -2222,15 +2653,29 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe)
switch (op) {
case IORING_OP_NOP:
case IORING_OP_POLL_REMOVE:
+ case IORING_OP_TIMEOUT:
+ case IORING_OP_TIMEOUT_REMOVE:
+ case IORING_OP_ASYNC_CANCEL:
+ case IORING_OP_LINK_TIMEOUT:
return false;
default:
return true;
}
}
-static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
- struct io_submit_state *state, struct io_kiocb *req)
+static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
+ int index)
{
+ struct fixed_file_table *table;
+
+ table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
+ return table->files[index & IORING_FILE_TABLE_MASK];
+}
+
+static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
+{
+ struct sqe_submit *s = &req->submit;
+ struct io_ring_ctx *ctx = req->ctx;
unsigned flags;
int fd;
@@ -2250,14 +2695,18 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
return 0;
if (flags & IOSQE_FIXED_FILE) {
- if (unlikely(!ctx->user_files ||
+ if (unlikely(!ctx->file_table ||
(unsigned) fd >= ctx->nr_user_files))
return -EBADF;
- req->file = ctx->user_files[fd];
+ fd = array_index_nospec(fd, ctx->nr_user_files);
+ req->file = io_file_from_index(ctx, fd);
+ if (!req->file)
+ return -EBADF;
req->flags |= REQ_F_FIXED_FILE;
} else {
if (s->needs_fixed_file)
return -EBADF;
+ trace_io_uring_file_get(ctx, fd);
req->file = io_file_get(state, fd);
if (unlikely(!req->file))
return -EBADF;
@@ -2266,43 +2715,194 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
return 0;
}
-static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- struct sqe_submit *s, bool force_nonblock)
+static int io_grab_files(struct io_kiocb *req)
+{
+ int ret = -EBADF;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ rcu_read_lock();
+ spin_lock_irq(&ctx->inflight_lock);
+ /*
+ * We use the f_ops->flush() handler to ensure that we can flush
+ * out work accessing these files if the fd is closed. Check if
+ * the fd has changed since we started down this path, and disallow
+ * this operation if it has.
+ */
+ if (fcheck(req->submit.ring_fd) == req->submit.ring_file) {
+ list_add(&req->inflight_entry, &ctx->inflight_list);
+ req->flags |= REQ_F_INFLIGHT;
+ req->work.files = current->files;
+ ret = 0;
+ }
+ spin_unlock_irq(&ctx->inflight_lock);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
+{
+ struct io_kiocb *req = container_of(timer, struct io_kiocb,
+ timeout.timer);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *prev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+
+ /*
+ * We don't expect the list to be empty, that will only happen if we
+ * race with the completion of the linked work.
+ */
+ if (!list_empty(&req->list)) {
+ prev = list_entry(req->list.prev, struct io_kiocb, link_list);
+ if (refcount_inc_not_zero(&prev->refs))
+ list_del_init(&req->list);
+ else
+ prev = NULL;
+ }
+
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ if (prev) {
+ io_async_find_and_cancel(ctx, req, prev->user_data, NULL);
+ io_put_req(prev);
+ } else {
+ io_cqring_add_event(req, -ETIME);
+ io_put_req(req);
+ }
+ return HRTIMER_NORESTART;
+}
+
+static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
+ enum hrtimer_mode *mode)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ /*
+ * If the list is now empty, then our linked request finished before
+ * we got a chance to setup the timer
+ */
+ spin_lock_irq(&ctx->completion_lock);
+ if (!list_empty(&req->list)) {
+ req->timeout.timer.function = io_link_timeout_fn;
+ hrtimer_start(&req->timeout.timer, timespec64_to_ktime(*ts),
+ *mode);
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+
+ /* drop submission reference */
+ io_put_req(req);
+}
+
+static int io_validate_link_timeout(const struct io_uring_sqe *sqe,
+ struct timespec64 *ts)
+{
+ if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off)
+ return -EINVAL;
+ if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS)
+ return -EINVAL;
+ if (get_timespec64(ts, u64_to_user_ptr(sqe->addr)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
+ struct timespec64 *ts,
+ enum hrtimer_mode *mode)
{
+ struct io_kiocb *nxt;
int ret;
- ret = __io_submit_sqe(ctx, req, s, force_nonblock);
- if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
+ if (!(req->flags & REQ_F_LINK))
+ return NULL;
+
+ nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
+ if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT)
+ return NULL;
+
+ ret = io_validate_link_timeout(nxt->submit.sqe, ts);
+ if (ret) {
+ list_del_init(&nxt->list);
+ io_cqring_add_event(nxt, ret);
+ io_double_put_req(nxt);
+ return ERR_PTR(-ECANCELED);
+ }
+
+ if (nxt->submit.sqe->timeout_flags & IORING_TIMEOUT_ABS)
+ *mode = HRTIMER_MODE_ABS;
+ else
+ *mode = HRTIMER_MODE_REL;
+
+ req->flags |= REQ_F_LINK_TIMEOUT;
+ hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, *mode);
+ return nxt;
+}
+
+static int __io_queue_sqe(struct io_kiocb *req)
+{
+ enum hrtimer_mode mode;
+ struct io_kiocb *nxt;
+ struct timespec64 ts;
+ int ret;
+
+ nxt = io_prep_linked_timeout(req, &ts, &mode);
+ if (IS_ERR(nxt)) {
+ ret = PTR_ERR(nxt);
+ nxt = NULL;
+ goto err;
+ }
+
+ ret = __io_submit_sqe(req, NULL, true);
+
+ /*
+ * We async punt it if the file wasn't marked NOWAIT, or if the file
+ * doesn't support non-blocking read/write attempts
+ */
+ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
+ (req->flags & REQ_F_MUST_PUNT))) {
+ struct sqe_submit *s = &req->submit;
struct io_uring_sqe *sqe_copy;
sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
if (sqe_copy) {
- struct async_list *list;
-
s->sqe = sqe_copy;
- memcpy(&req->submit, s, sizeof(*s));
- list = io_async_list_from_sqe(ctx, s->sqe);
- if (!io_add_to_prev_work(list, req)) {
- if (list)
- atomic_inc(&list->cnt);
- INIT_WORK(&req->work, io_sq_wq_submit_work);
- io_queue_async_work(ctx, req);
+ if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
+ ret = io_grab_files(req);
+ if (ret) {
+ kfree(sqe_copy);
+ goto err;
+ }
}
/*
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
*/
+ io_queue_async_work(req);
+
+ if (nxt)
+ io_queue_linked_timeout(nxt, &ts, &mode);
+
return 0;
}
}
+err:
/* drop submission reference */
io_put_req(req);
+ if (nxt) {
+ if (!ret)
+ io_queue_linked_timeout(nxt, &ts, &mode);
+ else
+ io_put_req(nxt);
+ }
+
/* and drop final reference, if we failed */
if (ret) {
- io_cqring_add_event(ctx, req->user_data, ret);
+ io_cqring_add_event(req, ret);
if (req->flags & REQ_F_LINK)
req->flags |= REQ_F_FAIL_LINK;
io_put_req(req);
@@ -2311,32 +2911,30 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
return ret;
}
-static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- struct sqe_submit *s, bool force_nonblock)
+static int io_queue_sqe(struct io_kiocb *req)
{
int ret;
- ret = io_req_defer(ctx, req, s->sqe);
+ ret = io_req_defer(req);
if (ret) {
if (ret != -EIOCBQUEUED) {
- io_free_req(req);
- io_cqring_add_event(ctx, s->sqe->user_data, ret);
+ io_cqring_add_event(req, ret);
+ io_double_put_req(req);
}
return 0;
}
- return __io_queue_sqe(ctx, req, s, force_nonblock);
+ return __io_queue_sqe(req);
}
-static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
- struct sqe_submit *s, struct io_kiocb *shadow,
- bool force_nonblock)
+static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
{
int ret;
int need_submit = false;
+ struct io_ring_ctx *ctx = req->ctx;
if (!shadow)
- return io_queue_sqe(ctx, req, s, force_nonblock);
+ return io_queue_sqe(req);
/*
* Mark the first IO in link list as DRAIN, let all the following
@@ -2344,11 +2942,12 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
* list.
*/
req->flags |= REQ_F_IO_DRAIN;
- ret = io_req_defer(ctx, req, s->sqe);
+ ret = io_req_defer(req);
if (ret) {
if (ret != -EIOCBQUEUED) {
- io_free_req(req);
- io_cqring_add_event(ctx, s->sqe->user_data, ret);
+ io_cqring_add_event(req, ret);
+ io_double_put_req(req);
+ __io_free_req(shadow);
return 0;
}
} else {
@@ -2361,43 +2960,39 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* Insert shadow req to defer_list, blocking next IOs */
spin_lock_irq(&ctx->completion_lock);
+ trace_io_uring_defer(ctx, shadow, true);
list_add_tail(&shadow->list, &ctx->defer_list);
spin_unlock_irq(&ctx->completion_lock);
if (need_submit)
- return __io_queue_sqe(ctx, req, s, force_nonblock);
+ return __io_queue_sqe(req);
return 0;
}
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
-static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
- struct io_submit_state *state, struct io_kiocb **link,
- bool force_nonblock)
+static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
+ struct io_kiocb **link)
{
struct io_uring_sqe *sqe_copy;
- struct io_kiocb *req;
+ struct sqe_submit *s = &req->submit;
+ struct io_ring_ctx *ctx = req->ctx;
int ret;
+ req->user_data = s->sqe->user_data;
+
/* enforce forwards compatibility on users */
if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
ret = -EINVAL;
- goto err;
- }
-
- req = io_get_req(ctx, state);
- if (unlikely(!req)) {
- ret = -EAGAIN;
- goto err;
+ goto err_req;
}
- ret = io_req_set_file(ctx, s, state, req);
+ ret = io_req_set_file(state, req);
if (unlikely(ret)) {
err_req:
- io_free_req(req);
-err:
- io_cqring_add_event(ctx, s->sqe->user_data, ret);
+ io_cqring_add_event(req, ret);
+ io_double_put_req(req);
return;
}
@@ -2418,16 +3013,19 @@ err:
}
s->sqe = sqe_copy;
- memcpy(&req->submit, s, sizeof(*s));
+ trace_io_uring_link(ctx, req, prev);
list_add_tail(&req->list, &prev->link_list);
} else if (s->sqe->flags & IOSQE_IO_LINK) {
req->flags |= REQ_F_LINK;
- memcpy(&req->submit, s, sizeof(*s));
INIT_LIST_HEAD(&req->link_list);
*link = req;
+ } else if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
+ /* Only valid as a linked SQE */
+ ret = -EINVAL;
+ goto err_req;
} else {
- io_queue_sqe(ctx, req, s, force_nonblock);
+ io_queue_sqe(req);
}
}
@@ -2498,7 +3096,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
head = READ_ONCE(sq_array[head & ctx->sq_mask]);
if (head < ctx->sq_entries) {
- s->index = head;
+ s->ring_file = NULL;
s->sqe = &ctx->sq_sqes[head];
s->sequence = ctx->cached_sq_head;
ctx->cached_sq_head++;
@@ -2507,18 +3105,25 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
/* drop invalid entries */
ctx->cached_sq_head++;
- rings->sq_dropped++;
+ ctx->cached_sq_dropped++;
+ WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
return false;
}
-static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
- unsigned int nr, bool has_user, bool mm_fault)
+static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
+ struct file *ring_file, int ring_fd,
+ struct mm_struct **mm, bool async)
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
struct io_kiocb *shadow_req = NULL;
- bool prev_was_link = false;
int i, submitted = 0;
+ bool mm_fault = false;
+
+ if (!list_empty(&ctx->cq_overflow_list)) {
+ io_cqring_overflow_flush(ctx, false);
+ return -EBUSY;
+ }
if (nr > IO_PLUG_THRESHOLD) {
io_submit_state_start(&state, ctx, nr);
@@ -2526,19 +3131,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
}
for (i = 0; i < nr; i++) {
- /*
- * If previous wasn't linked and we have a linked command,
- * that's the end of the chain. Submit the previous link.
- */
- if (!prev_was_link && link) {
- io_queue_link_head(ctx, link, &link->submit, shadow_req,
- true);
- link = NULL;
- shadow_req = NULL;
+ struct io_kiocb *req;
+ unsigned int sqe_flags;
+
+ req = io_get_req(ctx, statep);
+ if (unlikely(!req)) {
+ if (!submitted)
+ submitted = -EAGAIN;
+ break;
+ }
+ if (!io_get_sqring(ctx, &req->submit)) {
+ __io_free_req(req);
+ break;
+ }
+
+ if (io_sqe_needs_user(req->submit.sqe) && !*mm) {
+ mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
+ if (!mm_fault) {
+ use_mm(ctx->sqo_mm);
+ *mm = ctx->sqo_mm;
+ }
}
- prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
- if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) {
+ sqe_flags = req->submit.sqe->flags;
+
+ if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
if (!shadow_req) {
shadow_req = io_get_req(ctx, NULL);
if (unlikely(!shadow_req))
@@ -2546,55 +3163,79 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
refcount_dec(&shadow_req->refs);
}
- shadow_req->sequence = sqes[i].sequence;
+ shadow_req->sequence = req->submit.sequence;
}
out:
- if (unlikely(mm_fault)) {
- io_cqring_add_event(ctx, sqes[i].sqe->user_data,
- -EFAULT);
- } else {
- sqes[i].has_user = has_user;
- sqes[i].needs_lock = true;
- sqes[i].needs_fixed_file = true;
- io_submit_sqe(ctx, &sqes[i], statep, &link, true);
- submitted++;
+ req->submit.ring_file = ring_file;
+ req->submit.ring_fd = ring_fd;
+ req->submit.has_user = *mm != NULL;
+ req->submit.in_async = async;
+ req->submit.needs_fixed_file = async;
+ trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data,
+ true, async);
+ io_submit_sqe(req, statep, &link);
+ submitted++;
+
+ /*
+ * If previous wasn't linked and we have a linked command,
+ * that's the end of the chain. Submit the previous link.
+ */
+ if (!(sqe_flags & IOSQE_IO_LINK) && link) {
+ io_queue_link_head(link, shadow_req);
+ link = NULL;
+ shadow_req = NULL;
}
}
if (link)
- io_queue_link_head(ctx, link, &link->submit, shadow_req, true);
+ io_queue_link_head(link, shadow_req);
if (statep)
io_submit_state_end(&state);
+ /* Commit SQ ring head once we've consumed and submitted all SQEs */
+ io_commit_sqring(ctx);
+
return submitted;
}
static int io_sq_thread(void *data)
{
- struct sqe_submit sqes[IO_IOPOLL_BATCH];
struct io_ring_ctx *ctx = data;
struct mm_struct *cur_mm = NULL;
mm_segment_t old_fs;
DEFINE_WAIT(wait);
unsigned inflight;
unsigned long timeout;
+ int ret;
- complete(&ctx->sqo_thread_started);
+ complete(&ctx->completions[1]);
old_fs = get_fs();
set_fs(USER_DS);
- timeout = inflight = 0;
+ ret = timeout = inflight = 0;
while (!kthread_should_park()) {
- bool all_fixed, mm_fault = false;
- int i;
+ unsigned int to_submit;
if (inflight) {
unsigned nr_events = 0;
if (ctx->flags & IORING_SETUP_IOPOLL) {
- io_iopoll_check(ctx, &nr_events, 0);
+ /*
+ * inflight is the count of the maximum possible
+ * entries we submitted, but it can be smaller
+ * if we dropped some of them. If we don't have
+ * poll entries available, then we know that we
+ * have nothing left to poll for. Reset the
+ * inflight count to zero in that case.
+ */
+ mutex_lock(&ctx->uring_lock);
+ if (!list_empty(&ctx->poll_list))
+ __io_iopoll_check(ctx, &nr_events, 0);
+ else
+ inflight = 0;
+ mutex_unlock(&ctx->uring_lock);
} else {
/*
* Normal IO, just pretend everything completed.
@@ -2608,13 +3249,22 @@ static int io_sq_thread(void *data)
timeout = jiffies + ctx->sq_thread_idle;
}
- if (!io_get_sqring(ctx, &sqes[0])) {
+ to_submit = io_sqring_entries(ctx);
+
+ /*
+ * If submit got -EBUSY, flag us as needing the application
+ * to enter the kernel to reap and flush events.
+ */
+ if (!to_submit || ret == -EBUSY) {
/*
* We're polling. If we're within the defined idle
* period, then let us spin without work before going
- * to sleep.
+ * to sleep. The exception is if we got EBUSY doing
+ * more IO, we should wait for the application to
+ * reap events and wake us up.
*/
- if (inflight || !time_after(jiffies, timeout)) {
+ if (inflight ||
+ (!time_after(jiffies, timeout) && ret != -EBUSY)) {
cond_resched();
continue;
}
@@ -2639,7 +3289,8 @@ static int io_sq_thread(void *data)
/* make sure to read SQ tail after writing flags */
smp_mb();
- if (!io_get_sqring(ctx, &sqes[0])) {
+ to_submit = io_sqring_entries(ctx);
+ if (!to_submit || ret == -EBUSY) {
if (kthread_should_park()) {
finish_wait(&ctx->sqo_wait, &wait);
break;
@@ -2657,31 +3308,10 @@ static int io_sq_thread(void *data)
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
}
- i = 0;
- all_fixed = true;
- do {
- if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
- all_fixed = false;
-
- i++;
- if (i == ARRAY_SIZE(sqes))
- break;
- } while (io_get_sqring(ctx, &sqes[i]));
-
- /* Unless all new commands are FIXED regions, grab mm */
- if (!all_fixed && !cur_mm) {
- mm_fault = !mmget_not_zero(ctx->sqo_mm);
- if (!mm_fault) {
- use_mm(ctx->sqo_mm);
- cur_mm = ctx->sqo_mm;
- }
- }
-
- inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
- mm_fault);
-
- /* Commit SQ ring head once we've consumed all SQEs */
- io_commit_sqring(ctx);
+ to_submit = min(to_submit, ctx->sq_entries);
+ ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+ if (ret > 0)
+ inflight += ret;
}
set_fs(old_fs);
@@ -2695,79 +3325,6 @@ static int io_sq_thread(void *data)
return 0;
}
-static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
- bool block_for_last)
-{
- struct io_submit_state state, *statep = NULL;
- struct io_kiocb *link = NULL;
- struct io_kiocb *shadow_req = NULL;
- bool prev_was_link = false;
- int i, submit = 0;
-
- if (to_submit > IO_PLUG_THRESHOLD) {
- io_submit_state_start(&state, ctx, to_submit);
- statep = &state;
- }
-
- for (i = 0; i < to_submit; i++) {
- bool force_nonblock = true;
- struct sqe_submit s;
-
- if (!io_get_sqring(ctx, &s))
- break;
-
- /*
- * If previous wasn't linked and we have a linked command,
- * that's the end of the chain. Submit the previous link.
- */
- if (!prev_was_link && link) {
- io_queue_link_head(ctx, link, &link->submit, shadow_req,
- force_nonblock);
- link = NULL;
- shadow_req = NULL;
- }
- prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
-
- if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
- if (!shadow_req) {
- shadow_req = io_get_req(ctx, NULL);
- if (unlikely(!shadow_req))
- goto out;
- shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
- refcount_dec(&shadow_req->refs);
- }
- shadow_req->sequence = s.sequence;
- }
-
-out:
- s.has_user = true;
- s.needs_lock = false;
- s.needs_fixed_file = false;
- submit++;
-
- /*
- * The caller will block for events after submit, submit the
- * last IO non-blocking. This is either the only IO it's
- * submitting, or it already submitted the previous ones. This
- * improves performance by avoiding an async punt that we don't
- * need to do.
- */
- if (block_for_last && submit == to_submit)
- force_nonblock = false;
-
- io_submit_sqe(ctx, &s, statep, &link, force_nonblock);
- }
- io_commit_sqring(ctx);
-
- if (link)
- io_queue_link_head(ctx, link, &link->submit, shadow_req,
- block_for_last);
- if (statep)
- io_submit_state_end(statep);
-
- return submit;
-}
-
struct io_wait_queue {
struct wait_queue_entry wq;
struct io_ring_ctx *ctx;
@@ -2775,7 +3332,7 @@ struct io_wait_queue {
unsigned nr_timeouts;
};
-static inline bool io_should_wake(struct io_wait_queue *iowq)
+static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
{
struct io_ring_ctx *ctx = iowq->ctx;
@@ -2784,7 +3341,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
* started waiting. For timeouts, we always want to return to userspace,
* regardless of event count.
*/
- return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+ return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}
@@ -2794,7 +3351,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
wq);
- if (!io_should_wake(iowq))
+ /* use noflush == true, as we can't safely rely on locking context */
+ if (!io_should_wake(iowq, true))
return -1;
return autoremove_wake_function(curr, mode, wake_flags, key);
@@ -2817,9 +3375,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
.to_wait = min_events,
};
struct io_rings *rings = ctx->rings;
- int ret;
+ int ret = 0;
- if (io_cqring_events(rings) >= min_events)
+ if (io_cqring_events(ctx, false) >= min_events)
return 0;
if (sig) {
@@ -2835,24 +3393,22 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
- ret = 0;
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+ trace_io_uring_cqring_wait(ctx, min_events);
do {
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
- if (io_should_wake(&iowq))
+ if (io_should_wake(&iowq, false))
break;
schedule();
if (signal_pending(current)) {
- ret = -ERESTARTSYS;
+ ret = -EINTR;
break;
}
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
- restore_saved_sigmask_unless(ret == -ERESTARTSYS);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
+ restore_saved_sigmask_unless(ret == -EINTR);
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
@@ -2870,19 +3426,29 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
#else
int i;
- for (i = 0; i < ctx->nr_user_files; i++)
- fput(ctx->user_files[i]);
+ for (i = 0; i < ctx->nr_user_files; i++) {
+ struct file *file;
+
+ file = io_file_from_index(ctx, i);
+ if (file)
+ fput(file);
+ }
#endif
}
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
- if (!ctx->user_files)
+ unsigned nr_tables, i;
+
+ if (!ctx->file_table)
return -ENXIO;
__io_sqe_files_unregister(ctx);
- kfree(ctx->user_files);
- ctx->user_files = NULL;
+ nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
+ for (i = 0; i < nr_tables; i++)
+ kfree(ctx->file_table[i].files);
+ kfree(ctx->file_table);
+ ctx->file_table = NULL;
ctx->nr_user_files = 0;
return 0;
}
@@ -2890,7 +3456,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
static void io_sq_thread_stop(struct io_ring_ctx *ctx)
{
if (ctx->sqo_thread) {
- wait_for_completion(&ctx->sqo_thread_started);
+ wait_for_completion(&ctx->completions[1]);
/*
* The park is a bit of a work-around, without it we get
* warning spews on shutdown with SQPOLL set and affinity
@@ -2904,15 +3470,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
static void io_finish_async(struct io_ring_ctx *ctx)
{
- int i;
-
io_sq_thread_stop(ctx);
- for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
- if (ctx->sqo_wq[i]) {
- destroy_workqueue(ctx->sqo_wq[i]);
- ctx->sqo_wq[i] = NULL;
- }
+ if (ctx->io_wq) {
+ io_wq_destroy(ctx->io_wq);
+ ctx->io_wq = NULL;
}
}
@@ -2921,7 +3483,9 @@ static void io_destruct_skb(struct sk_buff *skb)
{
struct io_ring_ctx *ctx = skb->sk->sk_user_data;
- io_finish_async(ctx);
+ if (ctx->io_wq)
+ io_wq_flush(ctx->io_wq);
+
unix_destruct_scm(skb);
}
@@ -2935,7 +3499,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
struct sock *sk = ctx->ring_sock->sk;
struct scm_fp_list *fpl;
struct sk_buff *skb;
- int i;
+ int i, nr_files;
if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
unsigned long inflight = ctx->user->unix_inflight + nr;
@@ -2955,21 +3519,33 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
}
skb->sk = sk;
- skb->destructor = io_destruct_skb;
+ nr_files = 0;
fpl->user = get_uid(ctx->user);
for (i = 0; i < nr; i++) {
- fpl->fp[i] = get_file(ctx->user_files[i + offset]);
- unix_inflight(fpl->user, fpl->fp[i]);
+ struct file *file = io_file_from_index(ctx, i + offset);
+
+ if (!file)
+ continue;
+ fpl->fp[nr_files] = get_file(file);
+ unix_inflight(fpl->user, fpl->fp[nr_files]);
+ nr_files++;
}
- fpl->max = fpl->count = nr;
- UNIXCB(skb).fp = fpl;
- refcount_add(skb->truesize, &sk->sk_wmem_alloc);
- skb_queue_head(&sk->sk_receive_queue, skb);
+ if (nr_files) {
+ fpl->max = SCM_MAX_FD;
+ fpl->count = nr_files;
+ UNIXCB(skb).fp = fpl;
+ skb->destructor = io_destruct_skb;
+ refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+ skb_queue_head(&sk->sk_receive_queue, skb);
- for (i = 0; i < nr; i++)
- fput(fpl->fp[i]);
+ for (i = 0; i < nr_files; i++)
+ fput(fpl->fp[i]);
+ } else {
+ kfree_skb(skb);
+ kfree(fpl);
+ }
return 0;
}
@@ -3000,7 +3576,10 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
return 0;
while (total < ctx->nr_user_files) {
- fput(ctx->user_files[total]);
+ struct file *file = io_file_from_index(ctx, total);
+
+ if (file)
+ fput(file);
total++;
}
@@ -3013,33 +3592,79 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
}
#endif
+static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
+ unsigned nr_files)
+{
+ int i;
+
+ for (i = 0; i < nr_tables; i++) {
+ struct fixed_file_table *table = &ctx->file_table[i];
+ unsigned this_files;
+
+ this_files = min(nr_files, IORING_MAX_FILES_TABLE);
+ table->files = kcalloc(this_files, sizeof(struct file *),
+ GFP_KERNEL);
+ if (!table->files)
+ break;
+ nr_files -= this_files;
+ }
+
+ if (i == nr_tables)
+ return 0;
+
+ for (i = 0; i < nr_tables; i++) {
+ struct fixed_file_table *table = &ctx->file_table[i];
+ kfree(table->files);
+ }
+ return 1;
+}
+
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
__s32 __user *fds = (__s32 __user *) arg;
+ unsigned nr_tables;
int fd, ret = 0;
unsigned i;
- if (ctx->user_files)
+ if (ctx->file_table)
return -EBUSY;
if (!nr_args)
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
- ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
- if (!ctx->user_files)
+ nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
+ ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table),
+ GFP_KERNEL);
+ if (!ctx->file_table)
return -ENOMEM;
- for (i = 0; i < nr_args; i++) {
+ if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
+ kfree(ctx->file_table);
+ ctx->file_table = NULL;
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
+ struct fixed_file_table *table;
+ unsigned index;
+
ret = -EFAULT;
if (copy_from_user(&fd, &fds[i], sizeof(fd)))
break;
+ /* allow sparse sets */
+ if (fd == -1) {
+ ret = 0;
+ continue;
+ }
- ctx->user_files[i] = fget(fd);
+ table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+ index = i & IORING_FILE_TABLE_MASK;
+ table->files[index] = fget(fd);
ret = -EBADF;
- if (!ctx->user_files[i])
+ if (!table->files[index])
break;
/*
* Don't allow io_uring instances to be registered. If UNIX
@@ -3048,20 +3673,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
* handle it just fine, but there's still no point in allowing
* a ring fd as it doesn't support regular read/write anyway.
*/
- if (ctx->user_files[i]->f_op == &io_uring_fops) {
- fput(ctx->user_files[i]);
+ if (table->files[index]->f_op == &io_uring_fops) {
+ fput(table->files[index]);
break;
}
- ctx->nr_user_files++;
ret = 0;
}
if (ret) {
- for (i = 0; i < ctx->nr_user_files; i++)
- fput(ctx->user_files[i]);
+ for (i = 0; i < ctx->nr_user_files; i++) {
+ struct file *file;
+
+ file = io_file_from_index(ctx, i);
+ if (file)
+ fput(file);
+ }
+ for (i = 0; i < nr_tables; i++)
+ kfree(ctx->file_table[i].files);
- kfree(ctx->user_files);
- ctx->user_files = NULL;
+ kfree(ctx->file_table);
+ ctx->file_table = NULL;
ctx->nr_user_files = 0;
return ret;
}
@@ -3073,9 +3704,201 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
+static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
+{
+#if defined(CONFIG_UNIX)
+ struct file *file = io_file_from_index(ctx, index);
+ struct sock *sock = ctx->ring_sock->sk;
+ struct sk_buff_head list, *head = &sock->sk_receive_queue;
+ struct sk_buff *skb;
+ int i;
+
+ __skb_queue_head_init(&list);
+
+ /*
+ * Find the skb that holds this file in its SCM_RIGHTS. When found,
+ * remove this entry and rearrange the file array.
+ */
+ skb = skb_dequeue(head);
+ while (skb) {
+ struct scm_fp_list *fp;
+
+ fp = UNIXCB(skb).fp;
+ for (i = 0; i < fp->count; i++) {
+ int left;
+
+ if (fp->fp[i] != file)
+ continue;
+
+ unix_notinflight(fp->user, fp->fp[i]);
+ left = fp->count - 1 - i;
+ if (left) {
+ memmove(&fp->fp[i], &fp->fp[i + 1],
+ left * sizeof(struct file *));
+ }
+ fp->count--;
+ if (!fp->count) {
+ kfree_skb(skb);
+ skb = NULL;
+ } else {
+ __skb_queue_tail(&list, skb);
+ }
+ fput(file);
+ file = NULL;
+ break;
+ }
+
+ if (!file)
+ break;
+
+ __skb_queue_tail(&list, skb);
+
+ skb = skb_dequeue(head);
+ }
+
+ if (skb_peek(&list)) {
+ spin_lock_irq(&head->lock);
+ while ((skb = __skb_dequeue(&list)) != NULL)
+ __skb_queue_tail(head, skb);
+ spin_unlock_irq(&head->lock);
+ }
+#else
+ fput(io_file_from_index(ctx, index));
+#endif
+}
+
+static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
+ int index)
+{
+#if defined(CONFIG_UNIX)
+ struct sock *sock = ctx->ring_sock->sk;
+ struct sk_buff_head *head = &sock->sk_receive_queue;
+ struct sk_buff *skb;
+
+ /*
+ * See if we can merge this file into an existing skb SCM_RIGHTS
+ * file set. If there's no room, fall back to allocating a new skb
+ * and filling it in.
+ */
+ spin_lock_irq(&head->lock);
+ skb = skb_peek(head);
+ if (skb) {
+ struct scm_fp_list *fpl = UNIXCB(skb).fp;
+
+ if (fpl->count < SCM_MAX_FD) {
+ __skb_unlink(skb, head);
+ spin_unlock_irq(&head->lock);
+ fpl->fp[fpl->count] = get_file(file);
+ unix_inflight(fpl->user, fpl->fp[fpl->count]);
+ fpl->count++;
+ spin_lock_irq(&head->lock);
+ __skb_queue_head(head, skb);
+ } else {
+ skb = NULL;
+ }
+ }
+ spin_unlock_irq(&head->lock);
+
+ if (skb) {
+ fput(file);
+ return 0;
+ }
+
+ return __io_sqe_files_scm(ctx, 1, index);
+#else
+ return 0;
+#endif
+}
+
+static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct io_uring_files_update up;
+ __s32 __user *fds;
+ int fd, i, err;
+ __u32 done;
+
+ if (!ctx->file_table)
+ return -ENXIO;
+ if (!nr_args)
+ return -EINVAL;
+ if (copy_from_user(&up, arg, sizeof(up)))
+ return -EFAULT;
+ if (check_add_overflow(up.offset, nr_args, &done))
+ return -EOVERFLOW;
+ if (done > ctx->nr_user_files)
+ return -EINVAL;
+
+ done = 0;
+ fds = (__s32 __user *) up.fds;
+ while (nr_args) {
+ struct fixed_file_table *table;
+ unsigned index;
+
+ err = 0;
+ if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
+ err = -EFAULT;
+ break;
+ }
+ i = array_index_nospec(up.offset, ctx->nr_user_files);
+ table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+ index = i & IORING_FILE_TABLE_MASK;
+ if (table->files[index]) {
+ io_sqe_file_unregister(ctx, i);
+ table->files[index] = NULL;
+ }
+ if (fd != -1) {
+ struct file *file;
+
+ file = fget(fd);
+ if (!file) {
+ err = -EBADF;
+ break;
+ }
+ /*
+ * Don't allow io_uring instances to be registered. If
+ * UNIX isn't enabled, then this causes a reference
+ * cycle and this instance can never get freed. If UNIX
+ * is enabled we'll handle it just fine, but there's
+ * still no point in allowing a ring fd as it doesn't
+ * support regular read/write anyway.
+ */
+ if (file->f_op == &io_uring_fops) {
+ fput(file);
+ err = -EBADF;
+ break;
+ }
+ table->files[index] = file;
+ err = io_sqe_file_register(ctx, file, i);
+ if (err)
+ break;
+ }
+ nr_args--;
+ done++;
+ up.offset++;
+ }
+
+ return done ? done : err;
+}
+
+static void io_put_work(struct io_wq_work *work)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+ io_put_req(req);
+}
+
+static void io_get_work(struct io_wq_work *work)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+ refcount_inc(&req->refs);
+}
+
static int io_sq_offload_start(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
+ unsigned concurrency;
int ret;
init_waitqueue_head(&ctx->sqo_wait);
@@ -3119,26 +3942,13 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
goto err;
}
- /* Do QD, or 2 * CPUS, whatever is smallest */
- ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
- WQ_UNBOUND | WQ_FREEZABLE,
- min(ctx->sq_entries - 1, 2 * num_online_cpus()));
- if (!ctx->sqo_wq[0]) {
- ret = -ENOMEM;
- goto err;
- }
-
- /*
- * This is for buffered writes, where we want to limit the parallelism
- * due to file locking in file systems. As "normal" buffered writes
- * should parellelize on writeout quite nicely, limit us to having 2
- * pending. This avoids massive contention on the inode when doing
- * buffered async writes.
- */
- ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
- WQ_UNBOUND | WQ_FREEZABLE, 2);
- if (!ctx->sqo_wq[1]) {
- ret = -ENOMEM;
+ /* Do QD, or 4 * CPUS, whatever is smallest */
+ concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
+ ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user,
+ io_get_work, io_put_work);
+ if (IS_ERR(ctx->io_wq)) {
+ ret = PTR_ERR(ctx->io_wq);
+ ctx->io_wq = NULL;
goto err;
}
@@ -3484,6 +4294,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_unaccount_mem(ctx->user,
ring_pages(ctx->sq_entries, ctx->cq_entries));
free_uid(ctx->user);
+ kfree(ctx->completions);
+ kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx);
}
@@ -3522,8 +4334,15 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
io_kill_timeouts(ctx);
io_poll_remove_all(ctx);
+
+ if (ctx->io_wq)
+ io_wq_cancel_all(ctx->io_wq);
+
io_iopoll_reap_events(ctx);
- wait_for_completion(&ctx->ctx_done);
+ /* if we failed setting up the ctx, we might not have any rings */
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true);
+ wait_for_completion(&ctx->completions[0]);
io_ring_ctx_free(ctx);
}
@@ -3536,6 +4355,53 @@ static int io_uring_release(struct inode *inode, struct file *file)
return 0;
}
+static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+ struct files_struct *files)
+{
+ struct io_kiocb *req;
+ DEFINE_WAIT(wait);
+
+ while (!list_empty_careful(&ctx->inflight_list)) {
+ struct io_kiocb *cancel_req = NULL;
+
+ spin_lock_irq(&ctx->inflight_lock);
+ list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
+ if (req->work.files != files)
+ continue;
+ /* req is being completed, ignore */
+ if (!refcount_inc_not_zero(&req->refs))
+ continue;
+ cancel_req = req;
+ break;
+ }
+ if (cancel_req)
+ prepare_to_wait(&ctx->inflight_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&ctx->inflight_lock);
+
+ /* We need to keep going until we don't find a matching req */
+ if (!cancel_req)
+ break;
+
+ io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
+ io_put_req(cancel_req);
+ schedule();
+ }
+ finish_wait(&ctx->inflight_wait, &wait);
+}
+
+static int io_uring_flush(struct file *file, void *data)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+
+ io_uring_cancel_files(ctx, data);
+ if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
+ io_cqring_overflow_flush(ctx, true);
+ io_wq_cancel_all(ctx->io_wq);
+ }
+ return 0;
+}
+
static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
@@ -3597,25 +4463,20 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
+ if (!list_empty_careful(&ctx->cq_overflow_list))
+ io_cqring_overflow_flush(ctx, false);
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sqo_wait);
submitted = to_submit;
} else if (to_submit) {
- bool block_for_last = false;
+ struct mm_struct *cur_mm;
to_submit = min(to_submit, ctx->sq_entries);
-
- /*
- * Allow last submission to block in a series, IFF the caller
- * asked to wait for events and we don't currently have
- * enough. This potentially avoids an async punt.
- */
- if (to_submit == min_complete &&
- io_cqring_events(ctx->rings) < min_complete)
- block_for_last = true;
-
mutex_lock(&ctx->uring_lock);
- submitted = io_ring_submit(ctx, to_submit, block_for_last);
+ /* already have mm, so io_submit_sqes() won't try to grab it */
+ cur_mm = ctx->sqo_mm;
+ submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
+ &cur_mm, false);
mutex_unlock(&ctx->uring_lock);
}
if (flags & IORING_ENTER_GETEVENTS) {
@@ -3630,7 +4491,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
}
}
- io_ring_drop_ctx_refs(ctx, 1);
+ percpu_ref_put(&ctx->refs);
out_fput:
fdput(f);
return submitted ? submitted : ret;
@@ -3638,6 +4499,7 @@ out_fput:
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
+ .flush = io_uring_flush,
.mmap = io_uring_mmap,
.poll = io_uring_poll,
.fasync = io_uring_fasync,
@@ -3737,10 +4599,23 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
* Use twice as many entries for the CQ ring. It's possible for the
* application to drive a higher depth than the size of the SQ ring,
* since the sqes are only used at submission time. This allows for
- * some flexibility in overcommitting a bit.
+ * some flexibility in overcommitting a bit. If the application has
+ * set IORING_SETUP_CQSIZE, it will have passed in the desired number
+ * of CQ ring entries manually.
*/
p->sq_entries = roundup_pow_of_two(entries);
- p->cq_entries = 2 * p->sq_entries;
+ if (p->flags & IORING_SETUP_CQSIZE) {
+ /*
+ * If IORING_SETUP_CQSIZE is set, we do the same roundup
+ * to a power-of-two, if it isn't already. We do NOT impose
+ * any cq vs sq ring sizing.
+ */
+ if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
+ return -EINVAL;
+ p->cq_entries = roundup_pow_of_two(p->cq_entries);
+ } else {
+ p->cq_entries = 2 * p->sq_entries;
+ }
user = get_uid(current_user());
account_mem = !capable(CAP_IPC_LOCK);
@@ -3774,10 +4649,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
if (ret)
goto err;
- ret = io_uring_get_fd(ctx);
- if (ret < 0)
- goto err;
-
memset(&p->sq_off, 0, sizeof(p->sq_off));
p->sq_off.head = offsetof(struct io_rings, sq.head);
p->sq_off.tail = offsetof(struct io_rings, sq.tail);
@@ -3795,7 +4666,16 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
p->cq_off.cqes = offsetof(struct io_rings, cqes);
- p->features = IORING_FEAT_SINGLE_MMAP;
+ /*
+ * Install ring fd as the very last thing, so we don't risk someone
+ * having closed it before we finish setup
+ */
+ ret = io_uring_get_fd(ctx);
+ if (ret < 0)
+ goto err;
+
+ p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP;
+ trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
io_ring_ctx_wait_and_kill(ctx);
@@ -3821,7 +4701,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
}
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
- IORING_SETUP_SQ_AFF))
+ IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
return -EINVAL;
ret = io_uring_create(entries, &p);
@@ -3865,7 +4745,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
* no new references will come in after we've killed the percpu ref.
*/
mutex_unlock(&ctx->uring_lock);
- wait_for_completion(&ctx->ctx_done);
+ wait_for_completion(&ctx->completions[0]);
mutex_lock(&ctx->uring_lock);
switch (opcode) {
@@ -3887,6 +4767,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_sqe_files_unregister(ctx);
break;
+ case IORING_REGISTER_FILES_UPDATE:
+ ret = io_sqe_files_update(ctx, arg, nr_args);
+ break;
case IORING_REGISTER_EVENTFD:
ret = -EINVAL;
if (nr_args != 1)
@@ -3905,7 +4788,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
}
/* bring the ctx back to life */
- reinit_completion(&ctx->ctx_done);
+ reinit_completion(&ctx->completions[0]);
percpu_ref_reinit(&ctx->refs);
return ret;
}
@@ -3930,6 +4813,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
+ trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
+ ctx->cq_ev_fd != NULL, ret);
out_fput:
fdput(f);
return ret;
diff --git a/fs/libfs.c b/fs/libfs.c
index c9b2850c0f7c..1463b038ffc4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -89,58 +89,45 @@ int dcache_dir_close(struct inode *inode, struct file *file)
EXPORT_SYMBOL(dcache_dir_close);
/* parent is locked at least shared */
-static struct dentry *next_positive(struct dentry *parent,
- struct list_head *from,
- int count)
+/*
+ * Returns an element of siblings' list.
+ * We are looking for <count>th positive after <p>; if
+ * found, dentry is grabbed and returned to caller.
+ * If no such element exists, NULL is returned.
+ */
+static struct dentry *scan_positives(struct dentry *cursor,
+ struct list_head *p,
+ loff_t count,
+ struct dentry *last)
{
- unsigned *seq = &parent->d_inode->i_dir_seq, n;
- struct dentry *res;
- struct list_head *p;
- bool skipped;
- int i;
+ struct dentry *dentry = cursor->d_parent, *found = NULL;
-retry:
- i = count;
- skipped = false;
- n = smp_load_acquire(seq) & ~1;
- res = NULL;
- rcu_read_lock();
- for (p = from->next; p != &parent->d_subdirs; p = p->next) {
+ spin_lock(&dentry->d_lock);
+ while ((p = p->next) != &dentry->d_subdirs) {
struct dentry *d = list_entry(p, struct dentry, d_child);
- if (!simple_positive(d)) {
- skipped = true;
- } else if (!--i) {
- res = d;
- break;
+ // we must at least skip cursors, to avoid livelocks
+ if (d->d_flags & DCACHE_DENTRY_CURSOR)
+ continue;
+ if (simple_positive(d) && !--count) {
+ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+ if (simple_positive(d))
+ found = dget_dlock(d);
+ spin_unlock(&d->d_lock);
+ if (likely(found))
+ break;
+ count = 1;
+ }
+ if (need_resched()) {
+ list_move(&cursor->d_child, p);
+ p = &cursor->d_child;
+ spin_unlock(&dentry->d_lock);
+ cond_resched();
+ spin_lock(&dentry->d_lock);
}
}
- rcu_read_unlock();
- if (skipped) {
- smp_rmb();
- if (unlikely(*seq != n))
- goto retry;
- }
- return res;
-}
-
-static void move_cursor(struct dentry *cursor, struct list_head *after)
-{
- struct dentry *parent = cursor->d_parent;
- unsigned n, *seq = &parent->d_inode->i_dir_seq;
- spin_lock(&parent->d_lock);
- for (;;) {
- n = *seq;
- if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
- break;
- cpu_relax();
- }
- __list_del(cursor->d_child.prev, cursor->d_child.next);
- if (after)
- list_add(&cursor->d_child, after);
- else
- list_add_tail(&cursor->d_child, &parent->d_subdirs);
- smp_store_release(seq, n + 2);
- spin_unlock(&parent->d_lock);
+ spin_unlock(&dentry->d_lock);
+ dput(last);
+ return found;
}
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
@@ -158,17 +145,25 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
if (offset != file->f_pos) {
+ struct dentry *cursor = file->private_data;
+ struct dentry *to = NULL;
+
+ inode_lock_shared(dentry->d_inode);
+
+ if (offset > 2)
+ to = scan_positives(cursor, &dentry->d_subdirs,
+ offset - 2, NULL);
+ spin_lock(&dentry->d_lock);
+ if (to)
+ list_move(&cursor->d_child, &to->d_child);
+ else
+ list_del_init(&cursor->d_child);
+ spin_unlock(&dentry->d_lock);
+ dput(to);
+
file->f_pos = offset;
- if (file->f_pos >= 2) {
- struct dentry *cursor = file->private_data;
- struct dentry *to;
- loff_t n = file->f_pos - 2;
-
- inode_lock_shared(dentry->d_inode);
- to = next_positive(dentry, &dentry->d_subdirs, n);
- move_cursor(cursor, to ? &to->d_child : NULL);
- inode_unlock_shared(dentry->d_inode);
- }
+
+ inode_unlock_shared(dentry->d_inode);
}
return offset;
}
@@ -190,25 +185,35 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct dentry *cursor = file->private_data;
- struct list_head *p = &cursor->d_child;
- struct dentry *next;
- bool moved = false;
+ struct list_head *anchor = &dentry->d_subdirs;
+ struct dentry *next = NULL;
+ struct list_head *p;
if (!dir_emit_dots(file, ctx))
return 0;
if (ctx->pos == 2)
- p = &dentry->d_subdirs;
- while ((next = next_positive(dentry, p, 1)) != NULL) {
+ p = anchor;
+ else if (!list_empty(&cursor->d_child))
+ p = &cursor->d_child;
+ else
+ return 0;
+
+ while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
d_inode(next)->i_ino, dt_type(d_inode(next))))
break;
- moved = true;
- p = &next->d_child;
ctx->pos++;
+ p = &next->d_child;
}
- if (moved)
- move_cursor(cursor, p);
+ spin_lock(&dentry->d_lock);
+ if (next)
+ list_move_tail(&cursor->d_child, &next->d_child);
+ else
+ list_del_init(&cursor->d_child);
+ spin_unlock(&dentry->d_lock);
+ dput(next);
+
return 0;
}
EXPORT_SYMBOL(dcache_readdir);
@@ -468,8 +473,7 @@ EXPORT_SYMBOL(simple_write_begin);
/**
* simple_write_end - .write_end helper for non-block-device FSes
- * @available: See .write_end of address_space_operations
- * @file: "
+ * @file: See .write_end of address_space_operations
* @mapping: "
* @pos: "
* @len: "
diff --git a/fs/namespace.c b/fs/namespace.c
index fe0e9e1410fe..2adfe7b166a3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2478,8 +2478,10 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
time64_to_tm(sb->s_time_max, 0, &tm);
- pr_warn("Mounted %s file system at %s supports timestamps until %04ld (0x%llx)\n",
- sb->s_type->name, mntpath,
+ pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n",
+ sb->s_type->name,
+ is_mounted(mnt) ? "remounted" : "mounted",
+ mntpath,
tm.tm_year+1900, (unsigned long long)sb->s_time_max);
free_page((unsigned long)buf);
@@ -2764,14 +2766,11 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
if (IS_ERR(mnt))
return PTR_ERR(mnt);
- error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
- if (error < 0) {
- mntput(mnt);
- return error;
- }
-
mnt_warn_timestamp_expiry(mountpoint, mnt);
+ error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
+ if (error < 0)
+ mntput(mnt);
return error;
}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 071b90a45933..af549d70ec50 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -53,6 +53,16 @@ nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
return false;
}
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (nfs4_is_valid_delegation(delegation, 0))
+ return delegation;
+ return NULL;
+}
+
static int
nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
{
@@ -1181,7 +1191,7 @@ bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
if (delegation != NULL &&
nfs4_stateid_match_other(dst, &delegation->stateid)) {
dst->seqid = delegation->stateid.seqid;
- return ret;
+ ret = true;
}
rcu_read_unlock();
out:
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 9eb87ae4c982..8b14d441e699 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -68,6 +68,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state,
bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred);
bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
int nfs4_check_delegation(struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 222d7115db71..040a50fd9bf3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -64,13 +64,6 @@
static struct kmem_cache *nfs_direct_cachep;
-/*
- * This represents a set of asynchronous requests that we're waiting on
- */
-struct nfs_direct_mirror {
- ssize_t count;
-};
-
struct nfs_direct_req {
struct kref kref; /* release manager */
@@ -84,9 +77,6 @@ struct nfs_direct_req {
atomic_t io_count; /* i/os we're waiting for */
spinlock_t lock; /* protect completion state */
- struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
- int mirror_count;
-
loff_t io_start; /* Start offset for I/O */
ssize_t count, /* bytes actually processed */
max_count, /* max expected count */
@@ -123,32 +113,42 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
}
static void
-nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
+nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
+ const struct nfs_pgio_header *hdr,
+ ssize_t dreq_len)
{
- int i;
- ssize_t count;
+ if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
+ test_bit(NFS_IOHDR_EOF, &hdr->flags)))
+ return;
+ if (dreq->max_count >= dreq_len) {
+ dreq->max_count = dreq_len;
+ if (dreq->count > dreq_len)
+ dreq->count = dreq_len;
+
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
+ dreq->error = hdr->error;
+ else /* Clear outstanding error if this is EOF */
+ dreq->error = 0;
+ }
+}
- WARN_ON_ONCE(dreq->count >= dreq->max_count);
+static void
+nfs_direct_count_bytes(struct nfs_direct_req *dreq,
+ const struct nfs_pgio_header *hdr)
+{
+ loff_t hdr_end = hdr->io_start + hdr->good_bytes;
+ ssize_t dreq_len = 0;
- if (dreq->mirror_count == 1) {
- dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
- dreq->count += hdr->good_bytes;
- } else {
- /* mirrored writes */
- count = dreq->mirrors[hdr->pgio_mirror_idx].count;
- if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
- count = hdr->io_start + hdr->good_bytes - dreq->io_start;
- dreq->mirrors[hdr->pgio_mirror_idx].count = count;
- }
- /* update the dreq->count by finding the minimum agreed count from all
- * mirrors */
- count = dreq->mirrors[0].count;
+ if (hdr_end > dreq->io_start)
+ dreq_len = hdr_end - dreq->io_start;
- for (i = 1; i < dreq->mirror_count; i++)
- count = min(count, dreq->mirrors[i].count);
+ nfs_direct_handle_truncated(dreq, hdr, dreq_len);
- dreq->count = count;
- }
+ if (dreq_len > dreq->max_count)
+ dreq_len = dreq->max_count;
+
+ if (dreq->count < dreq_len)
+ dreq->count = dreq_len;
}
/*
@@ -293,18 +293,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}
-static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
- struct nfs_pageio_descriptor *pgio,
- struct nfs_page *req)
-{
- int mirror_count = 1;
-
- if (pgio->pg_ops->pg_get_mirror_count)
- mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
-
- dreq->mirror_count = mirror_count;
-}
-
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
struct nfs_direct_req *dreq;
@@ -319,7 +307,6 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
INIT_LIST_HEAD(&dreq->mds_cinfo.list);
dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
- dreq->mirror_count = 1;
spin_lock_init(&dreq->lock);
return dreq;
@@ -402,20 +389,12 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
struct nfs_direct_req *dreq = hdr->dreq;
spin_lock(&dreq->lock);
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
- dreq->error = hdr->error;
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
spin_unlock(&dreq->lock);
goto out_put;
}
- if (hdr->good_bytes != 0)
- nfs_direct_good_bytes(dreq, hdr);
-
- if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
- dreq->error = 0;
-
+ nfs_direct_count_bytes(dreq, hdr);
spin_unlock(&dreq->lock);
while (!list_empty(&hdr->pages)) {
@@ -646,29 +625,22 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
LIST_HEAD(reqs);
struct nfs_commit_info cinfo;
LIST_HEAD(failed);
- int i;
nfs_init_cinfo_from_dreq(&cinfo, dreq);
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
dreq->count = 0;
+ dreq->max_count = 0;
+ list_for_each_entry(req, &reqs, wb_list)
+ dreq->max_count += req->wb_bytes;
dreq->verf.committed = NFS_INVALID_STABLE_HOW;
nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
- for (i = 0; i < dreq->mirror_count; i++)
- dreq->mirrors[i].count = 0;
get_dreq(dreq);
nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
- req = nfs_list_entry(reqs.next);
- nfs_direct_setup_mirroring(dreq, &desc, req);
- if (desc.pg_error < 0) {
- list_splice_init(&reqs, &failed);
- goto out_failed;
- }
-
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
/* Bump the transmission count */
req->wb_nio++;
@@ -686,7 +658,6 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
}
nfs_pageio_complete(&desc);
-out_failed:
while (!list_empty(&failed)) {
req = nfs_list_entry(failed.next);
nfs_list_remove_request(req);
@@ -791,17 +762,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
nfs_init_cinfo_from_dreq(&cinfo, dreq);
spin_lock(&dreq->lock);
-
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
- dreq->error = hdr->error;
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
spin_unlock(&dreq->lock);
goto out_put;
}
+ nfs_direct_count_bytes(dreq, hdr);
if (hdr->good_bytes != 0) {
- nfs_direct_good_bytes(dreq, hdr);
if (nfs_write_need_commit(hdr)) {
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
request_commit = true;
@@ -923,7 +890,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
break;
}
- nfs_direct_setup_mirroring(dreq, &desc, req);
if (desc.pg_error < 0) {
nfs_free_request(req);
result = desc.pg_error;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 11eafcfc490b..caacf5e7f5e1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1440,8 +1440,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
return 0;
if ((delegation->type & fmode) != fmode)
return 0;
- if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
- return 0;
switch (claim) {
case NFS4_OPEN_CLAIM_NULL:
case NFS4_OPEN_CLAIM_FH:
@@ -1810,7 +1808,6 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo
static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
{
struct nfs4_state *state = opendata->state;
- struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs_delegation *delegation;
int open_mode = opendata->o_arg.open_flags;
fmode_t fmode = opendata->o_arg.fmode;
@@ -1827,7 +1824,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
}
spin_unlock(&state->owner->so_lock);
rcu_read_lock();
- delegation = rcu_dereference(nfsi->delegation);
+ delegation = nfs4_get_valid_delegation(state->inode);
if (!can_open_delegated(delegation, fmode, claim)) {
rcu_read_unlock();
break;
@@ -2371,7 +2368,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
data->o_arg.open_flags, claim))
goto out_no_action;
rcu_read_lock();
- delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
+ delegation = nfs4_get_valid_delegation(data->state->inode);
if (can_open_delegated(delegation, data->o_arg.fmode, claim))
goto unlock_no_action;
rcu_read_unlock();
@@ -6106,6 +6103,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
status = nfs4_call_sync_custom(&task_setup_data);
if (setclientid.sc_cred) {
+ kfree(clp->cl_acceptor);
clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
put_rpccred(setclientid.sc_cred);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 85ca49549b39..52cab65f91cf 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -786,7 +786,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_page *head;
- atomic_long_dec(&nfsi->nrequests);
if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
head = req->wb_head;
@@ -799,8 +798,10 @@ static void nfs_inode_remove_request(struct nfs_page *req)
spin_unlock(&mapping->private_lock);
}
- if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
+ if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
nfs_release_request(req);
+ atomic_long_dec(&nfsi->nrequests);
+ }
}
static void
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8de1c9d644f6..9cd0a6815933 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2049,7 +2049,8 @@ out_write_size:
inode->i_mtime = inode->i_ctime = current_time(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ if (handle)
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
}
if (handle)
ocfs2_journal_dirty(handle, wc->w_di_bh);
@@ -2146,13 +2147,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_dio_write_ctxt *dwc = NULL;
struct buffer_head *di_bh = NULL;
u64 p_blkno;
- loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+ loff_t pos = iblock << i_blkbits;
+ sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
unsigned len, total_len = bh_result->b_size;
int ret = 0, first_get_block = 0;
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
len = min(total_len, len);
+ /*
+ * bh_result->b_size is count in get_more_blocks according to write
+ * "pos" and "end", we need map twice to return different buffer state:
+ * 1. area in file size, not set NEW;
+ * 2. area out file size, set NEW.
+ *
+ * iblock endblk
+ * |--------|---------|---------|---------
+ * |<-------area in file------->|
+ */
+
+ if ((iblock <= endblk) &&
+ ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+ len = (endblk - iblock + 1) << i_blkbits;
+
mlog(0, "get block of %lu at %llu:%u req %u\n",
inode->i_ino, pos, len, total_len);
@@ -2236,6 +2254,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
if (desc->c_needs_zero)
set_buffer_new(bh_result);
+ if (iblock > endblk)
+ set_buffer_new(bh_result);
+
/* May sleep in end_io. It should not happen in a irq context. So defer
* it to dio work queue. */
set_buffer_defer_completion(bh_result);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2e982db3e1ae..9876db52913a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1230,6 +1230,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
if (IS_ERR(transfer_to[USRQUOTA])) {
status = PTR_ERR(transfer_to[USRQUOTA]);
+ transfer_to[USRQUOTA] = NULL;
goto bail_unlock;
}
}
@@ -1239,6 +1240,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
if (IS_ERR(transfer_to[GRPQUOTA])) {
status = PTR_ERR(transfer_to[GRPQUOTA]);
+ transfer_to[GRPQUOTA] = NULL;
goto bail_unlock;
}
}
@@ -2096,53 +2098,89 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
return 0;
}
-static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
- struct file *file,
- loff_t pos, size_t count,
- int *meta_level)
+static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
+ struct buffer_head **di_bh,
+ int meta_level,
+ int overwrite_io,
+ int write_sem,
+ int wait)
{
- int ret;
- struct buffer_head *di_bh = NULL;
- u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
- u32 clusters =
- ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+ int ret = 0;
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret) {
- mlog_errno(ret);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, NULL, meta_level);
+ else
+ ret = ocfs2_try_inode_lock(inode,
+ overwrite_io ? NULL : di_bh, meta_level);
+ if (ret < 0)
goto out;
+
+ if (wait) {
+ if (write_sem)
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ } else {
+ if (write_sem)
+ ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (!ret) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
}
- *meta_level = 1;
+ return ret;
- ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
- if (ret)
- mlog_errno(ret);
+out_unlock:
+ brelse(*di_bh);
+ ocfs2_inode_unlock(inode, meta_level);
out:
- brelse(di_bh);
return ret;
}
+static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
+ struct buffer_head **di_bh,
+ int meta_level,
+ int write_sem)
+{
+ if (write_sem)
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ brelse(*di_bh);
+ *di_bh = NULL;
+
+ if (meta_level >= 0)
+ ocfs2_inode_unlock(inode, meta_level);
+}
+
static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t pos, size_t count, int wait)
{
int ret = 0, meta_level = 0, overwrite_io = 0;
+ int write_sem = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
struct buffer_head *di_bh = NULL;
+ u32 cpos;
+ u32 clusters;
/*
* We start with a read level meta lock and only jump to an ex
* if we need to make modifications here.
*/
for(;;) {
- if (wait)
- ret = ocfs2_inode_lock(inode, NULL, meta_level);
- else
- ret = ocfs2_try_inode_lock(inode,
- overwrite_io ? NULL : &di_bh, meta_level);
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ overwrite_io,
+ write_sem,
+ wait);
if (ret < 0) {
- meta_level = -1;
if (ret != -EAGAIN)
mlog_errno(ret);
goto out;
@@ -2154,15 +2192,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
*/
if (!wait && !overwrite_io) {
overwrite_io = 1;
- if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
- ret = -EAGAIN;
- goto out_unlock;
- }
ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
- brelse(di_bh);
- di_bh = NULL;
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret < 0) {
if (ret != -EAGAIN)
mlog_errno(ret);
@@ -2181,7 +2212,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* set inode->i_size at the end of a write. */
if (should_remove_suid(dentry)) {
if (meta_level == 0) {
- ocfs2_inode_unlock(inode, meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
meta_level = 1;
continue;
}
@@ -2195,18 +2229,32 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
ret = ocfs2_check_range_for_refcount(inode, pos, count);
if (ret == 1) {
- ocfs2_inode_unlock(inode, meta_level);
- meta_level = -1;
-
- ret = ocfs2_prepare_inode_for_refcount(inode,
- file,
- pos,
- count,
- &meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ overwrite_io,
+ 1,
+ wait);
+ write_sem = 1;
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ clusters =
+ ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+ ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
}
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto out_unlock;
}
@@ -2217,10 +2265,10 @@ out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
pos, count, wait);
- brelse(di_bh);
-
- if (meta_level >= 0)
- ocfs2_inode_unlock(inode, meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
out:
return ret;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index d6f7b299eb23..efeea208fdeb 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -283,7 +283,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
if (inode_alloc)
inode_lock(inode_alloc);
- if (o2info_coherent(&fi->ifi_req)) {
+ if (inode_alloc && o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 930e3d388579..699a560efbb0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -217,7 +217,8 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
/* At this point, we know that no more recovery threads can be
* launched, so wait for any recovery completion work to
* complete. */
- flush_workqueue(osb->ocfs2_wq);
+ if (osb->ocfs2_wq)
+ flush_workqueue(osb->ocfs2_wq);
/*
* Now that recovery is shut down, and the osb is about to be
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 158e5af767fd..720e9f94957e 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -377,7 +377,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc = NULL;
cancel_delayed_work(&osb->la_enable_wq);
- flush_workqueue(osb->ocfs2_wq);
+ if (osb->ocfs2_wq)
+ flush_workqueue(osb->ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
diff --git a/fs/open.c b/fs/open.c
index b62f5c0923a8..5c68282ea79e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -771,10 +771,6 @@ static int do_dentry_open(struct file *f,
f->f_mode |= FMODE_WRITER;
}
- /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
- if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
- f->f_mode |= FMODE_ATOMIC_POS;
-
f->f_op = fops_get(inode->i_fop);
if (WARN_ON(!f->f_op)) {
error = -ENODEV;
@@ -1256,7 +1252,7 @@ EXPORT_SYMBOL(nonseekable_open);
*/
int stream_open(struct inode *inode, struct file *filp)
{
- filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
+ filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
filp->f_mode |= FMODE_STREAM;
return 0;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 8a2ab2f974bd..a9149199e0e7 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -793,6 +793,8 @@ int create_pipe_files(struct file **res, int flags)
}
res[0]->private_data = inode->i_pipe;
res[1] = f;
+ stream_open(inode, res[0]);
+ stream_open(inode, res[1]);
return 0;
}
@@ -931,9 +933,9 @@ static int fifo_open(struct inode *inode, struct file *filp)
__pipe_lock(pipe);
/* We can only do regular read/write on fifos */
- filp->f_mode &= (FMODE_READ | FMODE_WRITE);
+ stream_open(inode, filp);
- switch (filp->f_mode) {
+ switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
case FMODE_READ:
/*
* O_RDONLY
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index ac9247371871..8c1f1bb1a5ce 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -132,9 +132,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
show_val_kb(m, "ShmemPmdMapped: ",
global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
- show_val_kb(m, "FileHugePages: ",
+ show_val_kb(m, "FileHugePages: ",
global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
- show_val_kb(m, "FilePmdMapped: ",
+ show_val_kb(m, "FilePmdMapped: ",
global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
#endif
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 544d1ee15aee..7c952ee732e6 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -42,10 +42,12 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
return -EINVAL;
while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
+
if (!ppage || PageSlab(ppage) || page_has_type(ppage))
pcount = 0;
else
@@ -216,10 +218,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
return -EINVAL;
while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
if (put_user(stable_page_flags(ppage), out)) {
ret = -EFAULT;
@@ -261,10 +264,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
return -EINVAL;
while (count > 0) {
- if (pfn_valid(pfn))
- ppage = pfn_to_page(pfn);
- else
- ppage = NULL;
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
if (ppage)
ino = page_cgroup_ino(ppage);
diff --git a/fs/readdir.c b/fs/readdir.c
index 19bea591c3f1..d26d5ea4de7b 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -27,53 +27,13 @@
/*
* Note the "unsafe_put_user() semantics: we goto a
* label for errors.
- *
- * Also note how we use a "while()" loop here, even though
- * only the biggest size needs to loop. The compiler (well,
- * at least gcc) is smart enough to turn the smaller sizes
- * into just if-statements, and this way we don't need to
- * care whether 'u64' or 'u32' is the biggest size.
- */
-#define unsafe_copy_loop(dst, src, len, type, label) \
- while (len >= sizeof(type)) { \
- unsafe_put_user(get_unaligned((type *)src), \
- (type __user *)dst, label); \
- dst += sizeof(type); \
- src += sizeof(type); \
- len -= sizeof(type); \
- }
-
-/*
- * We avoid doing 64-bit copies on 32-bit architectures. They
- * might be better, but the component names are mostly small,
- * and the 64-bit cases can end up being much more complex and
- * put much more register pressure on the code, so it's likely
- * not worth the pain of unaligned accesses etc.
- *
- * So limit the copies to "unsigned long" size. I did verify
- * that at least the x86-32 case is ok without this limiting,
- * but I worry about random other legacy 32-bit cases that
- * might not do as well.
- */
-#define unsafe_copy_type(dst, src, len, type, label) do { \
- if (sizeof(type) <= sizeof(unsigned long)) \
- unsafe_copy_loop(dst, src, len, type, label); \
-} while (0)
-
-/*
- * Copy the dirent name to user space, and NUL-terminate
- * it. This should not be a function call, since we're doing
- * the copy inside a "user_access_begin/end()" section.
*/
#define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \
char __user *dst = (_dst); \
const char *src = (_src); \
size_t len = (_len); \
- unsafe_copy_type(dst, src, len, u64, label); \
- unsafe_copy_type(dst, src, len, u32, label); \
- unsafe_copy_type(dst, src, len, u16, label); \
- unsafe_copy_type(dst, src, len, u8, label); \
- unsafe_put_user(0, dst, label); \
+ unsafe_put_user(0, dst+len, label); \
+ unsafe_copy_to_user(dst, src, len, label); \
} while (0)
@@ -145,9 +105,9 @@ EXPORT_SYMBOL(iterate_dir);
*/
static int verify_dirent_name(const char *name, int len)
{
- if (WARN_ON_ONCE(!len))
+ if (!len)
return -EIO;
- if (WARN_ON_ONCE(memchr(name, '/', len)))
+ if (memchr(name, '/', len))
return -EIO;
return 0;
}
diff --git a/fs/super.c b/fs/super.c
index f627b7c53d2b..cfadab2cbf35 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1300,6 +1300,7 @@ int get_tree_bdev(struct fs_context *fc,
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (bdev->bd_fsfreeze_count > 0) {
mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ blkdev_put(bdev, mode);
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
return -EBUSY;
}
@@ -1308,8 +1309,10 @@ int get_tree_bdev(struct fs_context *fc,
fc->sget_key = bdev;
s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
- if (IS_ERR(s))
+ if (IS_ERR(s)) {
+ blkdev_put(bdev, mode);
return PTR_ERR(s);
+ }
if (s->s_root) {
/* Don't summarily change the RO/RW state. */
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 9fc14e38927f..0caa151cae4e 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -16,11 +16,11 @@
#include <linux/namei.h>
#include <linux/tracefs.h>
#include <linux/fsnotify.h>
+#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
-#include <linux/security.h>
#define TRACEFS_DEFAULT_MODE 0700
@@ -28,25 +28,6 @@ static struct vfsmount *tracefs_mount;
static int tracefs_mount_count;
static bool tracefs_registered;
-static int default_open_file(struct inode *inode, struct file *filp)
-{
- struct dentry *dentry = filp->f_path.dentry;
- struct file_operations *real_fops;
- int ret;
-
- if (!dentry)
- return -EINVAL;
-
- ret = security_locked_down(LOCKDOWN_TRACEFS);
- if (ret)
- return ret;
-
- real_fops = dentry->d_fsdata;
- if (!real_fops->open)
- return 0;
- return real_fops->open(inode, filp);
-}
-
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -241,12 +222,6 @@ static int tracefs_apply_options(struct super_block *sb)
return 0;
}
-static void tracefs_destroy_inode(struct inode *inode)
-{
- if (S_ISREG(inode->i_mode))
- kfree(inode->i_fop);
-}
-
static int tracefs_remount(struct super_block *sb, int *flags, char *data)
{
int err;
@@ -283,7 +258,6 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root)
static const struct super_operations tracefs_super_operations = {
.statfs = simple_statfs,
.remount_fs = tracefs_remount,
- .destroy_inode = tracefs_destroy_inode,
.show_options = tracefs_show_options,
};
@@ -414,10 +388,12 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
- struct file_operations *proxy_fops;
struct dentry *dentry;
struct inode *inode;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
if (!(mode & S_IFMT))
mode |= S_IFREG;
BUG_ON(!S_ISREG(mode));
@@ -430,20 +406,8 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
if (unlikely(!inode))
return failed_creating(dentry);
- proxy_fops = kzalloc(sizeof(struct file_operations), GFP_KERNEL);
- if (unlikely(!proxy_fops)) {
- iput(inode);
- return failed_creating(dentry);
- }
-
- if (!fops)
- fops = &tracefs_file_operations;
-
- dentry->d_fsdata = (void *)fops;
- memcpy(proxy_fops, fops, sizeof(*proxy_fops));
- proxy_fops->open = default_open_file;
inode->i_mode = mode;
- inode->i_fop = proxy_fops;
+ inode->i_fop = fops ? fops : &tracefs_file_operations;
inode->i_private = data;
d_instantiate(dentry, inode);
fsnotify_create(dentry->d_parent->d_inode, dentry);
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 5de296b34ab1..14fbdf22b7e7 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -28,12 +28,11 @@ xfs_get_aghdr_buf(
struct xfs_mount *mp,
xfs_daddr_t blkno,
size_t numblks,
- int flags,
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
- bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+ bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0);
if (!bp)
return NULL;
@@ -345,7 +344,7 @@ xfs_ag_init_hdr(
{
struct xfs_buf *bp;
- bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops);
+ bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops);
if (!bp)
return -ENOMEM;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index b9f019603d0b..f0089e862216 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -826,32 +826,17 @@ xfs_attr_shortform_to_leaf(
sf = (xfs_attr_shortform_t *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
- xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+ xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
bp = NULL;
error = xfs_da_grow_inode(args, &blkno);
- if (error) {
- /*
- * If we hit an IO error middle of the transaction inside
- * grow_inode(), we may have inconsistent data. Bail out.
- */
- if (error == -EIO)
- goto out;
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
- memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ if (error)
goto out;
- }
ASSERT(blkno == 0);
error = xfs_attr3_leaf_create(args, blkno, &bp);
- if (error) {
- /* xfs_attr3_leaf_create may not have instantiated a block */
- if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0))
- goto out;
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
- memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ if (error)
goto out;
- }
memset((char *)&nargs, 0, sizeof(nargs));
nargs.dp = dp;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 4edc25a2ba80..02469d59c787 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -792,6 +792,7 @@ out_root_realloc:
*/
void
xfs_bmap_local_to_extents_empty(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
int whichfork)
{
@@ -808,6 +809,7 @@ xfs_bmap_local_to_extents_empty(
ifp->if_u1.if_root = NULL;
ifp->if_height = 0;
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -840,7 +842,7 @@ xfs_bmap_local_to_extents(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
if (!ifp->if_bytes) {
- xfs_bmap_local_to_extents_empty(ip, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags = XFS_ILOG_CORE;
goto done;
}
@@ -887,7 +889,7 @@ xfs_bmap_local_to_extents(
/* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
- xfs_bmap_local_to_extents_empty(ip, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags |= XFS_ILOG_CORE;
ifp->if_u1.if_root = NULL;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 5bb446d80542..e2798c6f3a5f 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -182,7 +182,8 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
-void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork);
void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, const struct xfs_owner_info *oinfo,
bool skip_discard);
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 9595ced393dc..49e4bc39e7bb 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -1096,7 +1096,7 @@ xfs_dir2_sf_to_block(
memcpy(sfp, oldsfp, ifp->if_bytes);
xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
- xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
+ xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK);
dp->i_d.di_size = 0;
/*
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 39dd2b908106..e9371a8e0e26 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -366,11 +366,11 @@ struct xfs_bulkstat {
uint64_t bs_blocks; /* number of blocks */
uint64_t bs_xflags; /* extended flags */
- uint64_t bs_atime; /* access time, seconds */
- uint64_t bs_mtime; /* modify time, seconds */
+ int64_t bs_atime; /* access time, seconds */
+ int64_t bs_mtime; /* modify time, seconds */
- uint64_t bs_ctime; /* inode change time, seconds */
- uint64_t bs_btime; /* creation time, seconds */
+ int64_t bs_ctime; /* inode change time, seconds */
+ int64_t bs_btime; /* creation time, seconds */
uint32_t bs_gen; /* generation count */
uint32_t bs_uid; /* user id */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 93b3793bc5b3..0cab11a5d390 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -341,7 +341,6 @@ xchk_refcountbt_rec(
xfs_extlen_t len;
xfs_nlink_t refcount;
bool has_cowflag;
- int error = 0;
bno = be32_to_cpu(rec->refc.rc_startblock);
len = be32_to_cpu(rec->refc.rc_blockcount);
@@ -366,7 +365,7 @@ xchk_refcountbt_rec(
xchk_refcountbt_xref(bs->sc, bno, len, refcount);
- return error;
+ return 0;
}
/* Make sure we have as many refc blocks as the rmap says. */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0910cb75b65d..4f443703065e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -864,6 +864,7 @@ xfs_alloc_file_space(
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
xfs_fileoff_t startoffset_fsb;
+ xfs_fileoff_t endoffset_fsb;
int nimaps;
int quota_flag;
int rt;
@@ -891,7 +892,8 @@ xfs_alloc_file_space(
imapp = &imaps[0];
nimaps = 1;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
- allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+ endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
+ allocatesize_fsb = endoffset_fsb - startoffset_fsb;
/*
* Allocate file space until done or until there is an error
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 21c243622a79..0abba171aa89 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -345,6 +345,15 @@ xfs_buf_allocate_memory(
unsigned short page_count, i;
xfs_off_t start, end;
int error;
+ xfs_km_flags_t kmflag_mask = 0;
+
+ /*
+ * assure zeroed buffer for non-read cases.
+ */
+ if (!(flags & XBF_READ)) {
+ kmflag_mask |= KM_ZERO;
+ gfp_mask |= __GFP_ZERO;
+ }
/*
* for buffers that are contained within a single page, just allocate
@@ -354,7 +363,8 @@ xfs_buf_allocate_memory(
size = BBTOB(bp->b_length);
if (size < PAGE_SIZE) {
int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
- bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS);
+ bp->b_addr = kmem_alloc_io(size, align_mask,
+ KM_NOFS | kmflag_mask);
if (!bp->b_addr) {
/* low memory - use alloc_page loop instead */
goto use_alloc_page;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a2beee9f74da..641d07f30a27 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1443,7 +1443,7 @@ xlog_alloc_log(
prev_iclog = iclog;
iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
- KM_MAYFAIL);
+ KM_MAYFAIL | KM_ZERO);
if (!iclog->ic_data)
goto out_free_iclog;
#ifdef DEBUG
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 508319039dce..c1a514ffff55 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -127,7 +127,7 @@ xlog_alloc_buffer(
if (nbblks > 1 && log->l_sectBBsize > 1)
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL);
+ return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
}
/*