aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/inode.c14
-rw-r--r--fs/btrfs/btrfs_inode.h11
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/dev-replace.c7
-rw-r--r--fs/btrfs/disk-io.c23
-rw-r--r--fs/btrfs/extent-tree.c8
-rw-r--r--fs/btrfs/extent_io.c44
-rw-r--r--fs/btrfs/inode.c33
-rw-r--r--fs/btrfs/ioctl.c12
-rw-r--r--fs/btrfs/props.c59
-rw-r--r--fs/btrfs/props.h4
-rw-r--r--fs/btrfs/scrub.c26
-rw-r--r--fs/btrfs/sysfs.c3
-rw-r--r--fs/btrfs/tree-log.c54
-rw-r--r--fs/btrfs/volumes.c15
-rw-r--r--fs/btrfs/volumes.h7
-rw-r--r--fs/btrfs/xattr.c11
-rw-r--r--fs/btrfs/zoned.c37
-rw-r--r--fs/btrfs/zoned.h4
-rw-r--r--fs/ceph/addr.c11
-rw-r--r--fs/ceph/caps.c7
-rw-r--r--fs/ceph/file.c16
-rw-r--r--fs/ceph/mds_client.c6
-rw-r--r--fs/cifs/connect.c11
-rw-r--r--fs/cifs/dfs_cache.c19
-rw-r--r--fs/cifs/smb2ops.c8
-rw-r--r--fs/cifs/transport.c3
-rw-r--r--fs/direct-io.c32
-rw-r--r--fs/erofs/zdata.c12
-rw-r--r--fs/erofs/zdata.h2
-rw-r--r--fs/exfat/file.c5
-rw-r--r--fs/exfat/super.c10
-rw-r--r--fs/ext4/ext4.h7
-rw-r--r--fs/ext4/extents.c32
-rw-r--r--fs/ext4/inode.c18
-rw-r--r--fs/ext4/ioctl.c26
-rw-r--r--fs/ext4/mballoc.c10
-rw-r--r--fs/ext4/namei.c4
-rw-r--r--fs/ext4/page-io.c4
-rw-r--r--fs/ext4/super.c50
-rw-r--r--fs/f2fs/checkpoint.c6
-rw-r--r--fs/f2fs/data.c33
-rw-r--r--fs/f2fs/f2fs.h12
-rw-r--r--fs/f2fs/file.c19
-rw-r--r--fs/f2fs/inode.c3
-rw-r--r--fs/f2fs/segment.c103
-rw-r--r--fs/f2fs/super.c32
-rw-r--r--fs/fat/file.c5
-rw-r--r--fs/fat/inode.c10
-rw-r--r--fs/fs-writeback.c17
-rw-r--r--fs/gfs2/bmap.c11
-rw-r--r--fs/gfs2/file.c143
-rw-r--r--fs/gfs2/rgrp.c7
-rw-r--r--fs/hugetlbfs/inode.c9
-rw-r--r--fs/internal.h29
-rw-r--r--fs/io-wq.c4
-rw-r--r--fs/io-wq.h1
-rw-r--r--fs/io_uring.c3584
-rw-r--r--fs/iomap/direct-io.c10
-rw-r--r--fs/jbd2/commit.c4
-rw-r--r--fs/jbd2/journal.c9
-rw-r--r--fs/jfs/ioctl.c5
-rw-r--r--fs/jfs/super.c8
-rw-r--r--fs/kernfs/dir.c7
-rw-r--r--fs/ksmbd/misc.c40
-rw-r--r--fs/ksmbd/misc.h3
-rw-r--r--fs/ksmbd/oplock.c30
-rw-r--r--fs/ksmbd/oplock.h2
-rw-r--r--fs/ksmbd/smb2pdu.c34
-rw-r--r--fs/ksmbd/vfs.c6
-rw-r--r--fs/ksmbd/vfs_cache.c2
-rw-r--r--fs/ksmbd/vfs_cache.h1
-rw-r--r--fs/namespace.c14
-rw-r--r--fs/nfs/fs_context.c2
-rw-r--r--fs/nfs/nfs4proc.c12
-rw-r--r--fs/nilfs2/ioctl.c6
-rw-r--r--fs/nilfs2/sufile.c4
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/notify/fanotify/fanotify_user.c13
-rw-r--r--fs/ntfs3/file.c6
-rw-r--r--fs/ntfs3/super.c10
-rw-r--r--fs/ocfs2/ioctl.c5
-rw-r--r--fs/pipe.c9
-rw-r--r--fs/posix_acl.c10
-rw-r--r--fs/proc/cpuinfo.c6
-rw-r--r--fs/proc/fd.c23
-rw-r--r--fs/squashfs/block.c20
-rw-r--r--fs/super.c2
-rw-r--r--fs/udf/namei.c8
-rw-r--r--fs/xattr.c143
-rw-r--r--fs/xfs/xfs_buf.c6
-rw-r--r--fs/xfs/xfs_buf.h42
-rw-r--r--fs/xfs/xfs_discard.c8
-rw-r--r--fs/xfs/xfs_inode.c24
-rw-r--r--fs/xfs/xfs_log_cil.c2
-rw-r--r--fs/xfs/xfs_super.c12
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/zonefs/Makefile2
-rw-r--r--fs/zonefs/super.c227
-rw-r--r--fs/zonefs/sysfs.c139
-rw-r--r--fs/zonefs/zonefs.h18
101 files changed, 3740 insertions, 1894 deletions
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 2fe402483ad5..30b066299d39 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -740,10 +740,22 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path,
{
struct inode *inode = d_inode(path->dentry);
struct afs_vnode *vnode = AFS_FS_I(inode);
- int seq = 0;
+ struct key *key;
+ int ret, seq = 0;
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
+ if (!(query_flags & AT_STATX_DONT_SYNC) &&
+ !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ key = afs_request_key(vnode->volume->cell);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+ ret = afs_validate(vnode, key);
+ key_put(key);
+ if (ret < 0)
+ return ret;
+ }
+
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
generic_fillattr(&init_user_ns, inode, stat);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 47e72d72f7d0..32131a5d321b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -384,6 +384,17 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
return ret;
}
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
+{
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
+ inode->flags & BTRFS_INODE_NODATASUM)
+ return false;
+ return true;
+}
+
struct btrfs_dio_private {
struct inode *inode;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7631b88426e..077c95e9baa5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1060,6 +1060,7 @@ struct btrfs_fs_info {
*/
spinlock_t relocation_bg_lock;
u64 data_reloc_bg;
+ struct mutex zoned_data_reloc_io_lock;
u64 nr_global_roots;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 71fd99b48283..f26202621989 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -734,7 +734,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- /* Commit dev_replace state and reserve 1 item for it. */
+ /*
+ * Commit dev_replace state and reserve 1 item for it.
+ * This is crucial to ensure we won't miss copying extents for new block
+ * groups that are allocated after we started the device replace, and
+ * must be done after setting up the device replace state.
+ */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 126f244cdf88..84795d831282 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3157,6 +3157,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
+ mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -3657,6 +3658,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
+ /*
+ * V1 space cache has some hardcoded PAGE_SIZE usage, and is
+ * going to be deprecated.
+ *
+ * Force to use v2 cache for subpage case.
+ */
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
+ "forcing free space tree for sector size %u with page size %lu",
+ sectorsize, PAGE_SIZE);
+
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
@@ -4226,6 +4238,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
*/
static void btrfs_end_empty_barrier(struct bio *bio)
{
+ bio_uninit(bio);
complete(bio->bi_private);
}
@@ -4235,7 +4248,7 @@ static void btrfs_end_empty_barrier(struct bio *bio)
*/
static void write_dev_flush(struct btrfs_device *device)
{
- struct bio *bio = device->flush_bio;
+ struct bio *bio = &device->flush_bio;
#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
/*
@@ -4248,12 +4261,12 @@ static void write_dev_flush(struct btrfs_device *device)
* of simplicity, since this is a debug tool and not meant for use in
* non-debug builds.
*/
- struct request_queue *q = bdev_get_queue(device->bdev);
- if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+ if (!bdev_write_cache(device->bdev))
return;
#endif
- bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
+ bio_init(bio, device->bdev, NULL, 0,
+ REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
@@ -4267,7 +4280,7 @@ static void write_dev_flush(struct btrfs_device *device)
*/
static blk_status_t wait_dev_flush(struct btrfs_device *device)
{
- struct bio *bio = device->flush_bio;
+ struct bio *bio = &device->flush_bio;
if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
return BLK_STS_OK;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6aa92f84f465..6260784e74b5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1239,7 +1239,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
if (size) {
ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (!ret)
*discarded_bytes += size;
else if (ret != -EOPNOTSUPP)
@@ -1256,7 +1256,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
if (bytes_left) {
ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (!ret)
*discarded_bytes += bytes_left;
}
@@ -1291,7 +1291,7 @@ static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
&discarded);
discarded += src_disc;
- } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
+ } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
} else {
ret = 0;
@@ -5987,7 +5987,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
*trimmed = 0;
/* Discard not supported = nothing to do. */
- if (!blk_queue_discard(bdev_get_queue(device->bdev)))
+ if (!bdev_max_discard_sectors(device->bdev))
return 0;
/* Not writable = nothing to do. */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 724e8fe06aa0..33c19f51d79b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2658,6 +2658,7 @@ int btrfs_repair_one_sector(struct inode *inode,
repair_bio = btrfs_bio_alloc(1);
repair_bbio = btrfs_bio(repair_bio);
+ repair_bbio->file_offset = start;
repair_bio->bi_opf = REQ_OP_READ;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
@@ -3333,24 +3334,37 @@ static int alloc_new_bio(struct btrfs_inode *inode,
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
goto error;
- if (wbc) {
- struct block_device *bdev;
- bdev = fs_info->fs_devices->latest_dev->bdev;
- bio_set_dev(bio, bdev);
- wbc_init_bio(wbc, bio);
- }
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct btrfs_device *device;
+ if (wbc) {
+ /*
+ * For Zone append we need the correct block_device that we are
+ * going to write to set in the bio to be able to respect the
+ * hardware limitation. Look it up here:
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct btrfs_device *dev;
+
+ dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
+ fs_info->sectorsize);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto error;
+ }
- device = btrfs_zoned_get_device(fs_info, disk_bytenr,
- fs_info->sectorsize);
- if (IS_ERR(device)) {
- ret = PTR_ERR(device);
- goto error;
+ bio_set_dev(bio, dev->bdev);
+ } else {
+ /*
+ * Otherwise pick the last added device to support
+ * cgroup writeback. For multi-device file systems this
+ * means blk-cgroup policies have to always be set on the
+ * last added/replaced device. This is a bit odd but has
+ * been like that for a long time.
+ */
+ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
}
-
- btrfs_bio(bio)->device = device;
+ wbc_init_bio(wbc, bio);
+ } else {
+ ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
}
return 0;
error:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5082b9c70f8c..95c499b8424e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -481,17 +481,6 @@ static noinline int add_async_extent(struct async_chunk *cow,
}
/*
- * Check if the inode has flags compatible with compression
- */
-static inline bool inode_can_compress(struct btrfs_inode *inode)
-{
- if (inode->flags & BTRFS_INODE_NODATACOW ||
- inode->flags & BTRFS_INODE_NODATASUM)
- return false;
- return true;
-}
-
-/*
* Check if the inode needs to be submitted to compression, based on mount
* options, defragmentation, properties or heuristics.
*/
@@ -500,7 +489,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (!inode_can_compress(inode)) {
+ if (!btrfs_inode_can_compress(inode)) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
btrfs_ino(inode));
@@ -2019,7 +2008,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, nr_written);
- } else if (!inode_can_compress(inode) ||
+ } else if (!btrfs_inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
if (zoned)
ret = run_delalloc_zoned(inode, locked_page, start, end,
@@ -7810,8 +7799,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
- const u64 orig_file_offset = dip->file_offset;
- u64 start = orig_file_offset;
u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
@@ -7821,6 +7808,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
pgoff = bvec.bv_offset;
for (i = 0; i < nr_sectors; i++) {
+ u64 start = bbio->file_offset + bio_offset;
+
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
(!csum || !check_data_csum(inode, bbio,
@@ -7833,17 +7822,13 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
} else {
int ret;
- ASSERT((start - orig_file_offset) < UINT_MAX);
- ret = btrfs_repair_one_sector(inode,
- &bbio->bio,
- start - orig_file_offset,
- bvec.bv_page, pgoff,
+ ret = btrfs_repair_one_sector(inode, &bbio->bio,
+ bio_offset, bvec.bv_page, pgoff,
start, bbio->mirror_num,
submit_dio_repair_bio);
if (ret)
err = errno_to_blk_status(ret);
}
- start += sectorsize;
ASSERT(bio_offset + sectorsize > bio_offset);
bio_offset += sectorsize;
pgoff += sectorsize;
@@ -7870,6 +7855,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
static void btrfs_end_dio_bio(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t err = bio->bi_status;
if (err)
@@ -7880,12 +7866,12 @@ static void btrfs_end_dio_bio(struct bio *bio)
bio->bi_iter.bi_size, err);
if (bio_op(bio) == REQ_OP_READ)
- err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
+ err = btrfs_check_read_dio_bio(dip, bbio, !err);
if (err)
dip->dio_bio->bi_status = err;
- btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
+ btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
@@ -8046,6 +8032,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_bio(bio)->file_offset = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9bf0616a3069..b2c692b2fd8d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -468,7 +468,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_device *device;
- struct request_queue *q;
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
@@ -498,14 +497,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
dev_list) {
- if (!device->bdev)
+ if (!device->bdev || !bdev_max_discard_sectors(device->bdev))
continue;
- q = bdev_get_queue(device->bdev);
- if (blk_queue_discard(q)) {
- num_devices++;
- minlen = min_t(u64, q->limits.discard_granularity,
- minlen);
- }
+ num_devices++;
+ minlen = min_t(u64, bdev_discard_granularity(device->bdev),
+ minlen);
}
rcu_read_unlock();
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 1a6d2d5b4b33..1b31481f9e72 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -17,9 +17,11 @@ static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
struct prop_handler {
struct hlist_node node;
const char *xattr_name;
- int (*validate)(const char *value, size_t len);
+ int (*validate)(const struct btrfs_inode *inode, const char *value,
+ size_t len);
int (*apply)(struct inode *inode, const char *value, size_t len);
const char *(*extract)(struct inode *inode);
+ bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -55,7 +57,8 @@ find_prop_handler(const char *name,
return NULL;
}
-int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len)
{
const struct prop_handler *handler;
@@ -69,7 +72,29 @@ int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
if (value_len == 0)
return 0;
- return handler->validate(value, value_len);
+ return handler->validate(inode, value, value_len);
+}
+
+/*
+ * Check if a property should be ignored (not set) for an inode.
+ *
+ * @inode: The target inode.
+ * @name: The property's name.
+ *
+ * The caller must be sure the given property name is valid, for example by
+ * having previously called btrfs_validate_prop().
+ *
+ * Returns: true if the property should be ignored for the given inode
+ * false if the property must not be ignored for the given inode
+ */
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name)
+{
+ const struct prop_handler *handler;
+
+ handler = find_prop_handler(name, NULL);
+ ASSERT(handler != NULL);
+
+ return handler->ignore(inode);
}
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
@@ -252,8 +277,12 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
return ret;
}
-static int prop_compression_validate(const char *value, size_t len)
+static int prop_compression_validate(const struct btrfs_inode *inode,
+ const char *value, size_t len)
{
+ if (!btrfs_inode_can_compress(inode))
+ return -EINVAL;
+
if (!value)
return 0;
@@ -310,6 +339,22 @@ static int prop_compression_apply(struct inode *inode, const char *value,
return 0;
}
+static bool prop_compression_ignore(const struct btrfs_inode *inode)
+{
+ /*
+ * Compression only has effect for regular files, and for directories
+ * we set it just to propagate it to new files created inside them.
+ * Everything else (symlinks, devices, sockets, fifos) is pointless as
+ * it will do nothing, so don't waste metadata space on a compression
+ * xattr for anything that is neither a file nor a directory.
+ */
+ if (!S_ISREG(inode->vfs_inode.i_mode) &&
+ !S_ISDIR(inode->vfs_inode.i_mode))
+ return true;
+
+ return false;
+}
+
static const char *prop_compression_extract(struct inode *inode)
{
switch (BTRFS_I(inode)->prop_compress) {
@@ -330,6 +375,7 @@ static struct prop_handler prop_handlers[] = {
.validate = prop_compression_validate,
.apply = prop_compression_apply,
.extract = prop_compression_extract,
+ .ignore = prop_compression_ignore,
.inheritable = 1
},
};
@@ -356,6 +402,9 @@ static int inherit_props(struct btrfs_trans_handle *trans,
if (!h->inheritable)
continue;
+ if (h->ignore(BTRFS_I(inode)))
+ continue;
+
value = h->extract(parent);
if (!value)
continue;
@@ -364,7 +413,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
* This is not strictly necessary as the property should be
* valid, but in case it isn't, don't propagate it further.
*/
- ret = h->validate(value, strlen(value));
+ ret = h->validate(BTRFS_I(inode), value, strlen(value));
if (ret)
continue;
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 40b2c65b518c..59bea741cfcf 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -13,7 +13,9 @@ void __init btrfs_props_init(void);
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const char *value, size_t value_len,
int flags);
-int btrfs_validate_prop(const char *name, const char *value, size_t value_len);
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len);
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name);
int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 11089568b287..8cd713d37ad2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3699,6 +3699,31 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
+ ASSERT(cache->start <= chunk_offset);
+ /*
+ * We are using the commit root to search for device extents, so
+ * that means we could have found a device extent item from a
+ * block group that was deleted in the current transaction. The
+ * logical start offset of the deleted block group, stored at
+ * @chunk_offset, might be part of the logical address range of
+ * a new block group (which uses different physical extents).
+ * In this case btrfs_lookup_block_group() has returned the new
+ * block group, and its start address is less than @chunk_offset.
+ *
+ * We skip such new block groups, because it's pointless to
+ * process them, as we won't find their extents because we search
+ * for them using the commit root of the extent tree. For a device
+ * replace it's also fine to skip it, we won't miss copying them
+ * to the target device because we have the write duplication
+ * setup through the regular write path (by btrfs_map_block()),
+ * and we have committed a transaction when we started the device
+ * replace, right after setting up the device replace state.
+ */
+ if (cache->start < chunk_offset) {
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
spin_lock(&cache->lock);
if (!cache->to_copy) {
@@ -3822,7 +3847,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
- ASSERT(cache->start == chunk_offset);
ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
dev_extent_len);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 17389a42a3ab..ba78ca5aabbb 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -922,6 +922,9 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
case BTRFS_EXCLOP_BALANCE:
str = "balance\n";
break;
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ str = "balance paused\n";
+ break;
case BTRFS_EXCLOP_DEV_ADD:
str = "device add\n";
break;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 571dae8ad65e..e65633686378 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3188,6 +3188,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
mutex_unlock(&fs_info->tree_root->log_mutex);
+ blk_finish_plug(&plug);
goto out;
}
}
@@ -3720,11 +3721,29 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
key.offset = first_offset;
key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
- if (ret)
+ /*
+ * -EEXIST is fine and can happen sporadically when we are logging a
+ * directory and have concurrent insertions in the subvolume's tree for
+ * items from other inodes and that result in pushing off some dir items
+ * from one leaf to another in order to accommodate for the new items.
+ * This results in logging the same dir index range key.
+ */
+ if (ret && ret != -EEXIST)
return ret;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
+ if (ret == -EEXIST) {
+ const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ /*
+ * btrfs_del_dir_entries_in_log() might have been called during
+ * an unlink between the initial insertion of this key and the
+ * current update, or we might be logging a single entry deletion
+ * during a rename, so set the new last_offset to the max value.
+ */
+ last_offset = max(last_offset, curr_end);
+ }
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(path);
@@ -3848,13 +3867,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
ret = insert_dir_log_key(trans, log, dst_path,
ino, *last_old_dentry_offset + 1,
key.offset - 1);
- /*
- * -EEXIST should never happen because when we
- * log a directory in full mode (LOG_INODE_ALL)
- * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from
- * the log tree.
- */
- ASSERT(ret != -EEXIST);
if (ret < 0)
return ret;
}
@@ -5804,6 +5816,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
/*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because when we create
+ * the inline extent when the symlink is created (we never have delalloc
+ * for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
* Before logging the inode item, cache the value returned by
* inode_logged(), because after that we have the need to figure out if
* the inode was previously logged in this transaction.
@@ -6181,7 +6205,7 @@ again:
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
log_mode, ctx);
@@ -7018,12 +7042,12 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
/*
* Other concurrent task might be logging the old directory,
* as it can be triggered when logging other inode that had or
- * still has a dentry in the old directory. So take the old
- * directory's log_mutex to prevent getting an -EEXIST when
- * logging a key to record the deletion, or having that other
- * task logging the old directory get an -EEXIST if it attempts
- * to log the same key after we just did it. In both cases that
- * would result in falling back to a transaction commit.
+ * still has a dentry in the old directory. We lock the old
+ * directory's log_mutex to ensure the deletion of the old
+ * name is persisted, because during directory logging we
+ * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
+ * the old name's dir index item is in the delayed items, so
+ * it could be missed by an in progress directory logging.
*/
mutex_lock(&old_dir->log_mutex);
ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a8cc736731fd..b6b00338037c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -405,7 +405,6 @@ void btrfs_free_device(struct btrfs_device *device)
WARN_ON(!list_empty(&device->post_commit_list));
rcu_string_free(device->name);
extent_io_tree_release(&device->alloc_state);
- bio_put(device->flush_bio);
btrfs_destroy_dev_zone_info(device);
kfree(device);
}
@@ -643,7 +642,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ if (!bdev_nonrot(bdev))
fs_devices->rotating = true;
device->bdev = bdev;
@@ -2706,7 +2705,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ if (!bdev_nonrot(bdev))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -6949,16 +6948,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (!dev)
return ERR_PTR(-ENOMEM);
- /*
- * Preallocate a bio that's always going to be used for flushing device
- * barriers and matches the device lifespan
- */
- dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
- if (!dev->flush_bio) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
-
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bd297f23d19e..b11c563d2025 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -121,8 +121,8 @@ struct btrfs_device {
/* bytes used on the current transaction */
u64 commit_bytes_used;
- /* for sending down flush barriers */
- struct bio *flush_bio;
+ /* Bio used for flushing device barriers */
+ struct bio flush_bio;
struct completion flush_wait;
/* per-device scrub information */
@@ -328,6 +328,9 @@ struct btrfs_fs_devices {
struct btrfs_bio {
unsigned int mirror_num;
+ /* for direct I/O */
+ u64 file_offset;
+
/* @device is for stripe IO submission. */
struct btrfs_device *device;
u8 *csum;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 99abf41b89b9..85691dc2232f 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -262,7 +262,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
out:
if (start_trans)
btrfs_end_transaction(trans);
@@ -403,10 +404,13 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
struct btrfs_root *root = BTRFS_I(inode)->root;
name = xattr_full_name(handler, name);
- ret = btrfs_validate_prop(name, value, size);
+ ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size);
if (ret)
return ret;
+ if (btrfs_ignore_prop(BTRFS_I(inode), name))
+ return 0;
+
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -416,7 +420,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
btrfs_end_transaction(trans);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 1b1b310c3c51..29b54fd9c128 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -350,7 +350,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
- struct request_queue *queue = bdev_get_queue(bdev);
unsigned int max_active_zones;
unsigned int nactive;
sector_t nr_sectors;
@@ -410,7 +409,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- max_active_zones = queue_max_active_zones(queue);
+ max_active_zones = bdev_max_active_zones(bdev);
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
btrfs_err_in_rcu(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
@@ -1835,6 +1834,12 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
+ /* No space left */
+ if (block_group->alloc_offset == block_group->zone_capacity) {
+ ret = false;
+ goto out_unlock;
+ }
+
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
physical = map->stripes[i].physical;
@@ -1842,35 +1847,23 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
if (device->zone_info->max_active_zones == 0)
continue;
- /* No space left */
- if (block_group->alloc_offset == block_group->zone_capacity) {
- ret = false;
- goto out_unlock;
- }
-
if (!btrfs_dev_set_active_zone(device, physical)) {
/* Cannot activate the zone */
ret = false;
goto out_unlock;
}
-
- /* Successfully activated all the zones */
- if (i == map->num_stripes - 1)
- block_group->zone_is_active = 1;
-
-
}
+
+ /* Successfully activated all the zones */
+ block_group->zone_is_active = 1;
spin_unlock(&block_group->lock);
- if (block_group->zone_is_active) {
- /* For the active block group list */
- btrfs_get_block_group(block_group);
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
- spin_lock(&fs_info->zone_active_bgs_lock);
- list_add_tail(&block_group->active_bg_list,
- &fs_info->zone_active_bgs);
- spin_unlock(&fs_info->zone_active_bgs_lock);
- }
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
return true;
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index cbf016a7bb5d..6dee76248cb4 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -359,7 +359,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
- btrfs_inode_lock(&inode->vfs_inode, 0);
+ mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
}
static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
@@ -367,7 +367,7 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
- btrfs_inode_unlock(&inode->vfs_inode, 0);
+ mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
}
#endif
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index aa25bffd4823..b6edcf89a429 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -85,7 +85,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
if (folio_test_dirty(folio)) {
dout("%p dirty_folio %p idx %lu -- already dirty\n",
mapping->host, folio, folio->index);
- BUG_ON(!folio_get_private(folio));
+ VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
return false;
}
@@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
* Reference snap context in folio->private. Also set
* PagePrivate so that we get invalidate_folio callback.
*/
- BUG_ON(folio_get_private(folio));
+ VM_BUG_ON_FOLIO(folio_test_private(folio), folio);
folio_attach_private(folio, snapc);
return ceph_fscache_dirty_folio(mapping, folio);
@@ -150,7 +150,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
}
WARN_ON(!folio_test_locked(folio));
- if (folio_get_private(folio)) {
+ if (folio_test_private(folio)) {
dout("%p invalidate_folio idx %lu full dirty page\n",
inode, folio->index);
@@ -729,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req)
/* clean all pages */
for (i = 0; i < req->r_num_ops; i++) {
- if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
+ if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
+ pr_warn("%s incorrect op %d req %p index %d tid %llu\n",
+ __func__, req->r_ops[i].op, req, i, req->r_tid);
break;
+ }
osd_data = osd_req_op_extent_osd_data(req, i);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f1ad6884d4da..5c14ef04e474 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2274,6 +2274,8 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_dirops,
r_unsafe_dir_item) {
s = req->r_session;
+ if (!s)
+ continue;
if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
for (i = 0; i < max_sessions; i++) {
@@ -2294,6 +2296,8 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_iops,
r_unsafe_target_item) {
s = req->r_session;
+ if (!s)
+ continue;
if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
for (i = 0; i < max_sessions; i++) {
@@ -3870,6 +3874,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
inode, ci, mds, mseq, target);
retry:
+ down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
@@ -3933,6 +3938,7 @@ retry:
}
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
/* open target session */
@@ -3958,6 +3964,7 @@ retry:
out_unlock:
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
if (tsession) {
mutex_unlock(&tsession->s_mutex);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6c9e837aa1d3..8c8226c0feac 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -629,9 +629,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
iinfo.change_attr = 1;
ceph_encode_timespec64(&iinfo.btime, &now);
- iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
- iinfo.xattr_data = xattr_buf;
- memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ if (req->r_pagelist) {
+ iinfo.xattr_len = req->r_pagelist->length;
+ iinfo.xattr_data = req->r_pagelist->mapped_tail;
+ } else {
+ /* fake it */
+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+ iinfo.xattr_data = xattr_buf;
+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ }
in.ino = cpu_to_le64(vino.ino);
in.snapid = cpu_to_le64(CEPH_NOSNAP);
@@ -743,6 +749,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_security_init_secctx(dentry, mode, &as_ctx);
if (err < 0)
goto out_ctx;
+ /* Async create can't handle more than a page of xattrs */
+ if (as_ctx.pagelist &&
+ !list_is_singular(&as_ctx.pagelist->head))
+ try_async = false;
} else if (!d_in_lookup(dentry)) {
/* If it's not being looked up, it's negative */
return -ENOENT;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fa38c013126d..00c3de177dd6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4434,8 +4434,6 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
bool check_session_state(struct ceph_mds_session *s)
{
- struct ceph_fs_client *fsc = s->s_mdsc->fsc;
-
switch (s->s_state) {
case CEPH_MDS_SESSION_OPEN:
if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
@@ -4444,10 +4442,6 @@ bool check_session_state(struct ceph_mds_session *s)
}
break;
case CEPH_MDS_SESSION_CLOSING:
- /* Should never reach this when not force unmounting */
- WARN_ON_ONCE(s->s_ttl &&
- READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
- fallthrough;
case CEPH_MDS_SESSION_NEW:
case CEPH_MDS_SESSION_RESTARTING:
case CEPH_MDS_SESSION_CLOSED:
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 902e8c6c0f9c..42e14f408856 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -534,12 +534,19 @@ int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
{
/* If tcp session is not an dfs connection, then reconnect to last target server */
spin_lock(&cifs_tcp_ses_lock);
- if (!server->is_dfs_conn || !server->origin_fullpath || !server->leaf_fullpath) {
+ if (!server->is_dfs_conn) {
spin_unlock(&cifs_tcp_ses_lock);
return __cifs_reconnect(server, mark_smb_session);
}
spin_unlock(&cifs_tcp_ses_lock);
+ mutex_lock(&server->refpath_lock);
+ if (!server->origin_fullpath || !server->leaf_fullpath) {
+ mutex_unlock(&server->refpath_lock);
+ return __cifs_reconnect(server, mark_smb_session);
+ }
+ mutex_unlock(&server->refpath_lock);
+
return reconnect_dfs_server(server);
}
#else
@@ -3675,9 +3682,11 @@ static void setup_server_referral_paths(struct mount_ctx *mnt_ctx)
{
struct TCP_Server_Info *server = mnt_ctx->server;
+ mutex_lock(&server->refpath_lock);
server->origin_fullpath = mnt_ctx->origin_fullpath;
server->leaf_fullpath = mnt_ctx->leaf_fullpath;
server->current_fullpath = mnt_ctx->leaf_fullpath;
+ mutex_unlock(&server->refpath_lock);
mnt_ctx->origin_fullpath = mnt_ctx->leaf_fullpath = NULL;
}
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 30e040da4f09..956f8e5cf3e7 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1422,12 +1422,14 @@ static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool
struct TCP_Server_Info *server = tcon->ses->server;
mutex_lock(&server->refpath_lock);
- if (strcasecmp(server->leaf_fullpath, server->origin_fullpath))
- __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh);
+ if (server->origin_fullpath) {
+ if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath,
+ server->origin_fullpath))
+ __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh);
+ __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh);
+ }
mutex_unlock(&server->refpath_lock);
- __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh);
-
return 0;
}
@@ -1530,11 +1532,14 @@ static void refresh_mounts(struct cifs_ses **sessions)
list_del_init(&tcon->ulist);
mutex_lock(&server->refpath_lock);
- if (strcasecmp(server->leaf_fullpath, server->origin_fullpath))
- __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false);
+ if (server->origin_fullpath) {
+ if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath,
+ server->origin_fullpath))
+ __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false);
+ __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false);
+ }
mutex_unlock(&server->refpath_lock);
- __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false);
cifs_put_tcon(tcon);
}
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index a67df8eaf702..d6aaeff4a30a 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1858,9 +1858,17 @@ smb2_copychunk_range(const unsigned int xid,
int chunks_copied = 0;
bool chunk_sizes_updated = false;
ssize_t bytes_written, total_bytes_written = 0;
+ struct inode *inode;
pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
+ /*
+ * We need to flush all unwritten data before we can send the
+ * copychunk ioctl to the server.
+ */
+ inode = d_inode(trgtfile->dentry);
+ filemap_write_and_wait(inode->i_mapping);
+
if (pcchunk == NULL)
return -ENOMEM;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index d9d1c353bafc..c667e6ddfe2f 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -464,13 +464,12 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
return -EIO;
}
- tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS);
+ tr_hdr = kzalloc(sizeof(*tr_hdr), GFP_NOFS);
if (!tr_hdr)
return -ENOMEM;
memset(&cur_rqst[0], 0, sizeof(cur_rqst));
memset(&iov, 0, sizeof(iov));
- memset(tr_hdr, 0, sizeof(*tr_hdr));
iov.iov_base = tr_hdr;
iov.iov_len = sizeof(*tr_hdr);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index aef06e607b40..840752006f60 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1115,11 +1115,10 @@ static inline int drop_refcount(struct dio *dio)
* individual fields and will generate much worse code. This is important
* for the whole file.
*/
-static inline ssize_t
-do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, struct iov_iter *iter,
- get_block_t get_block, dio_iodone_t end_io,
- dio_submit_t submit_io, int flags)
+ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
+ struct block_device *bdev, struct iov_iter *iter,
+ get_block_t get_block, dio_iodone_t end_io,
+ dio_submit_t submit_io, int flags)
{
unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
unsigned blkbits = i_blkbits;
@@ -1334,29 +1333,6 @@ fail_dio:
kmem_cache_free(dio_cache, dio);
return retval;
}
-
-ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, struct iov_iter *iter,
- get_block_t get_block,
- dio_iodone_t end_io, dio_submit_t submit_io,
- int flags)
-{
- /*
- * The block device state is needed in the end to finally
- * submit everything. Since it's likely to be cache cold
- * prefetch it here as first thing to hide some of the
- * latency.
- *
- * Attempt to prefetch the pieces we likely need later.
- */
- prefetch(&bdev->bd_disk->part_tbl);
- prefetch(bdev->bd_disk->queue);
- prefetch((char *)bdev->bd_disk->queue + SMP_CACHE_BYTES);
-
- return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
- end_io, submit_io, flags);
-}
-
EXPORT_SYMBOL(__blockdev_direct_IO);
static __init int dio_init(void)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 0ed880f42525..e6dea6dfca16 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1066,12 +1066,9 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
/* wake up the caller thread for sync decompression */
if (sync) {
- unsigned long flags;
-
- spin_lock_irqsave(&io->u.wait.lock, flags);
if (!atomic_add_return(bios, &io->pending_bios))
- wake_up_locked(&io->u.wait);
- spin_unlock_irqrestore(&io->u.wait.lock, flags);
+ complete(&io->u.done);
+
return;
}
@@ -1217,7 +1214,7 @@ jobqueue_init(struct super_block *sb,
} else {
fg_out:
q = fgq;
- init_waitqueue_head(&fgq->u.wait);
+ init_completion(&fgq->u.done);
atomic_set(&fgq->pending_bios, 0);
}
q->sb = sb;
@@ -1419,8 +1416,7 @@ static void z_erofs_runqueue(struct super_block *sb,
return;
/* wait until all bios are completed */
- io_wait_event(io[JQ_SUBMIT].u.wait,
- !atomic_read(&io[JQ_SUBMIT].pending_bios));
+ wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index e043216b545f..800b11c53f57 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -97,7 +97,7 @@ struct z_erofs_decompressqueue {
z_erofs_next_pcluster_t head;
union {
- wait_queue_head_t wait;
+ struct completion done;
struct work_struct work;
} u;
};
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 2f5130059236..20d4e47f57ab 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -351,21 +351,20 @@ out:
static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg)
{
- struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(inode->i_sb->s_bdev))
return -EOPNOTSUPP;
if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range)))
return -EFAULT;
range.minlen = max_t(unsigned int, range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(inode->i_sb->s_bdev));
ret = exfat_trim_fs(inode, &range);
if (ret < 0)
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 8ca21e7917d1..be0788ecaf20 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -627,13 +627,9 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
if (opts->allow_utime == (unsigned short)-1)
opts->allow_utime = ~opts->fs_dmask & 0022;
- if (opts->discard) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
-
- if (!blk_queue_discard(q)) {
- exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard");
- opts->discard = 0;
- }
+ if (opts->discard && !bdev_max_discard_sectors(sb->s_bdev)) {
+ exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard");
+ opts->discard = 0;
}
sb->s_flags |= SB_NODIRATIME;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3f87cca49f0c..a743b1e3b89e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2273,6 +2273,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
+/*
+ * Base length of the ext4 directory entry excluding the name length
+ */
+#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)
struct ext4_dir_entry {
__le32 inode; /* Inode number */
@@ -3032,7 +3036,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
-extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
@@ -3064,6 +3068,7 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns,
struct dentry *dentry, struct fileattr *fa);
int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
+int ext4_update_overhead(struct super_block *sb);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0d98cf402282..e473fde6b64b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4500,9 +4500,9 @@ retry:
return ret > 0 ? ret2 : ret;
}
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
@@ -4574,6 +4574,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
@@ -4690,7 +4694,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE) {
- ret = ext4_punch_hole(inode, offset, len);
+ ret = ext4_punch_hole(file, offset, len);
goto exit;
}
@@ -4699,12 +4703,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto exit;
if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- ret = ext4_collapse_range(inode, offset, len);
+ ret = ext4_collapse_range(file, offset, len);
goto exit;
}
if (mode & FALLOC_FL_INSERT_RANGE) {
- ret = ext4_insert_range(inode, offset, len);
+ ret = ext4_insert_range(file, offset, len);
goto exit;
}
@@ -4740,6 +4744,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
if (ret)
goto out;
@@ -5241,8 +5249,9 @@ out:
* This implements the fallocate's collapse range functionality for ext4
* Returns: 0 and non-zero on error.
*/
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
ext4_lblk_t punch_start, punch_stop;
@@ -5294,6 +5303,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
@@ -5387,8 +5400,9 @@ out_mutex:
* by len bytes.
* Returns 0 on success, error otherwise.
*/
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
handle_t *handle;
@@ -5445,6 +5459,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 13740f2d0e61..646ece9b3455 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3953,12 +3953,14 @@ int ext4_break_layouts(struct inode *inode)
* Returns: 0 on success or negative on failure
*/
-int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset;
+ loff_t first_block_offset, last_block_offset, max_length;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
unsigned int credits;
int ret = 0, ret2 = 0;
@@ -4001,6 +4003,14 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
offset;
}
+ /*
+ * For punch hole the length + offset needs to be within one block
+ * before last range. Adjust the length if it goes beyond that limit.
+ */
+ max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
+ if (offset + length > max_length)
+ length = max_length - offset;
+
if (offset & (sb->s_blocksize - 1) ||
(offset + length) & (sb->s_blocksize - 1)) {
/*
@@ -4016,6 +4026,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 992229ca2d83..4d1d2326eee9 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1044,7 +1044,6 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
__u32 flags = 0;
unsigned int flush_flags = 0;
struct super_block *sb = file_inode(filp)->i_sb;
- struct request_queue *q;
if (copy_from_user(&flags, (__u32 __user *)arg,
sizeof(__u32)))
@@ -1065,10 +1064,8 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID)
return -EINVAL;
- q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev);
- if (!q)
- return -ENXIO;
- if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q))
+ if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev))
return -EOPNOTSUPP;
if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
@@ -1393,14 +1390,13 @@ resizefs_out:
case FITRIM:
{
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sb->s_bdev))
return -EOPNOTSUPP;
/*
@@ -1652,3 +1648,19 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif
+
+static void set_overhead(struct ext4_super_block *es, const void *arg)
+{
+ es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg));
+}
+
+int ext4_update_overhead(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (sb_rdonly(sb) || sbi->s_overhead == 0 ||
+ sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))
+ return 0;
+
+ return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead);
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 252c168454c7..ea653d19f9ec 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3498,7 +3498,7 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&lg->lg_prealloc_lock);
}
- if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
+ if (bdev_nonrot(sb->s_bdev))
sbi->s_mb_max_linear_groups = 0;
else
sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
@@ -3629,7 +3629,7 @@ static inline int ext4_issue_discard(struct super_block *sb,
return __blkdev_issue_discard(sb->s_bdev,
(sector_t)discard_block << (sb->s_blocksize_bits - 9),
(sector_t)count << (sb->s_blocksize_bits - 9),
- GFP_NOFS, 0, biop);
+ GFP_NOFS, biop);
} else
return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
@@ -6455,7 +6455,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
struct ext4_group_info *grp;
ext4_group_t group, first_group, last_group;
ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
@@ -6475,9 +6475,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
range->len < sb->s_blocksize)
return -EINVAL;
/* No point to try to trim less than discard granularity */
- if (range->minlen < q->limits.discard_granularity) {
+ if (range->minlen < discard_granularity) {
minlen = EXT4_NUM_B2C(EXT4_SB(sb),
- q->limits.discard_granularity >> sb->s_blocksize_bits);
+ discard_granularity >> sb->s_blocksize_bits);
if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
goto out;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e37da8d5cd0c..767b4bfe39c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1466,10 +1466,10 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
- while ((char *) de < dlimit) {
+ while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
- if ((char *) de + de->name_len <= dlimit &&
+ if (de->name + de->name_len <= dlimit &&
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 495ce59fb4ad..14695e2b5042 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -134,8 +134,10 @@ static void ext4_finish_bio(struct bio *bio)
continue;
}
clear_buffer_async_write(bh);
- if (bio->bi_status)
+ if (bio->bi_status) {
+ set_buffer_write_io_error(bh);
buffer_io_error(bh);
+ }
} while ((bh = bh->b_this_page) != head);
spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
if (!under_io) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 81749eaddf4c..6900da973ce2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1199,20 +1199,25 @@ static void ext4_put_super(struct super_block *sb)
int aborted = 0;
int i, err;
- ext4_unregister_li_request(sb);
- ext4_quota_off_umount(sb);
-
- flush_work(&sbi->s_error_work);
- destroy_workqueue(sbi->rsv_conversion_wq);
- ext4_release_orphan_info(sb);
-
/*
* Unregister sysfs before destroying jbd2 journal.
* Since we could still access attr_journal_task attribute via sysfs
* path which could have sbi->s_journal->j_task as NULL
+ * Unregister sysfs before flush sbi->s_error_work.
+ * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If
+ * read metadata verify failed then will queue error work.
+ * flush_stashed_error_work will call start_this_handle may trigger
+ * BUG_ON.
*/
ext4_unregister_sysfs(sb);
+ ext4_unregister_li_request(sb);
+ ext4_quota_off_umount(sb);
+
+ flush_work(&sbi->s_error_work);
+ destroy_workqueue(sbi->rsv_conversion_wq);
+ ext4_release_orphan_info(sb);
+
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
@@ -4172,9 +4177,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_fsblk_t first_block, last_block, b;
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int s, j, count = 0;
+ int has_super = ext4_bg_has_super(sb, grp);
if (!ext4_has_feature_bigalloc(sb))
- return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+ return (has_super + ext4_bg_num_gdb(sb, grp) +
+ (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
sbi->s_itb_per_group + 2);
first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
@@ -5282,9 +5289,18 @@ no_journal:
* Get the # of file system overhead blocks from the
* superblock if present.
*/
- if (es->s_overhead_clusters)
- sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
- else {
+ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+ /* ignore the precalculated value if it is ridiculous */
+ if (sbi->s_overhead > ext4_blocks_count(es))
+ sbi->s_overhead = 0;
+ /*
+ * If the bigalloc feature is not enabled recalculating the
+ * overhead doesn't take long, so we might as well just redo
+ * it to make sure we are using the correct value.
+ */
+ if (!ext4_has_feature_bigalloc(sb))
+ sbi->s_overhead = 0;
+ if (sbi->s_overhead == 0) {
err = ext4_calculate_overhead(sb);
if (err)
goto failed_mount_wq;
@@ -5458,13 +5474,9 @@ no_journal:
goto failed_mount9;
}
- if (test_opt(sb, DISCARD)) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
- if (!blk_queue_discard(q))
- ext4_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but "
- "the device does not support discard");
- }
+ if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
+ ext4_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but the device does not support discard");
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -5602,6 +5614,8 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
"Quota mode: %s.", descr, ext4_quota_mode(sb));
+ /* Update the s_overhead_clusters if necessary */
+ ext4_update_overhead(sb);
return 0;
free_sbi:
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index f5366feea82d..909085a78f9c 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -98,9 +98,9 @@ repeat:
}
if (unlikely(!PageUptodate(page))) {
- if (page->index == sbi->metapage_eio_ofs &&
- sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) {
- set_ckpt_flags(sbi, CP_ERROR_FLAG);
+ if (page->index == sbi->metapage_eio_ofs) {
+ if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO)
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
} else {
sbi->metapage_eio_ofs = page->index;
sbi->metapage_eio_cnt = 0;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8e0c2e773c8d..9a1a526f2092 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -388,11 +388,23 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
return 0;
}
-static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag)
+static unsigned int f2fs_io_flags(struct f2fs_io_info *fio)
{
unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
- unsigned int fua_flag = io_flag & temp_mask;
- unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;
+ unsigned int fua_flag, meta_flag, io_flag;
+ unsigned int op_flags = 0;
+
+ if (fio->op != REQ_OP_WRITE)
+ return 0;
+ if (fio->type == DATA)
+ io_flag = fio->sbi->data_io_flag;
+ else if (fio->type == NODE)
+ io_flag = fio->sbi->node_io_flag;
+ else
+ return 0;
+
+ fua_flag = io_flag & temp_mask;
+ meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;
/*
* data/node io flag bits per temp:
@@ -401,9 +413,10 @@ static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag)
* Cold | Warm | Hot | Cold | Warm | Hot |
*/
if ((1 << fio->temp) & meta_flag)
- fio->op_flags |= REQ_META;
+ op_flags |= REQ_META;
if ((1 << fio->temp) & fua_flag)
- fio->op_flags |= REQ_FUA;
+ op_flags |= REQ_FUA;
+ return op_flags;
}
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
@@ -413,14 +426,10 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
sector_t sector;
struct bio *bio;
- if (fio->type == DATA)
- __attach_io_flag(fio, sbi->data_io_flag);
- else if (fio->type == NODE)
- __attach_io_flag(fio, sbi->node_io_flag);
-
bdev = f2fs_target_device(sbi, fio->new_blkaddr, &sector);
- bio = bio_alloc_bioset(bdev, npages, fio->op | fio->op_flags, GFP_NOIO,
- &f2fs_bioset);
+ bio = bio_alloc_bioset(bdev, npages,
+ fio->op | fio->op_flags | f2fs_io_flags(fio),
+ GFP_NOIO, &f2fs_bioset);
bio->bi_iter.bi_sector = sector;
if (is_read_io(fio->op)) {
bio->bi_end_io = f2fs_read_end_io;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index cd1e65bcf0b0..2b2b3c87e45e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -154,7 +154,6 @@ struct f2fs_mount_info {
int s_jquota_fmt; /* Format of quota to use */
#endif
/* For which write hints are passed down to block layer */
- int whint_mode;
int alloc_mode; /* segment allocation policy */
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
@@ -1334,12 +1333,6 @@ enum {
};
enum {
- WHINT_MODE_OFF, /* not pass down write hints */
- WHINT_MODE_USER, /* try to pass down hints given by users */
- WHINT_MODE_FS, /* pass down hints with F2FS policy */
-};
-
-enum {
ALLOC_MODE_DEFAULT, /* stay default */
ALLOC_MODE_REUSE, /* reuse segments as much as possible */
};
@@ -3657,8 +3650,6 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi);
int __init f2fs_create_segment_manager_caches(void);
void f2fs_destroy_segment_manager_caches(void);
int f2fs_rw_hint_to_seg_type(enum rw_hint hint);
-enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
- enum page_type type, enum temp_type temp);
unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
unsigned int segno);
unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
@@ -4381,8 +4372,7 @@ static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi)
static inline bool f2fs_bdev_support_discard(struct block_device *bdev)
{
- return blk_queue_discard(bdev_get_queue(bdev)) ||
- bdev_is_zoned(bdev);
+ return bdev_max_discard_sectors(bdev) || bdev_is_zoned(bdev);
}
static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5b89af0f27f0..35b6c720c2bc 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2285,7 +2285,6 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret;
@@ -2304,7 +2303,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
return ret;
range.minlen = max((unsigned int)range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(sb->s_bdev));
ret = f2fs_trim_fs(F2FS_SB(sb), &range);
mnt_drop_write_file(filp);
if (ret < 0)
@@ -3686,18 +3685,18 @@ out:
static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode,
pgoff_t off, block_t block, block_t len, u32 flags)
{
- struct request_queue *q = bdev_get_queue(bdev);
sector_t sector = SECTOR_FROM_BLOCK(block);
sector_t nr_sects = SECTOR_FROM_BLOCK(len);
int ret = 0;
- if (!q)
- return -ENXIO;
-
- if (flags & F2FS_TRIM_FILE_DISCARD)
- ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS,
- blk_queue_secure_erase(q) ?
- BLKDEV_DISCARD_SECURE : 0);
+ if (flags & F2FS_TRIM_FILE_DISCARD) {
+ if (bdev_max_secure_erase_sectors(bdev))
+ ret = blkdev_issue_secure_erase(bdev, sector, nr_sects,
+ GFP_NOFS);
+ else
+ ret = blkdev_issue_discard(bdev, sector, nr_sects,
+ GFP_NOFS);
+ }
if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) {
if (IS_ENCRYPTED(inode))
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 71f232dcf3c2..83639238a1fe 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -550,7 +550,8 @@ make_now:
}
f2fs_set_inode_flags(inode);
- if (file_should_truncate(inode)) {
+ if (file_should_truncate(inode) &&
+ !is_sbi_flag_set(sbi, SBI_POR_DOING)) {
ret = f2fs_truncate(inode);
if (ret)
goto bad_inode;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 22dfeb991529..7225ce09f3ab 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1196,9 +1196,8 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
unsigned int *issued)
{
struct block_device *bdev = dc->bdev;
- struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_discard_blocks =
- SECTOR_TO_BLOCK(q->limits.max_discard_sectors);
+ SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev));
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
&(dcc->fstrim_list) : &(dcc->wait_list);
@@ -1245,7 +1244,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
err = __blkdev_issue_discard(bdev,
SECTOR_FROM_BLOCK(start),
SECTOR_FROM_BLOCK(len),
- GFP_NOFS, 0, &bio);
+ GFP_NOFS, &bio);
submit:
if (err) {
spin_lock_irqsave(&dc->lock, flags);
@@ -1375,9 +1374,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
struct discard_cmd *dc;
struct discard_info di = {0};
struct rb_node **insert_p = NULL, *insert_parent = NULL;
- struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_discard_blocks =
- SECTOR_TO_BLOCK(q->limits.max_discard_sectors);
+ SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev));
block_t end = lstart + len;
dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root,
@@ -3243,101 +3241,6 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
}
}
-/* This returns write hints for each segment type. This hints will be
- * passed down to block layer. There are mapping tables which depend on
- * the mount option 'whint_mode'.
- *
- * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.
- *
- * 2) whint_mode=user-based. F2FS tries to pass down hints given by users.
- *
- * User F2FS Block
- * ---- ---- -----
- * META WRITE_LIFE_NOT_SET
- * HOT_NODE "
- * WARM_NODE "
- * COLD_NODE "
- * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
- * extension list " "
- *
- * -- buffered io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " "
- * WRITE_LIFE_MEDIUM " "
- * WRITE_LIFE_LONG " "
- *
- * -- direct io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " WRITE_LIFE_NONE
- * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
- * WRITE_LIFE_LONG " WRITE_LIFE_LONG
- *
- * 3) whint_mode=fs-based. F2FS passes down hints with its policy.
- *
- * User F2FS Block
- * ---- ---- -----
- * META WRITE_LIFE_MEDIUM;
- * HOT_NODE WRITE_LIFE_NOT_SET
- * WARM_NODE "
- * COLD_NODE WRITE_LIFE_NONE
- * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
- * extension list " "
- *
- * -- buffered io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG
- * WRITE_LIFE_NONE " "
- * WRITE_LIFE_MEDIUM " "
- * WRITE_LIFE_LONG " "
- *
- * -- direct io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " WRITE_LIFE_NONE
- * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
- * WRITE_LIFE_LONG " WRITE_LIFE_LONG
- */
-
-enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
- enum page_type type, enum temp_type temp)
-{
- if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) {
- if (type == DATA) {
- if (temp == WARM)
- return WRITE_LIFE_NOT_SET;
- else if (temp == HOT)
- return WRITE_LIFE_SHORT;
- else if (temp == COLD)
- return WRITE_LIFE_EXTREME;
- } else {
- return WRITE_LIFE_NOT_SET;
- }
- } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) {
- if (type == DATA) {
- if (temp == WARM)
- return WRITE_LIFE_LONG;
- else if (temp == HOT)
- return WRITE_LIFE_SHORT;
- else if (temp == COLD)
- return WRITE_LIFE_EXTREME;
- } else if (type == NODE) {
- if (temp == WARM || temp == HOT)
- return WRITE_LIFE_NOT_SET;
- else if (temp == COLD)
- return WRITE_LIFE_NONE;
- } else if (type == META) {
- return WRITE_LIFE_MEDIUM;
- }
- }
- return WRITE_LIFE_NOT_SET;
-}
-
static int __get_segment_type_2(struct f2fs_io_info *fio)
{
if (fio->type == DATA)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index ea939db18f88..4368f90571bd 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -138,7 +138,6 @@ enum {
Opt_jqfmt_vfsold,
Opt_jqfmt_vfsv0,
Opt_jqfmt_vfsv1,
- Opt_whint,
Opt_alloc,
Opt_fsync,
Opt_test_dummy_encryption,
@@ -214,7 +213,6 @@ static match_table_t f2fs_tokens = {
{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
- {Opt_whint, "whint_mode=%s"},
{Opt_alloc, "alloc_mode=%s"},
{Opt_fsync, "fsync_mode=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
@@ -975,22 +973,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
f2fs_info(sbi, "quota operations not supported");
break;
#endif
- case Opt_whint:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "user-based")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER;
- } else if (!strcmp(name, "off")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
- } else if (!strcmp(name, "fs-based")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
case Opt_alloc:
name = match_strdup(&args[0]);
if (!name)
@@ -1328,12 +1310,6 @@ default_check:
return -EINVAL;
}
- /* Not pass down write hints if the number of active logs is lesser
- * than NR_CURSEG_PERSIST_TYPE.
- */
- if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE)
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
-
if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) {
f2fs_err(sbi, "Allow to mount readonly mode only");
return -EROFS;
@@ -1978,10 +1954,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",prjquota");
#endif
f2fs_show_quota_options(seq, sbi->sb);
- if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER)
- seq_printf(seq, ",whint_mode=%s", "user-based");
- else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
- seq_printf(seq, ",whint_mode=%s", "fs-based");
fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb);
@@ -2033,7 +2005,6 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE;
F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
@@ -2314,8 +2285,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_gc = true;
}
- if (*flags & SB_RDONLY ||
- F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) {
+ if (*flags & SB_RDONLY) {
sync_inodes_sb(sb);
set_sbi_flag(sbi, SBI_IS_DIRTY);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a5a309fcc7fa..bf91f977debe 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -127,13 +127,12 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg)
struct super_block *sb = inode->i_sb;
struct fstrim_range __user *user_range;
struct fstrim_range range;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
int err;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sb->s_bdev))
return -EOPNOTSUPP;
user_range = (struct fstrim_range __user *)arg;
@@ -141,7 +140,7 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg)
return -EFAULT;
range.minlen = max_t(unsigned int, range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(sb->s_bdev));
err = fat_trim_fs(inode, &range);
if (err < 0)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index bf6051bdf1d1..3d1afb95a925 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1872,13 +1872,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
goto out_fail;
}
- if (sbi->options.discard) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
- if (!blk_queue_discard(q))
- fat_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but "
- "the device does not support discard");
- }
+ if (sbi->options.discard && !bdev_max_discard_sectors(sb->s_bdev))
+ fat_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but the device does not support discard");
fat_set_state(sb, 1, 0);
return 0;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 591fe9cf1659..a1074a26e784 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1712,6 +1712,10 @@ static int writeback_single_inode(struct inode *inode,
*/
if (!(inode->i_state & I_DIRTY_ALL))
inode_cgwb_move_to_attached(inode, wb);
+ else if (!(inode->i_state & I_SYNC_QUEUED) &&
+ (inode->i_state & I_DIRTY))
+ redirty_tail_locked(inode, wb);
+
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
@@ -1775,11 +1779,12 @@ static long writeback_sb_inodes(struct super_block *sb,
};
unsigned long start_time = jiffies;
long write_chunk;
- long wrote = 0; /* count both pages and inodes */
+ long total_wrote = 0; /* count both pages and inodes */
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct bdi_writeback *tmp_wb;
+ long wrote;
if (inode->i_sb != sb) {
if (work->sb) {
@@ -1855,7 +1860,9 @@ static long writeback_sb_inodes(struct super_block *sb,
wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
- wrote += write_chunk - wbc.nr_to_write;
+ wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
+ wrote = wrote < 0 ? 0 : wrote;
+ total_wrote += wrote;
if (need_resched()) {
/*
@@ -1877,7 +1884,7 @@ static long writeback_sb_inodes(struct super_block *sb,
tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
- wrote++;
+ total_wrote++;
requeue_inode(inode, tmp_wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
@@ -1891,14 +1898,14 @@ static long writeback_sb_inodes(struct super_block *sb,
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
*/
- if (wrote) {
+ if (total_wrote) {
if (time_is_before_jiffies(start_time + HZ / 10UL))
break;
if (work->nr_pages <= 0)
break;
}
}
- return wrote;
+ return total_wrote;
}
static long __writeback_inodes_wb(struct bdi_writeback *wb,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 39080b2d6cf8..b6697333bb2b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1153,13 +1153,12 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (length != written && (iomap->flags & IOMAP_F_NEW)) {
/* Deallocate blocks that were just allocated. */
- loff_t blockmask = i_blocksize(inode) - 1;
- loff_t end = (pos + length) & ~blockmask;
+ loff_t hstart = round_up(pos + written, i_blocksize(inode));
+ loff_t hend = iomap->offset + iomap->length;
- pos = (pos + written + blockmask) & ~blockmask;
- if (pos < end) {
- truncate_pagecache_range(inode, pos, end - 1);
- punch_hole(ip, pos, end - pos);
+ if (hstart < hend) {
+ truncate_pagecache_range(inode, hstart, hend - 1);
+ punch_hole(ip, hstart, hend - hstart);
}
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 22b41acfbbc3..2556ae1f92ea 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -770,30 +770,27 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
return ret ? ret : ret1;
}
-static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
+static inline bool should_fault_in_pages(struct iov_iter *i,
+ struct kiocb *iocb,
size_t *prev_count,
size_t *window_size)
{
size_t count = iov_iter_count(i);
size_t size, offs;
- if (likely(!count))
- return false;
- if (ret <= 0 && ret != -EFAULT)
+ if (!count)
return false;
if (!iter_is_iovec(i))
return false;
size = PAGE_SIZE;
- offs = offset_in_page(i->iov[0].iov_base + i->iov_offset);
+ offs = offset_in_page(iocb->ki_pos);
if (*prev_count != count || !*window_size) {
size_t nr_dirtied;
- size = ALIGN(offs + count, PAGE_SIZE);
- size = min_t(size_t, size, SZ_1M);
nr_dirtied = max(current->nr_dirtied_pause -
current->nr_dirtied, 8);
- size = min(size, nr_dirtied << PAGE_SHIFT);
+ size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT);
}
*prev_count = count;
@@ -807,7 +804,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
struct file *file = iocb->ki_filp;
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
size_t prev_count = 0, window_size = 0;
- size_t written = 0;
+ size_t read = 0;
ssize_t ret;
/*
@@ -835,35 +832,31 @@ retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-retry_under_glock:
pagefault_disable();
to->nofault = true;
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
- IOMAP_DIO_PARTIAL, written);
+ IOMAP_DIO_PARTIAL, read);
to->nofault = false;
pagefault_enable();
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
if (ret > 0)
- written = ret;
-
- if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
- size_t leftover;
+ read = ret;
- gfs2_holder_allow_demote(gh);
- leftover = fault_in_iov_iter_writeable(to, window_size);
- gfs2_holder_disallow_demote(gh);
- if (leftover != window_size) {
- if (gfs2_holder_queued(gh))
- goto retry_under_glock;
+ if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ window_size -= fault_in_iov_iter_writeable(to, window_size);
+ if (window_size)
goto retry;
- }
}
+out_unlock:
if (gfs2_holder_queued(gh))
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
if (ret < 0)
return ret;
- return written;
+ return read;
}
static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
@@ -873,7 +866,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
struct inode *inode = file->f_mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
size_t prev_count = 0, window_size = 0;
- size_t read = 0;
+ size_t written = 0;
ssize_t ret;
/*
@@ -899,41 +892,37 @@ retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-retry_under_glock:
/* Silently fall back to buffered I/O when writing beyond EOF */
if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
- goto out;
+ goto out_unlock;
from->nofault = true;
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
- IOMAP_DIO_PARTIAL, read);
+ IOMAP_DIO_PARTIAL, written);
from->nofault = false;
-
- if (ret == -ENOTBLK)
- ret = 0;
+ if (ret <= 0) {
+ if (ret == -ENOTBLK)
+ ret = 0;
+ if (ret != -EFAULT)
+ goto out_unlock;
+ }
if (ret > 0)
- read = ret;
-
- if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
- size_t leftover;
+ written = ret;
- gfs2_holder_allow_demote(gh);
- leftover = fault_in_iov_iter_readable(from, window_size);
- gfs2_holder_disallow_demote(gh);
- if (leftover != window_size) {
- if (gfs2_holder_queued(gh))
- goto retry_under_glock;
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ window_size -= fault_in_iov_iter_readable(from, window_size);
+ if (window_size)
goto retry;
- }
}
-out:
+out_unlock:
if (gfs2_holder_queued(gh))
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
if (ret < 0)
return ret;
- return read;
+ return written;
}
static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -941,7 +930,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct gfs2_inode *ip;
struct gfs2_holder gh;
size_t prev_count = 0, window_size = 0;
- size_t written = 0;
+ size_t read = 0;
ssize_t ret;
/*
@@ -962,7 +951,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (ret >= 0) {
if (!iov_iter_count(to))
return ret;
- written = ret;
+ read = ret;
} else if (ret != -EFAULT) {
if (ret != -EAGAIN)
return ret;
@@ -975,32 +964,26 @@ retry:
ret = gfs2_glock_nq(&gh);
if (ret)
goto out_uninit;
-retry_under_glock:
pagefault_disable();
ret = generic_file_read_iter(iocb, to);
pagefault_enable();
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
if (ret > 0)
- written += ret;
+ read += ret;
- if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
- size_t leftover;
-
- gfs2_holder_allow_demote(&gh);
- leftover = fault_in_iov_iter_writeable(to, window_size);
- gfs2_holder_disallow_demote(&gh);
- if (leftover != window_size) {
- if (gfs2_holder_queued(&gh))
- goto retry_under_glock;
- if (written)
- goto out_uninit;
+ if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(&gh);
+ window_size -= fault_in_iov_iter_writeable(to, window_size);
+ if (window_size)
goto retry;
- }
}
+out_unlock:
if (gfs2_holder_queued(&gh))
gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
- return written ? written : ret;
+ return read ? read : ret;
}
static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
@@ -1014,7 +997,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
struct gfs2_holder *statfs_gh = NULL;
size_t prev_count = 0, window_size = 0;
size_t orig_count = iov_iter_count(from);
- size_t read = 0;
+ size_t written = 0;
ssize_t ret;
/*
@@ -1032,10 +1015,18 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
retry:
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ window_size -= fault_in_iov_iter_readable(from, window_size);
+ if (!window_size) {
+ ret = -EFAULT;
+ goto out_uninit;
+ }
+ from->count = min(from->count, window_size);
+ }
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-retry_under_glock:
+
if (inode == sdp->sd_rindex) {
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -1052,27 +1043,19 @@ retry_under_glock:
current->backing_dev_info = NULL;
if (ret > 0) {
iocb->ki_pos += ret;
- read += ret;
+ written += ret;
}
if (inode == sdp->sd_rindex)
gfs2_glock_dq_uninit(statfs_gh);
- from->count = orig_count - read;
- if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
- size_t leftover;
-
- gfs2_holder_allow_demote(gh);
- leftover = fault_in_iov_iter_readable(from, window_size);
- gfs2_holder_disallow_demote(gh);
- if (leftover != window_size) {
- from->count = min(from->count, window_size - leftover);
- if (gfs2_holder_queued(gh))
- goto retry_under_glock;
- if (read && !(iocb->ki_flags & IOCB_DIRECT))
- goto out_uninit;
- goto retry;
- }
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
+
+ from->count = orig_count - written;
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ goto retry;
}
out_unlock:
if (gfs2_holder_queued(gh))
@@ -1081,8 +1064,8 @@ out_uninit:
gfs2_holder_uninit(gh);
if (statfs_gh)
kfree(statfs_gh);
- from->count = orig_count - read;
- return read ? read : ret;
+ from->count = orig_count - written;
+ return written ? written : ret;
}
/**
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 801ad9f4f2be..6d26bb525484 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1386,7 +1386,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
{
struct inode *inode = file_inode(filp);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
+ struct block_device *bdev = sdp->sd_vfs->s_bdev;
struct buffer_head *bh;
struct gfs2_rgrpd *rgd;
struct gfs2_rgrpd *rgd_end;
@@ -1405,7 +1405,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
return -EROFS;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (copy_from_user(&r, argp, sizeof(r)))
@@ -1418,8 +1418,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
start = r.start >> bs_shift;
end = start + (r.len >> bs_shift);
minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
- minlen = max_t(u64, minlen,
- q->limits.discard_granularity) >> bs_shift;
+ minlen = max_t(u64, minlen, bdev_discard_granularity(bdev)) >> bs_shift;
if (end <= start || minlen > sdp->sd_max_rg_data)
return -EINVAL;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 99c7477cee5c..dd3a088db11d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
@@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = current->mm->mmap_base;
+ info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
@@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr);
addr = vm_unmapped_area(&info);
}
@@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (mmap_end - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/fs/internal.h b/fs/internal.h
index 08503dc68d2b..9a6c233ee7f1 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -191,3 +191,32 @@ long splice_file_to_pipe(struct file *in,
struct pipe_inode_info *opipe,
loff_t *offset,
size_t len, unsigned int flags);
+
+/*
+ * fs/xattr.c:
+ */
+struct xattr_name {
+ char name[XATTR_NAME_MAX + 1];
+};
+
+struct xattr_ctx {
+ /* Value of attribute */
+ union {
+ const void __user *cvalue;
+ void __user *value;
+ };
+ void *kvalue;
+ size_t size;
+ /* Attribute name */
+ struct xattr_name *kname;
+ unsigned int flags;
+};
+
+
+ssize_t do_getxattr(struct user_namespace *mnt_userns,
+ struct dentry *d,
+ struct xattr_ctx *ctx);
+
+int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
+int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct xattr_ctx *ctx);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 32aeb2c581c5..824623bcf1a5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
- set_notify_signal(worker->task);
+ __set_notify_signal(worker->task);
wake_up_process(worker->task);
return false;
}
@@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
{
if (work && match->fn(work, match->data)) {
work->flags |= IO_WQ_WORK_CANCEL;
- set_notify_signal(worker->task);
+ __set_notify_signal(worker->task);
return true;
}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index dbecd27656c7..ba6eee76d028 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
struct io_wq_work {
struct io_wq_work_node list;
unsigned flags;
+ int cancel_seq;
};
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4479013854d2..9f1c682d7caf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -80,6 +80,7 @@
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>
+#include <linux/xattr.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -94,7 +95,7 @@
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
/* only define max */
-#define IORING_MAX_FIXED_FILES (1U << 15)
+#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -113,6 +114,11 @@
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
+#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
+ IO_REQ_CLEAN_FLAGS)
+
+#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
+
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
struct io_uring {
@@ -166,7 +172,7 @@ struct io_rings {
* The application needs a full memory barrier before checking
* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
*/
- u32 sq_flags;
+ atomic_t sq_flags;
/*
* Runtime CQ flags
*
@@ -198,13 +204,6 @@ struct io_rings {
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
-enum io_uring_cmd_flags {
- IO_URING_F_COMPLETE_DEFER = 1,
- IO_URING_F_UNLOCKED = 2,
- /* int's last bit, sign checks are usually faster than a bit test */
- IO_URING_F_NONBLOCK = INT_MIN,
-};
-
struct io_mapped_ubuf {
u64 ubuf;
u64 ubuf_end;
@@ -216,10 +215,27 @@ struct io_mapped_ubuf {
struct io_ring_ctx;
struct io_overflow_cqe {
- struct io_uring_cqe cqe;
struct list_head list;
+ struct io_uring_cqe cqe;
};
+/*
+ * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0
+ * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we
+ * can't safely always dereference the file when the task has exited and ring
+ * cleanup is done. If a file is tracked and part of SCM, then unix gc on
+ * process exit may reap it before __io_sqe_files_unregister() is run.
+ */
+#define FFS_NOWAIT 0x1UL
+#define FFS_ISREG 0x2UL
+#if defined(CONFIG_64BIT)
+#define FFS_SCM 0x4UL
+#else
+#define IO_URING_SCM_ALL
+#define FFS_SCM 0x0UL
+#endif
+#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)
+
struct io_fixed_file {
/* file * with additional FFS_* flags */
unsigned long file_ptr;
@@ -237,6 +253,8 @@ struct io_rsrc_put {
struct io_file_table {
struct io_fixed_file *files;
+ unsigned long *bitmap;
+ unsigned int alloc_hint;
};
struct io_rsrc_node {
@@ -261,10 +279,26 @@ struct io_rsrc_data {
bool quiesce;
};
+#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
struct io_buffer_list {
- struct list_head list;
- struct list_head buf_list;
+ /*
+ * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
+ * then these are classic provided buffers and ->buf_list is used.
+ */
+ union {
+ struct list_head buf_list;
+ struct {
+ struct page **buf_pages;
+ struct io_uring_buf_ring *buf_ring;
+ };
+ };
__u16 bgid;
+
+ /* below is for ring provided buffers */
+ __u16 buf_nr_pages;
+ __u16 nr_entries;
+ __u32 head;
+ __u32 mask;
};
struct io_buffer {
@@ -337,7 +371,7 @@ struct io_ev_fd {
struct rcu_head rcu;
};
-#define IO_BUFFERS_HASH_BITS 5
+#define BGID_ARRAY 64
struct io_ring_ctx {
/* const or read-mostly hot data */
@@ -346,6 +380,7 @@ struct io_ring_ctx {
struct io_rings *rings;
unsigned int flags;
+ enum task_work_notify_mode notify_method;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int restricted: 1;
@@ -353,6 +388,7 @@ struct io_ring_ctx {
unsigned int drain_active: 1;
unsigned int drain_disabled: 1;
unsigned int has_evfd: 1;
+ unsigned int syscall_iopoll: 1;
} ____cacheline_aligned_in_smp;
/* submission data */
@@ -382,17 +418,21 @@ struct io_ring_ctx {
*/
struct io_rsrc_node *rsrc_node;
int rsrc_cached_refs;
+ atomic_t cancel_seq;
struct io_file_table file_table;
unsigned nr_user_files;
unsigned nr_user_bufs;
struct io_mapped_ubuf **user_bufs;
struct io_submit_state submit_state;
+
+ struct io_buffer_list *io_bl;
+ struct xarray io_bl_xa;
+ struct list_head io_buffers_cache;
+
struct list_head timeout_list;
struct list_head ltimeout_list;
struct list_head cq_overflow_list;
- struct list_head *io_buffers;
- struct list_head io_buffers_cache;
struct list_head apoll_cache;
struct xarray personalities;
u32 pers_next;
@@ -409,9 +449,16 @@ struct io_ring_ctx {
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
- unsigned long check_cq_overflow;
+ unsigned long check_cq;
struct {
+ /*
+ * We cache a range of free CQEs we can use, once exhausted it
+ * should go through a slower range setup, see __io_get_cqe()
+ */
+ struct io_uring_cqe *cqe_cached;
+ struct io_uring_cqe *cqe_sentinel;
+
unsigned cached_cq_tail;
unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd;
@@ -497,7 +544,7 @@ struct io_uring_task {
spinlock_t task_lock;
struct io_wq_work_list task_list;
- struct io_wq_work_list prior_task_list;
+ struct io_wq_work_list prio_task_list;
struct callback_head task_work;
struct file **registered_rings;
bool task_running;
@@ -546,6 +593,16 @@ struct io_accept {
unsigned long nofile;
};
+struct io_socket {
+ struct file *file;
+ int domain;
+ int type;
+ int protocol;
+ int flags;
+ u32 file_slot;
+ unsigned long nofile;
+};
+
struct io_sync {
struct file *file;
loff_t len;
@@ -557,6 +614,8 @@ struct io_sync {
struct io_cancel {
struct file *file;
u64 addr;
+ u32 flags;
+ s32 fd;
};
struct io_timeout {
@@ -585,7 +644,7 @@ struct io_rw {
struct kiocb kiocb;
u64 addr;
u32 len;
- u32 flags;
+ rwf_t flags;
};
struct io_connect {
@@ -602,9 +661,9 @@ struct io_sr_msg {
void __user *buf;
};
int msg_flags;
- int bgid;
size_t len;
size_t done_io;
+ unsigned int flags;
};
struct io_open {
@@ -722,6 +781,12 @@ struct io_msg {
u32 len;
};
+struct io_nop {
+ struct file *file;
+ u64 extra1;
+ u64 extra2;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -748,6 +813,12 @@ struct io_async_rw {
struct wait_page_queue wpq;
};
+struct io_xattr {
+ struct file *file;
+ struct xattr_ctx ctx;
+ struct filename *filename;
+};
+
enum {
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
@@ -766,6 +837,7 @@ enum {
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
+ REQ_F_BUFFER_RING_BIT,
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
REQ_F_CREDS_BIT,
@@ -776,6 +848,7 @@ enum {
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT,
+ REQ_F_APOLL_MULTISHOT_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
@@ -816,6 +889,8 @@ enum {
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
+ /* buffer selected from ring, needs commit */
+ REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
/* completion is deferred through io_comp_state */
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
@@ -840,6 +915,8 @@ enum {
REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
/* request has already done partial IO */
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
+ /* fast poll multishot mode */
+ REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
};
struct async_poll {
@@ -862,6 +939,21 @@ enum {
IORING_RSRC_BUFFER = 1,
};
+struct io_cqe {
+ __u64 user_data;
+ __s32 res;
+ /* fd initially, then cflags for completion */
+ union {
+ __u32 flags;
+ int fd;
+ };
+};
+
+enum {
+ IO_CHECK_CQ_OVERFLOW_BIT,
+ IO_CHECK_CQ_DROPPED_BIT,
+};
+
/*
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
@@ -897,46 +989,65 @@ struct io_kiocb {
struct io_symlink symlink;
struct io_hardlink hardlink;
struct io_msg msg;
+ struct io_xattr xattr;
+ struct io_socket sock;
+ struct io_nop nop;
+ struct io_uring_cmd uring_cmd;
};
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
+ /*
+ * Can be either a fixed buffer index, or used with provided buffers.
+ * For the latter, before issue it points to the buffer group ID,
+ * and after selection it points to the buffer ID itself.
+ */
u16 buf_index;
unsigned int flags;
- u64 user_data;
- u32 result;
- /* fd initially, then cflags for completion */
- union {
- u32 cflags;
- int fd;
- };
+ struct io_cqe cqe;
struct io_ring_ctx *ctx;
struct task_struct *task;
- struct percpu_ref *fixed_rsrc_refs;
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
+ struct io_rsrc_node *rsrc_node;
+
+ union {
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
+
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
+
+ /*
+ * stores buffer ID for ring provided buffers, valid IFF
+ * REQ_F_BUFFER_RING is set.
+ */
+ struct io_buffer_list *buf_list;
+ };
union {
/* used by request caches, completion batching and iopoll */
struct io_wq_work_node comp_list;
/* cache ->apoll->events */
- int apoll_events;
+ __poll_t apoll_events;
};
atomic_t refs;
atomic_t poll_refs;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
- struct hlist_node hash_node;
+ union {
+ struct hlist_node hash_node;
+ struct {
+ u64 extra1;
+ u64 extra2;
+ };
+ };
/* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
- /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
- struct io_buffer *kbuf;
/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
struct io_kiocb *link;
/* custom credentials, valid IFF REQ_F_CREDS is set */
@@ -956,6 +1067,24 @@ struct io_defer_entry {
u32 seq;
};
+struct io_cancel_data {
+ struct io_ring_ctx *ctx;
+ union {
+ u64 data;
+ struct file *file;
+ };
+ u32 flags;
+ int seq;
+};
+
+/*
+ * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into
+ * the following sqe if SQE128 is used.
+ */
+#define uring_cmd_pdu_size(is_sqe128) \
+ ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \
+ offsetof(struct io_uring_sqe, cmd))
+
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
@@ -977,12 +1106,20 @@ struct io_op_def {
unsigned not_supported : 1;
/* skip auditing */
unsigned audit_skip : 1;
+ /* supports ioprio */
+ unsigned ioprio : 1;
+ /* supports iopoll */
+ unsigned iopoll : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
static const struct io_op_def io_op_defs[] = {
- [IORING_OP_NOP] = {},
+ [IORING_OP_NOP] = {
+ .audit_skip = 1,
+ .iopoll = 1,
+ .buffer_select = 1,
+ },
[IORING_OP_READV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -991,6 +1128,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_setup = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITEV] = {
@@ -1001,6 +1140,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_setup = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FSYNC] = {
@@ -1013,6 +1154,8 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE_FIXED] = {
@@ -1022,6 +1165,8 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_POLL_ADD] = {
@@ -1064,6 +1209,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.poll_exclusive = 1,
+ .ioprio = 1, /* used for flags */
},
[IORING_OP_ASYNC_CANCEL] = {
.audit_skip = 1,
@@ -1086,6 +1232,7 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_CLOSE] = {},
[IORING_OP_FILES_UPDATE] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_STATX] = {
.audit_skip = 1,
@@ -1097,6 +1244,8 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE] = {
@@ -1106,6 +1255,8 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FADVISE] = {
@@ -1140,9 +1291,11 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_PROVIDE_BUFFERS] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_TEE] = {
.needs_file = 1,
@@ -1160,11 +1313,30 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_LINKAT] = {},
[IORING_OP_MSG_RING] = {
.needs_file = 1,
+ .iopoll = 1,
+ },
+ [IORING_OP_FSETXATTR] = {
+ .needs_file = 1
+ },
+ [IORING_OP_SETXATTR] = {},
+ [IORING_OP_FGETXATTR] = {
+ .needs_file = 1
+ },
+ [IORING_OP_GETXATTR] = {},
+ [IORING_OP_SOCKET] = {
+ .audit_skip = 1,
+ },
+ [IORING_OP_URING_CMD] = {
+ .needs_file = 1,
+ .plug = 1,
+ .needs_async_setup = 1,
+ .async_size = uring_cmd_pdu_size(1),
},
};
/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
+#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
@@ -1173,10 +1345,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);
-
-static void io_put_req(struct io_kiocb *req);
-static void io_put_req_deferred(struct io_kiocb *req);
+static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
@@ -1185,10 +1354,10 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
static void io_clean_op(struct io_kiocb *req);
static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
-static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
+static struct file *io_file_get_normal(struct io_kiocb *req, int fd);
static void io_drop_inflight_file(struct io_kiocb *req);
static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
-static void __io_queue_sqe(struct io_kiocb *req);
+static void io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
@@ -1201,11 +1370,115 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
static void io_eventfd_signal(struct io_ring_ctx *ctx);
+static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
static struct kmem_cache *req_cachep;
static const struct file_operations io_uring_fops;
+const char *io_uring_get_opcode(u8 opcode)
+{
+ switch ((enum io_uring_op)opcode) {
+ case IORING_OP_NOP:
+ return "NOP";
+ case IORING_OP_READV:
+ return "READV";
+ case IORING_OP_WRITEV:
+ return "WRITEV";
+ case IORING_OP_FSYNC:
+ return "FSYNC";
+ case IORING_OP_READ_FIXED:
+ return "READ_FIXED";
+ case IORING_OP_WRITE_FIXED:
+ return "WRITE_FIXED";
+ case IORING_OP_POLL_ADD:
+ return "POLL_ADD";
+ case IORING_OP_POLL_REMOVE:
+ return "POLL_REMOVE";
+ case IORING_OP_SYNC_FILE_RANGE:
+ return "SYNC_FILE_RANGE";
+ case IORING_OP_SENDMSG:
+ return "SENDMSG";
+ case IORING_OP_RECVMSG:
+ return "RECVMSG";
+ case IORING_OP_TIMEOUT:
+ return "TIMEOUT";
+ case IORING_OP_TIMEOUT_REMOVE:
+ return "TIMEOUT_REMOVE";
+ case IORING_OP_ACCEPT:
+ return "ACCEPT";
+ case IORING_OP_ASYNC_CANCEL:
+ return "ASYNC_CANCEL";
+ case IORING_OP_LINK_TIMEOUT:
+ return "LINK_TIMEOUT";
+ case IORING_OP_CONNECT:
+ return "CONNECT";
+ case IORING_OP_FALLOCATE:
+ return "FALLOCATE";
+ case IORING_OP_OPENAT:
+ return "OPENAT";
+ case IORING_OP_CLOSE:
+ return "CLOSE";
+ case IORING_OP_FILES_UPDATE:
+ return "FILES_UPDATE";
+ case IORING_OP_STATX:
+ return "STATX";
+ case IORING_OP_READ:
+ return "READ";
+ case IORING_OP_WRITE:
+ return "WRITE";
+ case IORING_OP_FADVISE:
+ return "FADVISE";
+ case IORING_OP_MADVISE:
+ return "MADVISE";
+ case IORING_OP_SEND:
+ return "SEND";
+ case IORING_OP_RECV:
+ return "RECV";
+ case IORING_OP_OPENAT2:
+ return "OPENAT2";
+ case IORING_OP_EPOLL_CTL:
+ return "EPOLL_CTL";
+ case IORING_OP_SPLICE:
+ return "SPLICE";
+ case IORING_OP_PROVIDE_BUFFERS:
+ return "PROVIDE_BUFFERS";
+ case IORING_OP_REMOVE_BUFFERS:
+ return "REMOVE_BUFFERS";
+ case IORING_OP_TEE:
+ return "TEE";
+ case IORING_OP_SHUTDOWN:
+ return "SHUTDOWN";
+ case IORING_OP_RENAMEAT:
+ return "RENAMEAT";
+ case IORING_OP_UNLINKAT:
+ return "UNLINKAT";
+ case IORING_OP_MKDIRAT:
+ return "MKDIRAT";
+ case IORING_OP_SYMLINKAT:
+ return "SYMLINKAT";
+ case IORING_OP_LINKAT:
+ return "LINKAT";
+ case IORING_OP_MSG_RING:
+ return "MSG_RING";
+ case IORING_OP_FSETXATTR:
+ return "FSETXATTR";
+ case IORING_OP_SETXATTR:
+ return "SETXATTR";
+ case IORING_OP_FGETXATTR:
+ return "FGETXATTR";
+ case IORING_OP_GETXATTR:
+ return "GETXATTR";
+ case IORING_OP_SOCKET:
+ return "SOCKET";
+ case IORING_OP_URING_CMD:
+ return "URING_CMD";
+ case IORING_OP_LAST:
+ return "INVALID";
+ }
+ return "INVALID";
+}
+
struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
@@ -1219,6 +1492,42 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+#if defined(CONFIG_UNIX)
+static inline bool io_file_need_scm(struct file *filp)
+{
+#if defined(IO_URING_SCM_ALL)
+ return true;
+#else
+ return !!unix_get_socket(filp);
+#endif
+}
+#else
+static inline bool io_file_need_scm(struct file *filp)
+{
+ return false;
+}
+#endif
+
+static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags)
+{
+ lockdep_assert_held(&ctx->uring_lock);
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
+}
+
+static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags)
+{
+ /*
+ * "Normal" inline submissions always hold the uring_lock, since we
+ * grab it from the system call. Same is true for the SQPOLL offload.
+ * The only exception is when we've detached the request and issue it
+ * from an async worker thread, grab the lock for that case.
+ */
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_lock(&ctx->uring_lock);
+ lockdep_assert_held(&ctx->uring_lock);
+}
+
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
if (!*locked) {
@@ -1280,31 +1589,36 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
#define IO_RSRC_REF_BATCH 100
+static void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
+{
+ percpu_ref_put_many(&node->refs, nr);
+}
+
static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
- struct percpu_ref *ref = req->fixed_rsrc_refs;
+ struct io_rsrc_node *node = req->rsrc_node;
- if (ref) {
- if (ref == &ctx->rsrc_node->refs)
+ if (node) {
+ if (node == ctx->rsrc_node)
ctx->rsrc_cached_refs++;
else
- percpu_ref_put(ref);
+ io_rsrc_put_node(node, 1);
}
}
-static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+static inline void io_req_put_rsrc(struct io_kiocb *req)
{
- if (req->fixed_rsrc_refs)
- percpu_ref_put(req->fixed_rsrc_refs);
+ if (req->rsrc_node)
+ io_rsrc_put_node(req->rsrc_node, 1);
}
static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
if (ctx->rsrc_cached_refs) {
- percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+ io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
ctx->rsrc_cached_refs = 0;
}
}
@@ -1320,8 +1634,8 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
struct io_ring_ctx *ctx,
unsigned int issue_flags)
{
- if (!req->fixed_rsrc_refs) {
- req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
+ if (!req->rsrc_node) {
+ req->rsrc_node = ctx->rsrc_node;
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
lockdep_assert_held(&ctx->uring_lock);
@@ -1329,28 +1643,30 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
if (unlikely(ctx->rsrc_cached_refs < 0))
io_rsrc_refs_refill(ctx);
} else {
- percpu_ref_get(req->fixed_rsrc_refs);
+ percpu_ref_get(&req->rsrc_node->refs);
}
}
}
static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
- struct io_buffer *kbuf = req->kbuf;
- unsigned int cflags;
+ if (req->flags & REQ_F_BUFFER_RING) {
+ if (req->buf_list)
+ req->buf_list->head++;
+ req->flags &= ~REQ_F_BUFFER_RING;
+ } else {
+ list_add(&req->kbuf->list, list);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ }
- cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
- req->flags &= ~REQ_F_BUFFER_SELECTED;
- list_add(&kbuf->list, list);
- req->kbuf = NULL;
- return cflags;
+ return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
}
static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
lockdep_assert_held(&req->ctx->completion_lock);
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
}
@@ -1360,7 +1676,7 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
{
unsigned int cflags;
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
/*
@@ -1375,7 +1691,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
* We migrate buffers from the comp_list to the issue cache list
* when we need one.
*/
- if (issue_flags & IO_URING_F_UNLOCKED) {
+ if (req->flags & REQ_F_BUFFER_RING) {
+ /* no buffers to recycle for this case */
+ cflags = __io_put_kbuf(req, NULL);
+ } else if (issue_flags & IO_URING_F_UNLOCKED) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock);
@@ -1393,15 +1712,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
- struct list_head *hash_list;
- struct io_buffer_list *bl;
+ if (ctx->io_bl && bgid < BGID_ARRAY)
+ return &ctx->io_bl[bgid];
- hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
- list_for_each_entry(bl, hash_list, list)
- if (bl->bgid == bgid || bgid == -1U)
- return bl;
-
- return NULL;
+ return xa_load(&ctx->io_bl_xa, bgid);
}
static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
@@ -1410,25 +1724,33 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
struct io_buffer_list *bl;
struct io_buffer *buf;
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return;
/* don't recycle if we already did IO to this buffer */
if (req->flags & REQ_F_PARTIAL_IO)
return;
+ /*
+ * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
+ * the flag and hence ensure that bl->head doesn't get incremented.
+ * If the tail has already been incremented, hang on to it.
+ */
+ if (req->flags & REQ_F_BUFFER_RING) {
+ if (req->buf_list) {
+ req->buf_index = req->buf_list->bgid;
+ req->flags &= ~REQ_F_BUFFER_RING;
+ }
+ return;
+ }
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_lock(&ctx->uring_lock);
-
- lockdep_assert_held(&ctx->uring_lock);
+ io_ring_submit_lock(ctx, issue_flags);
buf = req->kbuf;
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- req->kbuf = NULL;
+ req->buf_index = buf->bgid;
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_unlock(&ctx->uring_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
}
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
@@ -1469,7 +1791,12 @@ static inline void req_set_fail(struct io_kiocb *req)
static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
req_set_fail(req);
- req->result = res;
+ req->cqe.res = res;
+}
+
+static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}
static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -1506,12 +1833,14 @@ static __cold void io_fallback_req_func(struct work_struct *work)
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int i, hash_bits;
+ int hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return NULL;
+ xa_init(&ctx->io_bl_xa);
+
/*
* Use 5 bits less than the max cq entries, that should give us around
* 32 entries per hash list if totally full and uniformly spread.
@@ -1533,13 +1862,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
- ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
- sizeof(struct list_head), GFP_KERNEL);
- if (!ctx->io_buffers)
- goto err;
- for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
- INIT_LIST_HEAD(&ctx->io_buffers[i]);
-
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -1575,7 +1897,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_hash);
- kfree(ctx->io_buffers);
+ kfree(ctx->io_bl);
+ xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
}
@@ -1599,10 +1922,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-#define FFS_NOWAIT 0x1UL
-#define FFS_ISREG 0x2UL
-#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
-
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
return req->flags & REQ_F_FIXED_FILE;
@@ -1629,6 +1948,17 @@ static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
return __io_prep_linked_timeout(req);
}
+static noinline void __io_arm_ltimeout(struct io_kiocb *req)
+{
+ io_queue_linked_timeout(__io_prep_linked_timeout(req));
+}
+
+static inline void io_arm_ltimeout(struct io_kiocb *req)
+{
+ if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
+ __io_arm_ltimeout(req);
+}
+
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -1641,6 +1971,7 @@ static void io_prep_async_work(struct io_kiocb *req)
req->work.list.next = NULL;
req->work.flags = 0;
+ req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
if (req->flags & REQ_F_FORCE_ASYNC)
req->work.flags |= IO_WQ_WORK_CONCURRENT;
@@ -1672,17 +2003,15 @@ static void io_prep_async_link(struct io_kiocb *req)
static inline void io_req_add_compl_list(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
+ struct io_submit_state *state = &req->ctx->submit_state;
if (!(req->flags & REQ_F_CQE_SKIP))
- ctx->submit_state.flush_cqes = true;
+ state->flush_cqes = true;
wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}
-static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
+static void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
@@ -1702,8 +2031,9 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL;
- trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
- &req->work, io_wq_is_hashed(&req->work));
+ trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
+ req->opcode, req->flags, &req->work,
+ io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
@@ -1721,8 +2051,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
- io_fill_cqe_req(req, status, 0);
- io_put_req_deferred(req);
+ io_req_tw_post_queue(req, status, 0);
}
}
@@ -1804,21 +2133,53 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+/*
+ * writes to the cq entry need to come after reading head; the
+ * control dependency is enough as we're using WRITE_ONCE to
+ * fill the cq entry
+ */
+static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
- unsigned tail, mask = ctx->cq_entries - 1;
-
- /*
- * writes to the cq entry need to come after reading head; the
- * control dependency is enough as we're using WRITE_ONCE to
- * fill the cq entry
- */
- if (__io_cqring_events(ctx) == ctx->cq_entries)
+ unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
+ unsigned int shift = 0;
+ unsigned int free, queued, len;
+
+ if (ctx->flags & IORING_SETUP_CQE32)
+ shift = 1;
+
+ /* userspace may cheat modifying the tail, be safe and do min */
+ queued = min(__io_cqring_events(ctx), ctx->cq_entries);
+ free = ctx->cq_entries - queued;
+ /* we need a contiguous range, limit based on the current array offset */
+ len = min(free, ctx->cq_entries - off);
+ if (!len)
return NULL;
- tail = ctx->cached_cq_tail++;
- return &rings->cqes[tail & mask];
+ ctx->cached_cq_tail++;
+ ctx->cqe_cached = &rings->cqes[off];
+ ctx->cqe_sentinel = ctx->cqe_cached + len;
+ ctx->cqe_cached++;
+ return &rings->cqes[off << shift];
+}
+
+static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
+{
+ if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
+ struct io_uring_cqe *cqe = ctx->cqe_cached;
+
+ if (ctx->flags & IORING_SETUP_CQE32) {
+ unsigned int off = ctx->cqe_cached - ctx->rings->cqes;
+
+ cqe += off;
+ }
+
+ ctx->cached_cq_tail++;
+ ctx->cqe_cached++;
+ return cqe;
+ }
+
+ return __io_get_cqe(ctx);
}
static void io_eventfd_signal(struct io_ring_ctx *ctx)
@@ -1889,10 +2250,14 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
bool all_flushed, posted;
+ size_t cqe_size = sizeof(struct io_uring_cqe);
if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
return false;
+ if (ctx->flags & IORING_SETUP_CQE32)
+ cqe_size <<= 1;
+
posted = false;
spin_lock(&ctx->completion_lock);
while (!list_empty(&ctx->cq_overflow_list)) {
@@ -1904,7 +2269,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list);
if (cqe)
- memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+ memcpy(cqe, &ocqe->cqe, cqe_size);
else
io_account_cq_overflow(ctx);
@@ -1915,13 +2280,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
- clear_bit(0, &ctx->check_cq_overflow);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
+ clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
+ atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
- if (posted)
- io_commit_cqring(ctx);
+ io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
if (posted)
io_cqring_ev_posted(ctx);
@@ -1932,7 +2295,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
bool ret = true;
- if (test_bit(0, &ctx->check_cq_overflow)) {
+ if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
@@ -1944,19 +2307,23 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
return ret;
}
-/* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
+static void __io_put_task(struct task_struct *task, int nr)
{
struct io_uring_task *tctx = task->io_uring;
- if (likely(task == current)) {
- tctx->cached_refs += nr;
- } else {
- percpu_counter_sub(&tctx->inflight, nr);
- if (unlikely(atomic_read(&tctx->in_idle)))
- wake_up(&tctx->wait);
- put_task_struct_many(task, nr);
- }
+ percpu_counter_sub(&tctx->inflight, nr);
+ if (unlikely(atomic_read(&tctx->in_idle)))
+ wake_up(&tctx->wait);
+ put_task_struct_many(task, nr);
+}
+
+/* must to be called somewhat shortly after putting a request */
+static inline void io_put_task(struct task_struct *task, int nr)
+{
+ if (likely(task == current))
+ task->io_uring->cached_refs += nr;
+ else
+ __io_put_task(task, nr);
}
static void io_task_refs_refill(struct io_uring_task *tctx)
@@ -1990,11 +2357,18 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags)
+ s32 res, u32 cflags, u64 extra1,
+ u64 extra2)
{
struct io_overflow_cqe *ocqe;
+ size_t ocq_size = sizeof(struct io_overflow_cqe);
+ bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+
+ if (is_cqe32)
+ ocq_size += sizeof(struct io_uring_cqe);
- ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+ ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
+ trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
if (!ocqe) {
/*
* If we're in ring overflow flush mode, or in task cancel mode,
@@ -2002,17 +2376,21 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
* on the floor.
*/
io_account_cq_overflow(ctx);
+ set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
return false;
}
if (list_empty(&ctx->cq_overflow_list)) {
- set_bit(0, &ctx->check_cq_overflow);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+ set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
+ atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
ocqe->cqe.user_data = user_data;
ocqe->cqe.res = res;
ocqe->cqe.flags = cflags;
+ if (is_cqe32) {
+ ocqe->cqe.big_cqe[0] = extra1;
+ ocqe->cqe.big_cqe[1] = extra2;
+ }
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
return true;
}
@@ -2034,42 +2412,114 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
WRITE_ONCE(cqe->flags, cflags);
return true;
}
- return io_cqring_event_overflow(ctx, user_data, res, cflags);
+ return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
+}
+
+static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ struct io_uring_cqe *cqe;
+
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags, 0, 0);
+
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqe(ctx);
+ if (likely(cqe)) {
+ memcpy(cqe, &req->cqe, sizeof(*cqe));
+ return true;
+ }
+ return io_cqring_event_overflow(ctx, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags, 0, 0);
+}
+
+static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ struct io_uring_cqe *cqe;
+ u64 extra1 = req->extra1;
+ u64 extra2 = req->extra2;
+
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags, extra1, extra2);
+
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqe(ctx);
+ if (likely(cqe)) {
+ memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
+ cqe->big_cqe[0] = extra1;
+ cqe->big_cqe[1] = extra2;
+ return true;
+ }
+
+ return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res,
+ req->cqe.flags, extra1, extra2);
}
static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
- trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
- return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0);
+ return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
}
-static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags,
+ u64 extra1, u64 extra2)
{
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, res, cflags);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_cqe *cqe;
+
+ if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+ return;
+ if (req->flags & REQ_F_CQE_SKIP)
+ return;
+
+ trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags,
+ extra1, extra2);
+
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqe(ctx);
+ if (likely(cqe)) {
+ WRITE_ONCE(cqe->user_data, req->cqe.user_data);
+ WRITE_ONCE(cqe->res, res);
+ WRITE_ONCE(cqe->flags, cflags);
+ WRITE_ONCE(cqe->big_cqe[0], extra1);
+ WRITE_ONCE(cqe->big_cqe[1], extra2);
+ return;
+ }
+
+ io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2);
}
static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
s32 res, u32 cflags)
{
ctx->cq_extra++;
- trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
+ trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
return __io_fill_cqe(ctx, user_data, res, cflags);
}
-static void __io_req_complete_post(struct io_kiocb *req, s32 res,
- u32 cflags)
+static void __io_req_complete_put(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, res, cflags);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
*/
if (req_ref_put_and_test(req)) {
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (req->flags & IO_REQ_LINK_FLAGS) {
if (req->flags & IO_DISARM_MASK)
io_disarm_next(req);
if (req->link) {
@@ -2077,7 +2527,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
req->link = NULL;
}
}
- io_req_put_rsrc(req, ctx);
+ io_req_put_rsrc(req);
/*
* Selected buffer deallocation in io_clean_op() assumes that
* we don't hold ->completion_lock. Clean them here to avoid
@@ -2091,8 +2541,23 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
}
}
-static void io_req_complete_post(struct io_kiocb *req, s32 res,
- u32 cflags)
+static void __io_req_complete_post(struct io_kiocb *req, s32 res,
+ u32 cflags)
+{
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe_req(req, res, cflags);
+ __io_req_complete_put(req);
+}
+
+static void __io_req_complete_post32(struct io_kiocb *req, s32 res,
+ u32 cflags, u64 extra1, u64 extra2)
+{
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe32_req(req, res, cflags, extra1, extra2);
+ __io_req_complete_put(req);
+}
+
+static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2103,11 +2568,23 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
io_cqring_ev_posted(ctx);
}
+static void io_req_complete_post32(struct io_kiocb *req, s32 res,
+ u32 cflags, u64 extra1, u64 extra2)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock(&ctx->completion_lock);
+ __io_req_complete_post32(req, res, cflags, extra1, extra2);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
+}
+
static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
u32 cflags)
{
- req->result = res;
- req->cflags = cflags;
+ req->cqe.res = res;
+ req->cqe.flags = cflags;
req->flags |= REQ_F_COMPLETE_INLINE;
}
@@ -2120,8 +2597,23 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
io_req_complete_post(req, res, cflags);
}
+static inline void __io_req_complete32(struct io_kiocb *req,
+ unsigned int issue_flags, s32 res,
+ u32 cflags, u64 extra1, u64 extra2)
+{
+ if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
+ io_req_complete_state(req, res, cflags);
+ req->extra1 = extra1;
+ req->extra2 = extra2;
+ } else {
+ io_req_complete_post32(req, res, cflags, extra1, extra2);
+ }
+}
+
static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
+ if (res < 0)
+ req_set_fail(req);
__io_req_complete(req, 0, res, 0);
}
@@ -2131,17 +2623,6 @@ static void io_req_complete_failed(struct io_kiocb *req, s32 res)
io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
-static void io_req_complete_fail_submit(struct io_kiocb *req)
-{
- /*
- * We don't submit, fail them all, for that replace hardlinks with
- * normal links. Extra REQ_F_LINK is tolerated.
- */
- req->flags &= ~REQ_F_HARDLINK;
- req->flags |= REQ_F_LINK;
- io_req_complete_failed(req, req->result);
-}
-
/*
* Don't initialise the fields below on every allocation, but do that in
* advance and keep them valid across allocations.
@@ -2152,7 +2633,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
req->link = NULL;
req->async_data = NULL;
/* not necessary, but safer to zero */
- req->result = 0;
+ req->cqe.res = 0;
}
static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
@@ -2164,19 +2645,9 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
spin_unlock(&ctx->completion_lock);
}
-/* Returns true IFF there are requests in the cache */
-static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
+static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
{
- struct io_submit_state *state = &ctx->submit_state;
-
- /*
- * If we have more than a batch's worth of requests in our IRQ side
- * locked cache, grab the lock and move them over to our submission
- * side cache.
- */
- if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
- io_flush_cached_locked_reqs(ctx, state);
- return !!state->free_list.next;
+ return !ctx->submit_state.free_list.next;
}
/*
@@ -2188,14 +2659,20 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
- struct io_submit_state *state = &ctx->submit_state;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
void *reqs[IO_REQ_ALLOC_BATCH];
- struct io_kiocb *req;
int ret, i;
- if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
- return true;
+ /*
+ * If we have more than a batch's worth of requests in our IRQ side
+ * locked cache, grab the lock and move them over to our submission
+ * side cache.
+ */
+ if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
+ io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
+ if (!io_req_cache_empty(ctx))
+ return true;
+ }
ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
@@ -2212,17 +2689,17 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
percpu_ref_get_many(&ctx->refs, ret);
for (i = 0; i < ret; i++) {
- req = reqs[i];
+ struct io_kiocb *req = reqs[i];
io_preinit_req(req, ctx);
- wq_stack_add_head(&req->comp_list, &state->free_list);
+ io_req_add_to_cache(req, ctx);
}
return true;
}
static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
{
- if (unlikely(!ctx->submit_state.free_list.next))
+ if (unlikely(io_req_cache_empty(ctx)))
return __io_alloc_req_refill(ctx);
return true;
}
@@ -2251,11 +2728,11 @@ static inline void io_dismantle_req(struct io_kiocb *req)
io_put_file(req->file);
}
-static __cold void __io_free_req(struct io_kiocb *req)
+static __cold void io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- io_req_put_rsrc(req, ctx);
+ io_req_put_rsrc(req);
io_dismantle_req(req);
io_put_task(req->task, 1);
@@ -2273,7 +2750,7 @@ static inline void io_remove_next_linked(struct io_kiocb *req)
nxt->link = NULL;
}
-static bool io_kill_linked_timeout(struct io_kiocb *req)
+static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
__must_hold(&req->ctx->timeout_lock)
{
@@ -2286,13 +2763,10 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
link->timeout.head = NULL;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
list_del(&link->timeout.list);
- /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
- io_fill_cqe_req(link, -ECANCELED, 0);
- io_put_req_deferred(link);
- return true;
+ return link;
}
}
- return false;
+ return NULL;
}
static void io_fail_links(struct io_kiocb *req)
@@ -2306,19 +2780,19 @@ static void io_fail_links(struct io_kiocb *req)
long res = -ECANCELED;
if (link->flags & REQ_F_FAIL)
- res = link->result;
+ res = link->cqe.res;
nxt = link->link;
link->link = NULL;
- trace_io_uring_fail_link(req->ctx, req, req->user_data,
+ trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
req->opcode, link);
- if (!ignore_cqes) {
+ if (ignore_cqes)
+ link->flags |= REQ_F_CQE_SKIP;
+ else
link->flags &= ~REQ_F_CQE_SKIP;
- io_fill_cqe_req(link, res, 0);
- }
- io_put_req_deferred(link);
+ __io_req_complete_post(link, res, 0);
link = nxt;
}
}
@@ -2326,25 +2800,27 @@ static void io_fail_links(struct io_kiocb *req)
static bool io_disarm_next(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
{
+ struct io_kiocb *link = NULL;
bool posted = false;
if (req->flags & REQ_F_ARM_LTIMEOUT) {
- struct io_kiocb *link = req->link;
-
+ link = req->link;
req->flags &= ~REQ_F_ARM_LTIMEOUT;
if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
io_remove_next_linked(req);
- /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
- io_fill_cqe_req(link, -ECANCELED, 0);
- io_put_req_deferred(link);
+ io_req_tw_post_queue(link, -ECANCELED, 0);
posted = true;
}
} else if (req->flags & REQ_F_LINK_TIMEOUT) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock_irq(&ctx->timeout_lock);
- posted = io_kill_linked_timeout(req);
+ link = io_disarm_linked_timeout(req);
spin_unlock_irq(&ctx->timeout_lock);
+ if (link) {
+ posted = true;
+ io_req_tw_post_queue(link, -ECANCELED, 0);
+ }
}
if (unlikely((req->flags & REQ_F_FAIL) &&
!(req->flags & REQ_F_HARDLINK))) {
@@ -2361,8 +2837,7 @@ static void __io_req_find_next_prep(struct io_kiocb *req)
spin_lock(&ctx->completion_lock);
posted = io_disarm_next(req);
- if (posted)
- io_commit_cqring(ctx);
+ io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
if (posted)
io_cqring_ev_posted(ctx);
@@ -2372,8 +2847,6 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
struct io_kiocb *nxt;
- if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
- return NULL;
/*
* If LINK is set, we have dependent requests in this chain. If we
* didn't fail this request, queue the first one up, moving any other
@@ -2391,6 +2864,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (*locked) {
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
@@ -2434,7 +2909,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node,
if (likely(*uring_locked))
req->io_task_work.func(req, uring_locked);
else
- __io_req_complete_post(req, req->result,
+ __io_req_complete_post(req, req->cqe.res,
io_put_kbuf_comp(req));
node = next;
} while (node);
@@ -2475,15 +2950,11 @@ static void tctx_task_work(struct callback_head *cb)
while (1) {
struct io_wq_work_node *node1, *node2;
- if (!tctx->task_list.first &&
- !tctx->prior_task_list.first && uring_locked)
- io_submit_flush_completions(ctx);
-
spin_lock_irq(&tctx->task_lock);
- node1 = tctx->prior_task_list.first;
+ node1 = tctx->prio_task_list.first;
node2 = tctx->task_list.first;
INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prior_task_list);
+ INIT_WQ_LIST(&tctx->prio_task_list);
if (!node2 && !node1)
tctx->task_running = false;
spin_unlock_irq(&tctx->task_lock);
@@ -2492,10 +2963,13 @@ static void tctx_task_work(struct callback_head *cb)
if (node1)
handle_prev_tw_list(node1, &ctx, &uring_locked);
-
if (node2)
handle_tw_list(node2, &ctx, &uring_locked);
cond_resched();
+
+ if (data_race(!tctx->task_list.first) &&
+ data_race(!tctx->prio_task_list.first) && uring_locked)
+ io_submit_flush_completions(ctx);
}
ctx_flush_and_put(ctx, &uring_locked);
@@ -2505,24 +2979,19 @@ static void tctx_task_work(struct callback_head *cb)
io_uring_drop_tctx_refs(current);
}
-static void io_req_task_work_add(struct io_kiocb *req, bool priority)
+static void __io_req_task_work_add(struct io_kiocb *req,
+ struct io_uring_task *tctx,
+ struct io_wq_work_list *list)
{
- struct task_struct *tsk = req->task;
- struct io_uring_task *tctx = tsk->io_uring;
- enum task_work_notify_mode notify;
+ struct io_ring_ctx *ctx = req->ctx;
struct io_wq_work_node *node;
unsigned long flags;
bool running;
- WARN_ON_ONCE(!tctx);
-
io_drop_inflight_file(req);
spin_lock_irqsave(&tctx->task_lock, flags);
- if (priority)
- wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
- else
- wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+ wq_list_add_tail(&req->io_task_work.node, list);
running = tctx->task_running;
if (!running)
tctx->task_running = true;
@@ -2532,22 +3001,15 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)
if (running)
return;
- /*
- * SQPOLL kernel thread doesn't need notification, just a wakeup. For
- * all other cases, use TWA_SIGNAL unconditionally to ensure we're
- * processing task_work. There's no reliable way to tell if TWA_RESUME
- * will do the job.
- */
- notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
- if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
- if (notify == TWA_NONE)
- wake_up_process(tsk);
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+
+ if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
return;
- }
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
- node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
+ node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
while (node) {
@@ -2559,47 +3021,73 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)
}
}
-static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
+static void io_req_task_work_add(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_task *tctx = req->task->io_uring;
+
+ __io_req_task_work_add(req, tctx, &tctx->task_list);
+}
+
+static void io_req_task_prio_work_add(struct io_kiocb *req)
+{
+ struct io_uring_task *tctx = req->task->io_uring;
+
+ if (req->ctx->flags & IORING_SETUP_SQPOLL)
+ __io_req_task_work_add(req, tctx, &tctx->prio_task_list);
+ else
+ __io_req_task_work_add(req, tctx, &tctx->task_list);
+}
+
+static void io_req_tw_post(struct io_kiocb *req, bool *locked)
+{
+ io_req_complete_post(req, req->cqe.res, req->cqe.flags);
+}
+
+static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
+{
+ req->cqe.res = res;
+ req->cqe.flags = cflags;
+ req->io_task_work.func = io_req_tw_post;
+ io_req_task_work_add(req);
+}
+static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
+{
/* not needed for normal modes, but SQPOLL depends on it */
- io_tw_lock(ctx, locked);
- io_req_complete_failed(req, req->result);
+ io_tw_lock(req->ctx, locked);
+ io_req_complete_failed(req, req->cqe.res);
}
static void io_req_task_submit(struct io_kiocb *req, bool *locked)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- io_tw_lock(ctx, locked);
+ io_tw_lock(req->ctx, locked);
/* req->task == current here, checking PF_EXITING is safe */
if (likely(!(req->task->flags & PF_EXITING)))
- __io_queue_sqe(req);
+ io_queue_sqe(req);
else
io_req_complete_failed(req, -EFAULT);
}
static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
- req->result = ret;
+ req->cqe.res = ret;
req->io_task_work.func = io_req_task_cancel;
- io_req_task_work_add(req, false);
+ io_req_task_work_add(req);
}
static void io_req_task_queue(struct io_kiocb *req)
{
req->io_task_work.func = io_req_task_submit;
- io_req_task_work_add(req, false);
+ io_req_task_work_add(req);
}
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
- req->io_task_work.func = io_queue_async_work;
- io_req_task_work_add(req, false);
+ req->io_task_work.func = io_queue_iowq;
+ io_req_task_work_add(req);
}
-static inline void io_queue_next(struct io_kiocb *req)
+static void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
@@ -2607,17 +3095,6 @@ static inline void io_queue_next(struct io_kiocb *req)
io_req_task_queue(nxt);
}
-static void io_free_req(struct io_kiocb *req)
-{
- io_queue_next(req);
- __io_free_req(req);
-}
-
-static void io_free_req_work(struct io_kiocb *req, bool *locked)
-{
- io_free_req(req);
-}
-
static void io_free_batch_list(struct io_ring_ctx *ctx,
struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock)
@@ -2629,15 +3106,30 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (unlikely(req->flags & REQ_F_REFCOUNT)) {
- node = req->comp_list.next;
- if (!req_ref_put_and_test(req))
- continue;
+ if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
+ if (req->flags & REQ_F_REFCOUNT) {
+ node = req->comp_list.next;
+ if (!req_ref_put_and_test(req))
+ continue;
+ }
+ if ((req->flags & REQ_F_POLLED) && req->apoll) {
+ struct async_poll *apoll = req->apoll;
+
+ if (apoll->double_poll)
+ kfree(apoll->double_poll);
+ list_add(&apoll->poll.wait.entry,
+ &ctx->apoll_cache);
+ req->flags &= ~REQ_F_POLLED;
+ }
+ if (req->flags & IO_REQ_LINK_FLAGS)
+ io_queue_next(req);
+ if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
+ io_clean_op(req);
}
+ if (!(req->flags & REQ_F_FIXED_FILE))
+ io_put_file(req->file);
io_req_put_rsrc_locked(req, ctx);
- io_queue_next(req);
- io_dismantle_req(req);
if (req->task != task) {
if (task)
@@ -2647,7 +3139,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
}
task_refs++;
node = req->comp_list.next;
- wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+ io_req_add_to_cache(req, ctx);
} while (node);
if (task)
@@ -2666,16 +3158,11 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, req->result, req->cflags);
- if ((req->flags & REQ_F_POLLED) && req->apoll) {
- struct async_poll *apoll = req->apoll;
-
- if (apoll->double_poll)
- kfree(apoll->double_poll);
- list_add(&apoll->poll.wait.entry,
- &ctx->apoll_cache);
- req->flags &= ~REQ_F_POLLED;
+ if (!(req->flags & REQ_F_CQE_SKIP)) {
+ if (!(ctx->flags & IORING_SETUP_CQE32))
+ __io_fill_cqe_req_filled(ctx, req);
+ else
+ __io_fill_cqe32_req_filled(ctx, req);
}
}
@@ -2698,23 +3185,18 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
struct io_kiocb *nxt = NULL;
if (req_ref_put_and_test(req)) {
- nxt = io_req_find_next(req);
- __io_free_req(req);
+ if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
+ nxt = io_req_find_next(req);
+ io_free_req(req);
}
return nxt;
}
static inline void io_put_req(struct io_kiocb *req)
{
- if (req_ref_put_and_test(req))
- io_free_req(req);
-}
-
-static inline void io_put_req_deferred(struct io_kiocb *req)
-{
if (req_ref_put_and_test(req)) {
- req->io_task_work.func = io_free_req_work;
- io_req_task_work_add(req, false);
+ io_queue_next(req);
+ io_free_req(req);
}
}
@@ -2797,11 +3279,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
/* order with io_complete_rw_iopoll(), e.g. ->result updates */
if (!smp_load_acquire(&req->iopoll_completed))
break;
+ nr_events++;
if (unlikely(req->flags & REQ_F_CQE_SKIP))
continue;
-
- __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
- nr_events++;
+ __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0));
}
if (unlikely(!nr_events))
@@ -2847,22 +3328,26 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
unsigned int nr_events = 0;
int ret = 0;
+ unsigned long check_cq;
/*
- * We disallow the app entering submit/complete with polling, but we
- * still need to lock the ring to prevent racing with polled issue
- * that got punted to a workqueue.
- */
- mutex_lock(&ctx->uring_lock);
- /*
* Don't enter poll loop if we already have events pending.
* If we do, we can potentially be spinning for commands that
* already triggered a CQE (eg in error).
*/
- if (test_bit(0, &ctx->check_cq_overflow))
+ check_cq = READ_ONCE(ctx->check_cq);
+ if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
__io_cqring_overflow_flush(ctx, false);
if (io_cqring_events(ctx))
- goto out;
+ return 0;
+
+ /*
+ * Similarly do not spin if we have not informed the user of any
+ * dropped CQE.
+ */
+ if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
+ return -EBADR;
+
do {
/*
* If a submit got punted to a workqueue, we can have the
@@ -2892,8 +3377,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
nr_events += ret;
ret = 0;
} while (nr_events < min && !need_resched());
-out:
- mutex_unlock(&ctx->uring_lock);
+
return ret;
}
@@ -2966,21 +3450,21 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
} else {
fsnotify_access(req->file);
}
- if (unlikely(res != req->result)) {
+ if (unlikely(res != req->cqe.res)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
return true;
}
req_set_fail(req);
- req->result = res;
+ req->cqe.res = res;
}
return false;
}
static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
- int res = req->result;
+ int res = req->cqe.res;
if (*locked) {
io_req_complete_state(req, res, io_put_kbuf(req, 0));
@@ -2996,7 +3480,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res,
{
if (__io_complete_rw_common(req, res))
return;
- __io_req_complete(req, issue_flags, req->result,
+ __io_req_complete(req, issue_flags, req->cqe.res,
io_put_kbuf(req, issue_flags));
}
@@ -3006,9 +3490,9 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
if (__io_complete_rw_common(req, res))
return;
- req->result = res;
+ req->cqe.res = res;
req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
+ io_req_task_prio_work_add(req);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
@@ -3017,12 +3501,12 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if (unlikely(res != req->result)) {
+ if (unlikely(res != req->cqe.res)) {
if (res == -EAGAIN && io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
return;
}
- req->result = res;
+ req->cqe.res = res;
}
/* order with io_iopoll_complete() checking ->iopoll_completed */
@@ -3132,6 +3616,8 @@ static unsigned int io_file_get_flags(struct file *file)
res |= FFS_ISREG;
if (__io_file_supports_nowait(file, mode))
res |= FFS_NOWAIT;
+ if (io_file_need_scm(file))
+ res |= FFS_SCM;
return res;
}
@@ -3163,6 +3649,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
req->rw.flags = READ_ONCE(sqe->rw_flags);
+ /* used for fixed read/write too - just read unconditionally */
req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -3311,77 +3798,96 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
return __io_import_fixed(req, rw, iter, imu);
}
-static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
-{
- if (needs_lock)
- mutex_unlock(&ctx->uring_lock);
-}
-
-static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
+static int io_buffer_add_list(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned int bgid)
{
- /*
- * "Normal" inline submissions always hold the uring_lock, since we
- * grab it from the system call. Same is true for the SQPOLL offload.
- * The only exception is when we've detached the request and issue it
- * from an async worker thread, grab the lock for that case.
- */
- if (needs_lock)
- mutex_lock(&ctx->uring_lock);
-}
-
-static void io_buffer_add_list(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl, unsigned int bgid)
-{
- struct list_head *list;
-
- list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
- INIT_LIST_HEAD(&bl->buf_list);
bl->bgid = bgid;
- list_add(&bl->list, list);
+ if (bgid < BGID_ARRAY)
+ return 0;
+
+ return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
-static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
- int bgid, unsigned int issue_flags)
+static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl)
{
- struct io_buffer *kbuf = req->kbuf;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
-
- if (req->flags & REQ_F_BUFFER_SELECTED)
- return kbuf;
+ if (!list_empty(&bl->buf_list)) {
+ struct io_buffer *kbuf;
- io_ring_submit_lock(ctx, needs_lock);
-
- lockdep_assert_held(&ctx->uring_lock);
-
- bl = io_buffer_get_list(ctx, bgid);
- if (bl && !list_empty(&bl->buf_list)) {
kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&kbuf->list);
if (*len > kbuf->len)
*len = kbuf->len;
req->flags |= REQ_F_BUFFER_SELECTED;
req->kbuf = kbuf;
+ req->buf_index = kbuf->bid;
+ return u64_to_user_ptr(kbuf->addr);
+ }
+ return NULL;
+}
+
+static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl,
+ unsigned int issue_flags)
+{
+ struct io_uring_buf_ring *br = bl->buf_ring;
+ struct io_uring_buf *buf;
+ __u32 head = bl->head;
+
+ if (unlikely(smp_load_acquire(&br->tail) == head)) {
+ io_ring_submit_unlock(req->ctx, issue_flags);
+ return NULL;
+ }
+
+ head &= bl->mask;
+ if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
+ buf = &br->bufs[head];
} else {
- kbuf = ERR_PTR(-ENOBUFS);
+ int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
+ int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1;
+ buf = page_address(bl->buf_pages[index]);
+ buf += off;
}
+ if (*len > buf->len)
+ *len = buf->len;
+ req->flags |= REQ_F_BUFFER_RING;
+ req->buf_list = bl;
+ req->buf_index = buf->bid;
- io_ring_submit_unlock(req->ctx, needs_lock);
- return kbuf;
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ /*
+ * If we came in unlocked, we have no choice but to consume the
+ * buffer here. This does mean it'll be pinned until the IO
+ * completes. But coming in unlocked means we're in io-wq
+ * context, hence there should be no further retry. For the
+ * locked case, the caller must ensure to call the commit when
+ * the transfer completes (or if we get -EAGAIN and must poll
+ * or retry).
+ */
+ req->buf_list = NULL;
+ bl->head++;
+ }
+ return u64_to_user_ptr(buf->addr);
}
-static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags)
+static void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
+ unsigned int issue_flags)
{
- struct io_buffer *kbuf;
- u16 bgid;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ void __user *ret = NULL;
+
+ io_ring_submit_lock(req->ctx, issue_flags);
- bgid = req->buf_index;
- kbuf = io_buffer_select(req, len, bgid, issue_flags);
- if (IS_ERR(kbuf))
- return kbuf;
- return u64_to_user_ptr(kbuf->addr);
+ bl = io_buffer_get_list(ctx, req->buf_index);
+ if (likely(bl)) {
+ if (bl->buf_nr_pages)
+ ret = io_ring_buffer_select(req, len, bl, issue_flags);
+ else
+ ret = io_provided_buffer_select(req, len, bl);
+ }
+ io_ring_submit_unlock(req->ctx, issue_flags);
+ return ret;
}
#ifdef CONFIG_COMPAT
@@ -3391,7 +3897,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
struct compat_iovec __user *uiov;
compat_ssize_t clen;
void __user *buf;
- ssize_t len;
+ size_t len;
uiov = u64_to_user_ptr(req->rw.addr);
if (!access_ok(uiov, sizeof(*uiov)))
@@ -3402,11 +3908,12 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
return -EINVAL;
len = clen;
- buf = io_rw_buffer_select(req, &len, issue_flags);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ req->rw.addr = (unsigned long) buf;
iov[0].iov_base = buf;
- iov[0].iov_len = (compat_size_t) len;
+ req->rw.len = iov[0].iov_len = (compat_size_t) len;
return 0;
}
#endif
@@ -3424,22 +3931,21 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
len = iov[0].iov_len;
if (len < 0)
return -EINVAL;
- buf = io_rw_buffer_select(req, &len, issue_flags);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ req->rw.addr = (unsigned long) buf;
iov[0].iov_base = buf;
- iov[0].iov_len = len;
+ req->rw.len = iov[0].iov_len = len;
return 0;
}
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
unsigned int issue_flags)
{
- if (req->flags & REQ_F_BUFFER_SELECTED) {
- struct io_buffer *kbuf = req->kbuf;
-
- iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- iov[0].iov_len = kbuf->len;
+ if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
+ iov[0].iov_base = u64_to_user_ptr(req->rw.addr);
+ iov[0].iov_len = req->rw.len;
return 0;
}
if (req->rw.len != 1)
@@ -3453,6 +3959,13 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, issue_flags);
}
+static inline bool io_do_buffer_select(struct io_kiocb *req)
+{
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return false;
+ return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
+}
+
static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
struct io_rw_state *s,
unsigned int issue_flags)
@@ -3471,18 +3984,15 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
return NULL;
}
- /* buffer index only valid with fixed read/write, or buffer select */
- if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
- return ERR_PTR(-EINVAL);
-
buf = u64_to_user_ptr(req->rw.addr);
sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
- if (req->flags & REQ_F_BUFFER_SELECT) {
- buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
- if (IS_ERR(buf))
- return ERR_CAST(buf);
+ if (io_do_buffer_select(req)) {
+ buf = io_buffer_select(req, &sqe_len, issue_flags);
+ if (!buf)
+ return ERR_PTR(-ENOBUFS);
+ req->rw.addr = (unsigned long) buf;
req->rw.len = sqe_len;
}
@@ -3784,6 +4294,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
+ kiocb->private = NULL;
kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
@@ -3832,9 +4343,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
iovec = NULL;
}
ret = io_rw_init_file(req, FMODE_READ);
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ kfree(iovec);
return ret;
- req->result = iov_iter_count(&s->iter);
+ }
+ req->cqe.res = iov_iter_count(&s->iter);
if (force_nonblock) {
/* If the file doesn't support async, just async punt */
@@ -3850,7 +4363,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ppos = io_kiocb_update_pos(req);
- ret = rw_verify_area(READ, req->file, ppos, req->result);
+ ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
if (unlikely(ret)) {
kfree(iovec);
return ret;
@@ -3872,7 +4385,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
- } else if (ret == req->result || ret <= 0 || !force_nonblock ||
+ } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
@@ -3958,9 +4471,11 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
iovec = NULL;
}
ret = io_rw_init_file(req, FMODE_WRITE);
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ kfree(iovec);
return ret;
- req->result = iov_iter_count(&s->iter);
+ }
+ req->cqe.res = iov_iter_count(&s->iter);
if (force_nonblock) {
/* If the file doesn't support async, just async punt */
@@ -3980,7 +4495,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
ppos = io_kiocb_update_pos(req);
- ret = rw_verify_area(WRITE, req->file, ppos, req->result);
+ ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
if (unlikely(ret))
goto out_free;
@@ -4044,9 +4559,7 @@ static int io_renameat_prep(struct io_kiocb *req,
struct io_rename *ren = &req->rename;
const char __user *oldf, *newf;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4083,22 +4596,257 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
ren->newpath, ren->flags);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
+static inline void __io_xattr_finish(struct io_kiocb *req)
+{
+ struct io_xattr *ix = &req->xattr;
+
+ if (ix->filename)
+ putname(ix->filename);
+
+ kfree(ix->ctx.kname);
+ kvfree(ix->ctx.kvalue);
+}
+
+static void io_xattr_finish(struct io_kiocb *req, int ret)
+{
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+
+ __io_xattr_finish(req);
+ io_req_complete(req, ret);
+}
+
+static int __io_getxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_xattr *ix = &req->xattr;
+ const char __user *name;
+ int ret;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ ix->filename = NULL;
+ ix->ctx.kvalue = NULL;
+ name = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ ix->ctx.size = READ_ONCE(sqe->len);
+ ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
+
+ if (ix->ctx.flags)
+ return -EINVAL;
+
+ ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
+ if (!ix->ctx.kname)
+ return -ENOMEM;
+
+ ret = strncpy_from_user(ix->ctx.kname->name, name,
+ sizeof(ix->ctx.kname->name));
+ if (!ret || ret == sizeof(ix->ctx.kname->name))
+ ret = -ERANGE;
+ if (ret < 0) {
+ kfree(ix->ctx.kname);
+ return ret;
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_fgetxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return __io_getxattr_prep(req, sqe);
+}
+
+static int io_getxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_xattr *ix = &req->xattr;
+ const char __user *path;
+ int ret;
+
+ ret = __io_getxattr_prep(req, sqe);
+ if (ret)
+ return ret;
+
+ path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
+
+ ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
+ if (IS_ERR(ix->filename)) {
+ ret = PTR_ERR(ix->filename);
+ ix->filename = NULL;
+ }
+
+ return ret;
+}
+
+static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_xattr *ix = &req->xattr;
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+ ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt),
+ req->file->f_path.dentry,
+ &ix->ctx);
+
+ io_xattr_finish(req, ret);
+ return 0;
+}
+
+static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_xattr *ix = &req->xattr;
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+ struct path path;
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+retry:
+ ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
+ if (!ret) {
+ ret = do_getxattr(mnt_user_ns(path.mnt),
+ path.dentry,
+ &ix->ctx);
+
+ path_put(&path);
+ if (retry_estale(ret, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+ }
+
+ io_xattr_finish(req, ret);
+ return 0;
+}
+
+static int __io_setxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_xattr *ix = &req->xattr;
+ const char __user *name;
+ int ret;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ ix->filename = NULL;
+ name = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ ix->ctx.kvalue = NULL;
+ ix->ctx.size = READ_ONCE(sqe->len);
+ ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
+
+ ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
+ if (!ix->ctx.kname)
+ return -ENOMEM;
+
+ ret = setxattr_copy(name, &ix->ctx);
+ if (ret) {
+ kfree(ix->ctx.kname);
+ return ret;
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_setxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_xattr *ix = &req->xattr;
+ const char __user *path;
+ int ret;
+
+ ret = __io_setxattr_prep(req, sqe);
+ if (ret)
+ return ret;
+
+ path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
+
+ ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
+ if (IS_ERR(ix->filename)) {
+ ret = PTR_ERR(ix->filename);
+ ix->filename = NULL;
+ }
+
+ return ret;
+}
+
+static int io_fsetxattr_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return __io_setxattr_prep(req, sqe);
+}
+
+static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
+ struct path *path)
+{
+ struct io_xattr *ix = &req->xattr;
+ int ret;
+
+ ret = mnt_want_write(path->mnt);
+ if (!ret) {
+ ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx);
+ mnt_drop_write(path->mnt);
+ }
+
+ return ret;
+}
+
+static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
+{
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+ ret = __io_setxattr(req, issue_flags, &req->file->f_path);
+ io_xattr_finish(req, ret);
+
+ return 0;
+}
+
+static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_xattr *ix = &req->xattr;
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+ struct path path;
+ int ret;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ return -EAGAIN;
+
+retry:
+ ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
+ if (!ret) {
+ ret = __io_setxattr(req, issue_flags, &path);
+ path_put(&path);
+ if (retry_estale(ret, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+ }
+
+ io_xattr_finish(req, ret);
+ return 0;
+}
+
static int io_unlinkat_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_unlink *un = &req->unlink;
const char __user *fname;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4132,8 +4880,6 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
ret = do_unlinkat(un->dfd, un->filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4144,10 +4890,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
struct io_mkdir *mkd = &req->mkdir;
const char __user *fname;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4175,8 +4918,6 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4187,10 +4928,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
struct io_symlink *sl = &req->symlink;
const char __user *oldpath, *newpath;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4224,8 +4962,6 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4236,9 +4972,7 @@ static int io_linkat_prep(struct io_kiocb *req,
struct io_hardlink *lnk = &req->hardlink;
const char __user *oldf, *newf;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4275,9 +5009,97 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
lnk->newpath, lnk->flags);
req->flags &= ~REQ_F_NEED_CLEANUP;
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
+{
+ req->uring_cmd.task_work_cb(&req->uring_cmd);
+}
+
+void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *))
+{
+ struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
+
+ req->uring_cmd.task_work_cb = task_work_cb;
+ req->io_task_work.func = io_uring_cmd_work;
+ io_req_task_prio_work_add(req);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
+
+/*
+ * Called by consumers of io_uring_cmd, if they originally returned
+ * -EIOCBQUEUED upon receiving the command.
+ */
+void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
+{
+ struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
+
if (ret < 0)
req_set_fail(req);
- io_req_complete(req, ret);
+ if (req->ctx->flags & IORING_SETUP_CQE32)
+ __io_req_complete32(req, 0, ret, 0, res2, 0);
+ else
+ io_req_complete(req, ret);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_done);
+
+static int io_uring_cmd_prep_async(struct io_kiocb *req)
+{
+ size_t cmd_size;
+
+ cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
+
+ memcpy(req->async_data, req->uring_cmd.cmd, cmd_size);
+ return 0;
+}
+
+static int io_uring_cmd_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_uring_cmd *ioucmd = &req->uring_cmd;
+
+ if (sqe->rw_flags)
+ return -EINVAL;
+ ioucmd->cmd = sqe->cmd;
+ ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
+ return 0;
+}
+
+static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_uring_cmd *ioucmd = &req->uring_cmd;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct file *file = req->file;
+ int ret;
+
+ if (!req->file->f_op->uring_cmd)
+ return -EOPNOTSUPP;
+
+ if (ctx->flags & IORING_SETUP_SQE128)
+ issue_flags |= IO_URING_F_SQE128;
+ if (ctx->flags & IORING_SETUP_CQE32)
+ issue_flags |= IO_URING_F_CQE32;
+ if (ctx->flags & IORING_SETUP_IOPOLL)
+ issue_flags |= IO_URING_F_IOPOLL;
+
+ if (req_has_async_data(req))
+ ioucmd->cmd = req->async_data;
+
+ ret = file->f_op->uring_cmd(ioucmd, issue_flags);
+ if (ret == -EAGAIN) {
+ if (!req_has_async_data(req)) {
+ if (io_alloc_async_data(req))
+ return -ENOMEM;
+ io_uring_cmd_prep_async(req);
+ }
+ return -EAGAIN;
+ }
+
+ if (ret != -EIOCBQUEUED)
+ io_uring_cmd_done(ioucmd, ret, 0);
return 0;
}
@@ -4285,9 +5107,7 @@ static int io_shutdown_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
@@ -4312,8 +5132,6 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
ret = __sys_shutdown_sock(sock, req->shutdown.how);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
#else
@@ -4327,9 +5145,6 @@ static int __io_splice_prep(struct io_kiocb *req,
struct io_splice *sp = &req->splice;
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sp->len = READ_ONCE(sqe->len);
sp->flags = READ_ONCE(sqe->splice_flags);
if (unlikely(sp->flags & ~valid_flags))
@@ -4374,7 +5189,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
done:
if (ret != sp->len)
req_set_fail(req);
- io_req_complete(req, ret);
+ __io_req_complete(req, 0, ret, 0);
return 0;
}
@@ -4419,7 +5234,20 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
done:
if (ret != sp->len)
req_set_fail(req);
- io_req_complete(req, ret);
+ __io_req_complete(req, 0, ret, 0);
+ return 0;
+}
+
+static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ /*
+ * If the ring is setup with CQE32, relay back addr/addr
+ */
+ if (req->ctx->flags & IORING_SETUP_CQE32) {
+ req->nop.extra1 = READ_ONCE(sqe->addr);
+ req->nop.extra2 = READ_ONCE(sqe->addr2);
+ }
+
return 0;
}
@@ -4428,20 +5256,31 @@ done:
*/
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
+ unsigned int cflags;
+ void __user *buf;
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ size_t len = 1;
+
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ }
- __io_req_complete(req, issue_flags, 0, 0);
+ cflags = io_put_kbuf(req, issue_flags);
+ if (!(req->ctx->flags & IORING_SETUP_CQE32))
+ __io_req_complete(req, issue_flags, 0, cflags);
+ else
+ __io_req_complete32(req, issue_flags, 0, cflags,
+ req->nop.extra1, req->nop.extra2);
return 0;
}
static int io_msg_ring_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
- sqe->splice_fd_in || sqe->buf_index || sqe->personality))
+ if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
+ sqe->buf_index || sqe->personality))
return -EINVAL;
req->msg.user_data = READ_ONCE(sqe->off);
@@ -4477,17 +5316,15 @@ done:
if (ret < 0)
req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
+ /* put file to avoid an attempt to IOPOLL the req */
+ io_put_file(req->file);
+ req->file = NULL;
return 0;
}
static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
- sqe->splice_fd_in))
+ if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
req->sync.flags = READ_ONCE(sqe->fsync_flags);
@@ -4511,8 +5348,6 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
ret = vfs_fsync_range(req->file, req->sync.off,
end > 0 ? end : LLONG_MAX,
req->sync.flags & IORING_FSYNC_DATASYNC);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4520,10 +5355,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
- sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
@@ -4541,9 +5373,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
- if (ret < 0)
- req_set_fail(req);
- else
+ if (ret >= 0)
fsnotify_modify(req->file);
io_req_complete(req, ret);
return 0;
@@ -4554,9 +5384,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
const char __user *fname;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->ioprio || sqe->buf_index))
+ if (unlikely(sqe->buf_index))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4611,6 +5439,61 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_openat_prep(req, sqe);
}
+static int io_file_bitmap_get(struct io_ring_ctx *ctx)
+{
+ struct io_file_table *table = &ctx->file_table;
+ unsigned long nr = ctx->nr_user_files;
+ int ret;
+
+ if (table->alloc_hint >= nr)
+ table->alloc_hint = 0;
+
+ do {
+ ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint);
+ if (ret != nr) {
+ table->alloc_hint = ret + 1;
+ return ret;
+ }
+ if (!table->alloc_hint)
+ break;
+
+ nr = table->alloc_hint;
+ table->alloc_hint = 0;
+ } while (1);
+
+ return -ENFILE;
+}
+
+static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
+ struct file *file, unsigned int file_slot)
+{
+ bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ if (alloc_slot) {
+ io_ring_submit_lock(ctx, issue_flags);
+ ret = io_file_bitmap_get(ctx);
+ if (unlikely(ret < 0)) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+ }
+
+ file_slot = ret;
+ } else {
+ file_slot--;
+ }
+
+ ret = io_install_fixed_file(req, file, issue_flags, file_slot);
+ if (alloc_slot) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ if (!ret)
+ return file_slot;
+ }
+
+ return ret;
+}
+
static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
struct open_flags op;
@@ -4666,8 +5549,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
if (!fixed)
fd_install(ret, file);
else
- ret = io_install_fixed_file(req, file, issue_flags,
- req->open.file_slot - 1);
+ ret = io_fixed_fd_install(req, issue_flags, file,
+ req->open.file_slot);
err:
putname(req->open.filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
@@ -4688,7 +5571,7 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
+ if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
sqe->splice_fd_in)
return -EINVAL;
@@ -4711,6 +5594,20 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (!nbufs)
return 0;
+ if (bl->buf_nr_pages) {
+ int j;
+
+ i = bl->buf_ring->tail - bl->head;
+ for (j = 0; j < bl->buf_nr_pages; j++)
+ unpin_user_page(bl->buf_pages[j]);
+ kvfree(bl->buf_pages);
+ bl->buf_pages = NULL;
+ bl->buf_nr_pages = 0;
+ /* make sure it's seen as empty */
+ INIT_LIST_HEAD(&bl->buf_list);
+ return i;
+ }
+
/* the head kbuf is the list itself */
while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
@@ -4732,22 +5629,23 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret = 0;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
-
- io_ring_submit_lock(ctx, needs_lock);
- lockdep_assert_held(&ctx->uring_lock);
+ io_ring_submit_lock(ctx, issue_flags);
ret = -ENOENT;
bl = io_buffer_get_list(ctx, p->bgid);
- if (bl)
- ret = __io_remove_buffers(ctx, bl, p->nbufs);
+ if (bl) {
+ ret = -EINVAL;
+ /* can't use provide/remove buffers command on mapped buffers */
+ if (!bl->buf_nr_pages)
+ ret = __io_remove_buffers(ctx, bl, p->nbufs);
+ }
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
return 0;
}
@@ -4758,7 +5656,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
+ if (sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
@@ -4855,26 +5753,56 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
return i ? 0 : -ENOMEM;
}
+static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
+{
+ int i;
+
+ ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
+ GFP_KERNEL);
+ if (!ctx->io_bl)
+ return -ENOMEM;
+
+ for (i = 0; i < BGID_ARRAY; i++) {
+ INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
+ ctx->io_bl[i].bgid = i;
+ }
+
+ return 0;
+}
+
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret = 0;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, needs_lock);
+ io_ring_submit_lock(ctx, issue_flags);
- lockdep_assert_held(&ctx->uring_lock);
+ if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
+ ret = io_init_bl_list(ctx);
+ if (ret)
+ goto err;
+ }
bl = io_buffer_get_list(ctx, p->bgid);
if (unlikely(!bl)) {
- bl = kmalloc(sizeof(*bl), GFP_KERNEL);
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl) {
ret = -ENOMEM;
goto err;
}
- io_buffer_add_list(ctx, bl, p->bgid);
+ INIT_LIST_HEAD(&bl->buf_list);
+ ret = io_buffer_add_list(ctx, bl, p->bgid);
+ if (ret) {
+ kfree(bl);
+ goto err;
+ }
+ }
+ /* can't add buffers via this command for a mapped buffer ring */
+ if (bl->buf_nr_pages) {
+ ret = -EINVAL;
+ goto err;
}
ret = io_add_buffers(ctx, p, bl);
@@ -4883,7 +5811,7 @@ err:
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
return 0;
}
@@ -4891,9 +5819,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_EPOLL)
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
req->epoll.epfd = READ_ONCE(sqe->fd);
@@ -4937,9 +5863,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
- if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
return -EINVAL;
req->madvise.addr = READ_ONCE(sqe->addr);
@@ -4961,8 +5885,6 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
#else
@@ -4972,9 +5894,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
return -EINVAL;
req->fadvise.offset = READ_ONCE(sqe->off);
@@ -5010,9 +5930,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
const char __user *path;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
@@ -5048,19 +5966,13 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
ctx->buffer);
-
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
- sqe->rw_flags || sqe->buf_index)
+ if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
@@ -5092,7 +6004,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
spin_unlock(&files->file_lock);
goto err;
}
- file = fdt->fd[close->fd];
+ file = rcu_dereference_protected(fdt->fd[close->fd],
+ lockdep_is_held(&files->file_lock));
if (!file || file->f_op == &io_uring_fops) {
spin_unlock(&files->file_lock);
file = NULL;
@@ -5126,12 +6039,7 @@ err:
static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
- sqe->splice_fd_in))
+ if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
@@ -5150,13 +6058,18 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
#if defined(CONFIG_NET)
+static bool io_net_retry(struct socket *sock, int flags)
+{
+ if (!(flags & MSG_WAITALL))
+ return false;
+ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
+}
+
static int io_setup_async_msg(struct io_kiocb *req,
struct io_async_msghdr *kmsg)
{
@@ -5202,11 +6115,16 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(sqe->file_index))
+ return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->file_index))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+ sr->flags = READ_ONCE(sqe->addr2);
+ if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+ return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -5215,12 +6133,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
+ sr->done_io = 0;
return 0;
}
static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
+ struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
unsigned flags;
int min_ret = 0;
@@ -5239,7 +6159,11 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
kmsg = &iomsg;
}
- flags = req->sr_msg.msg_flags;
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return io_setup_async_msg(req, kmsg);
+
+ flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
@@ -5252,12 +6176,21 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return io_setup_async_msg(req, kmsg);
+ }
req_set_fail(req);
}
/* fast path, check for non-NULL to avoid function call */
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -5272,6 +6205,10 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
int min_ret = 0;
int ret;
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return -EAGAIN;
+
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
@@ -5285,7 +6222,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
msg.msg_controllen = 0;
msg.msg_namelen = 0;
- flags = req->sr_msg.msg_flags;
+ flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
@@ -5298,8 +6235,19 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return -EAGAIN;
+ }
req_set_fail(req);
}
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -5391,14 +6339,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
return __io_recvmsg_copy_hdr(req, iomsg);
}
-static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- unsigned int issue_flags)
-{
- struct io_sr_msg *sr = &req->sr_msg;
-
- return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
-}
-
static int io_recvmsg_prep_async(struct io_kiocb *req)
{
int ret;
@@ -5413,12 +6353,16 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(sqe->file_index))
+ return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->file_index))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
- sr->bgid = READ_ONCE(sqe->buf_group);
+ sr->flags = READ_ONCE(sqe->addr2);
+ if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+ return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -5431,19 +6375,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static bool io_net_retry(struct socket *sock, int flags)
-{
- if (!(flags & MSG_WAITALL))
- return false;
- return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
-}
-
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
- struct io_buffer *kbuf;
+ unsigned int cflags;
unsigned flags;
int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
@@ -5461,24 +6398,30 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
kmsg = &iomsg;
}
- if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, issue_flags);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- kmsg->fast_iov[0].iov_len = req->sr_msg.len;
- iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
- 1, req->sr_msg.len);
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return io_setup_async_msg(req, kmsg);
+
+ if (io_do_buffer_select(req)) {
+ void __user *buf;
+
+ buf = io_buffer_select(req, &sr->len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ kmsg->fast_iov[0].iov_base = buf;
+ kmsg->fast_iov[0].iov_len = sr->len;
+ iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
+ sr->len);
}
- flags = req->sr_msg.msg_flags;
+ flags = sr->msg_flags;
if (force_nonblock)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
- ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
- kmsg->uaddr, flags);
+ kmsg->msg.msg_get_inq = 1;
+ ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags);
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock)
return io_setup_async_msg(req, kmsg);
@@ -5502,45 +6445,54 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
ret += sr->done_io;
else if (sr->done_io)
ret = sr->done_io;
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
+ cflags = io_put_kbuf(req, issue_flags);
+ if (kmsg->msg.msg_inq)
+ cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+ __io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_buffer *kbuf;
struct io_sr_msg *sr = &req->sr_msg;
struct msghdr msg;
- void __user *buf = sr->buf;
struct socket *sock;
struct iovec iov;
+ unsigned int cflags;
unsigned flags;
int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return -EAGAIN;
+
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
- if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, issue_flags);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- buf = u64_to_user_ptr(kbuf->addr);
+ if (io_do_buffer_select(req)) {
+ void __user *buf;
+
+ buf = io_buffer_select(req, &sr->len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ sr->buf = buf;
}
- ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
+ ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter);
if (unlikely(ret))
goto out_free;
msg.msg_name = NULL;
+ msg.msg_namelen = 0;
msg.msg_control = NULL;
+ msg.msg_get_inq = 1;
+ msg.msg_flags = 0;
msg.msg_controllen = 0;
- msg.msg_namelen = 0;
msg.msg_iocb = NULL;
- msg.msg_flags = 0;
- flags = req->sr_msg.msg_flags;
+ flags = sr->msg_flags;
if (force_nonblock)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
@@ -5569,36 +6521,49 @@ out_free:
ret += sr->done_io;
else if (sr->done_io)
ret = sr->done_io;
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
+ cflags = io_put_kbuf(req, issue_flags);
+ if (msg.msg_inq)
+ cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+ __io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_accept *accept = &req->accept;
+ unsigned flags;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->buf_index)
+ if (sqe->len || sqe->buf_index)
return -EINVAL;
accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
accept->flags = READ_ONCE(sqe->accept_flags);
accept->nofile = rlimit(RLIMIT_NOFILE);
+ flags = READ_ONCE(sqe->ioprio);
+ if (flags & ~IORING_ACCEPT_MULTISHOT)
+ return -EINVAL;
accept->file_slot = READ_ONCE(sqe->file_index);
- if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
- return -EINVAL;
+ if (accept->file_slot) {
+ if (accept->flags & SOCK_CLOEXEC)
+ return -EINVAL;
+ if (flags & IORING_ACCEPT_MULTISHOT &&
+ accept->file_slot != IORING_FILE_INDEX_ALLOC)
+ return -EINVAL;
+ }
if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+ if (flags & IORING_ACCEPT_MULTISHOT)
+ req->flags |= REQ_F_APOLL_MULTISHOT;
return 0;
}
static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
+ struct io_ring_ctx *ctx = req->ctx;
struct io_accept *accept = &req->accept;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
@@ -5606,6 +6571,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct file *file;
int ret, fd;
+retry:
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
if (unlikely(fd < 0))
@@ -5617,7 +6583,89 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
if (!fixed)
put_unused_fd(fd);
ret = PTR_ERR(file);
- if (ret == -EAGAIN && force_nonblock)
+ if (ret == -EAGAIN && force_nonblock) {
+ /*
+ * if it's multishot and polled, we don't need to
+ * return EAGAIN to arm the poll infra since it
+ * has already been done
+ */
+ if ((req->flags & IO_APOLL_MULTI_POLLED) ==
+ IO_APOLL_MULTI_POLLED)
+ ret = 0;
+ return ret;
+ }
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ req_set_fail(req);
+ } else if (!fixed) {
+ fd_install(fd, file);
+ ret = fd;
+ } else {
+ ret = io_fixed_fd_install(req, issue_flags, file,
+ accept->file_slot);
+ }
+
+ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
+ __io_req_complete(req, issue_flags, ret, 0);
+ return 0;
+ }
+ if (ret >= 0) {
+ bool filled;
+
+ spin_lock(&ctx->completion_lock);
+ filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret,
+ IORING_CQE_F_MORE);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ if (filled) {
+ io_cqring_ev_posted(ctx);
+ goto retry;
+ }
+ ret = -ECANCELED;
+ }
+
+ return ret;
+}
+
+static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_socket *sock = &req->sock;
+
+ if (sqe->addr || sqe->rw_flags || sqe->buf_index)
+ return -EINVAL;
+
+ sock->domain = READ_ONCE(sqe->fd);
+ sock->type = READ_ONCE(sqe->off);
+ sock->protocol = READ_ONCE(sqe->len);
+ sock->file_slot = READ_ONCE(sqe->file_index);
+ sock->nofile = rlimit(RLIMIT_NOFILE);
+
+ sock->flags = sock->type & ~SOCK_TYPE_MASK;
+ if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
+ return -EINVAL;
+ if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ return -EINVAL;
+ return 0;
+}
+
+static int io_socket(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_socket *sock = &req->sock;
+ bool fixed = !!sock->file_slot;
+ struct file *file;
+ int ret, fd;
+
+ if (!fixed) {
+ fd = __get_unused_fd_flags(sock->flags, sock->nofile);
+ if (unlikely(fd < 0))
+ return fd;
+ }
+ file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
+ if (IS_ERR(file)) {
+ if (!fixed)
+ put_unused_fd(fd);
+ ret = PTR_ERR(file);
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -5627,7 +6675,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
ret = fd;
} else {
ret = io_install_fixed_file(req, file, issue_flags,
- accept->file_slot - 1);
+ sock->file_slot - 1);
}
__io_req_complete(req, issue_flags, ret, 0);
return 0;
@@ -5645,10 +6693,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = &req->connect;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
- sqe->splice_fd_in)
+ if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -5721,6 +6766,7 @@ IO_NETOP_PREP_ASYNC(sendmsg);
IO_NETOP_PREP_ASYNC(recvmsg);
IO_NETOP_PREP_ASYNC(connect);
IO_NETOP_PREP(accept);
+IO_NETOP_PREP(socket);
IO_NETOP_FN(send);
IO_NETOP_FN(recv);
#endif /* CONFIG_NET */
@@ -5771,7 +6817,7 @@ static void io_poll_req_insert(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
struct hlist_head *list;
- list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
+ list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];
hlist_add_head(&req->hash_node, list);
}
@@ -5830,22 +6876,23 @@ static void io_poll_remove_entries(struct io_kiocb *req)
rcu_read_unlock();
}
+static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags);
/*
* All poll tw should go through this. Checks for poll events, manages
* references, does rewait, etc.
*
* Returns a negative error on failure. >0 when no action require, which is
* either spurious wakeup or multishot CQE is served. 0 when it's done with
- * the request, then the mask is stored in req->result.
+ * the request, then the mask is stored in req->cqe.res.
*/
-static int io_poll_check_events(struct io_kiocb *req, bool locked)
+static int io_poll_check_events(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
- int v;
+ int v, ret;
/* req->task == current here, checking PF_EXITING is safe */
if (unlikely(req->task->flags & PF_EXITING))
- io_poll_mark_cancelled(req);
+ return -ECANCELED;
do {
v = atomic_read(&req->poll_refs);
@@ -5856,32 +6903,46 @@ static int io_poll_check_events(struct io_kiocb *req, bool locked)
if (v & IO_POLL_CANCEL_FLAG)
return -ECANCELED;
- if (!req->result) {
+ if (!req->cqe.res) {
struct poll_table_struct pt = { ._key = req->apoll_events };
unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED;
if (unlikely(!io_assign_file(req, flags)))
return -EBADF;
- req->result = vfs_poll(req->file, &pt) & req->apoll_events;
+ req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
}
- /* multishot, just fill an CQE and proceed */
- if (req->result && !(req->apoll_events & EPOLLONESHOT)) {
- __poll_t mask = mangle_poll(req->result & req->apoll_events);
+ if ((unlikely(!req->cqe.res)))
+ continue;
+ if (req->apoll_events & EPOLLONESHOT)
+ return 0;
+
+ /* multishot, just fill a CQE and proceed */
+ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
+ __poll_t mask = mangle_poll(req->cqe.res &
+ req->apoll_events);
bool filled;
spin_lock(&ctx->completion_lock);
- filled = io_fill_cqe_aux(ctx, req->user_data, mask,
- IORING_CQE_F_MORE);
+ filled = io_fill_cqe_aux(ctx, req->cqe.user_data,
+ mask, IORING_CQE_F_MORE);
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
- if (unlikely(!filled))
- return -ECANCELED;
- io_cqring_ev_posted(ctx);
- } else if (req->result) {
- return 0;
+ if (filled) {
+ io_cqring_ev_posted(ctx);
+ continue;
+ }
+ return -ECANCELED;
}
+ io_tw_lock(req->ctx, locked);
+ if (unlikely(req->task->flags & PF_EXITING))
+ return -EFAULT;
+ ret = io_issue_sqe(req,
+ IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ if (ret)
+ return ret;
+
/*
* Release all references, retry if someone tried to restart
* task_work while we were executing it.
@@ -5896,21 +6957,21 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = io_poll_check_events(req, *locked);
+ ret = io_poll_check_events(req, locked);
if (ret > 0)
return;
if (!ret) {
- req->result = mangle_poll(req->result & req->poll.events);
+ req->cqe.res = mangle_poll(req->cqe.res & req->poll.events);
} else {
- req->result = ret;
+ req->cqe.res = ret;
req_set_fail(req);
}
io_poll_remove_entries(req);
spin_lock(&ctx->completion_lock);
hash_del(&req->hash_node);
- __io_req_complete_post(req, req->result, 0);
+ __io_req_complete_post(req, req->cqe.res, 0);
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
@@ -5921,7 +6982,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = io_poll_check_events(req, *locked);
+ ret = io_poll_check_events(req, locked);
if (ret > 0)
return;
@@ -5936,9 +6997,9 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
io_req_complete_failed(req, ret);
}
-static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
+static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)
{
- req->result = mask;
+ req->cqe.res = mask;
/*
* This is useful for poll that is armed on behalf of another
* request, and where the wakeup path could be on a different
@@ -5951,11 +7012,12 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
else
req->io_task_work.func = io_apoll_task_func;
- trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask);
- io_req_task_work_add(req, false);
+ trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
+ io_req_task_work_add(req);
}
-static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
+static inline void io_poll_execute(struct io_kiocb *req, int res,
+ __poll_t events)
{
if (io_poll_get_ownership(req))
__io_poll_execute(req, res, events);
@@ -5970,6 +7032,7 @@ static void io_poll_cancel_req(struct io_kiocb *req)
#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
+#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI)
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
@@ -6004,7 +7067,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
}
/* for instances that support it check for an event match first */
- if (mask && !(mask & poll->events))
+ if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
return 0;
if (io_poll_get_ownership(req)) {
@@ -6090,6 +7153,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
int v;
INIT_HLIST_NODE(&req->hash_node);
+ req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
io_init_poll_iocb(poll, mask, io_poll_wake);
poll->file = req->file;
@@ -6160,28 +7224,34 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct async_poll *apoll;
struct io_poll_table ipt;
- __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
+ __poll_t mask = POLLPRI | POLLERR;
int ret;
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
- if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
+ if (!file_can_poll(req->file))
+ return IO_APOLL_ABORTED;
+ if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
return IO_APOLL_ABORTED;
+ if (!(req->flags & REQ_F_APOLL_MULTISHOT))
+ mask |= EPOLLONESHOT;
if (def->pollin) {
- mask |= POLLIN | POLLRDNORM;
+ mask |= EPOLLIN | EPOLLRDNORM;
/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
if ((req->opcode == IORING_OP_RECVMSG) &&
(req->sr_msg.msg_flags & MSG_ERRQUEUE))
- mask &= ~POLLIN;
+ mask &= ~EPOLLIN;
} else {
- mask |= POLLOUT | POLLWRNORM;
+ mask |= EPOLLOUT | EPOLLWRNORM;
}
if (def->poll_exclusive)
mask |= EPOLLEXCLUSIVE;
- if (!(issue_flags & IO_URING_F_UNLOCKED) &&
- !list_empty(&ctx->apoll_cache)) {
+ if (req->flags & REQ_F_POLLED) {
+ apoll = req->apoll;
+ } else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+ !list_empty(&ctx->apoll_cache)) {
apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
poll.wait.entry);
list_del_init(&apoll->poll.wait.entry);
@@ -6201,7 +7271,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
if (ret || ipt.error)
return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
- trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode,
+ trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,
mask, apoll->poll.events);
return IO_APOLL_OK;
}
@@ -6234,24 +7304,53 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
return found;
}
-static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
- bool poll_only)
+static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
+ struct io_cancel_data *cd)
__must_hold(&ctx->completion_lock)
{
struct hlist_head *list;
struct io_kiocb *req;
- list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
+ list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)];
hlist_for_each_entry(req, list, hash_node) {
- if (sqe_addr != req->user_data)
+ if (cd->data != req->cqe.user_data)
continue;
if (poll_only && req->opcode != IORING_OP_POLL_ADD)
continue;
+ if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
+ if (cd->seq == req->work.cancel_seq)
+ continue;
+ req->work.cancel_seq = cd->seq;
+ }
return req;
}
return NULL;
}
+static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
+ struct io_cancel_data *cd)
+ __must_hold(&ctx->completion_lock)
+{
+ struct io_kiocb *req;
+ int i;
+
+ for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
+ struct hlist_head *list;
+
+ list = &ctx->cancel_hash[i];
+ hlist_for_each_entry(req, list, hash_node) {
+ if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
+ req->file != cd->file)
+ continue;
+ if (cd->seq == req->work.cancel_seq)
+ continue;
+ req->work.cancel_seq = cd->seq;
+ return req;
+ }
+ }
+ return NULL;
+}
+
static bool io_poll_disarm(struct io_kiocb *req)
__must_hold(&ctx->completion_lock)
{
@@ -6262,12 +7361,15 @@ static bool io_poll_disarm(struct io_kiocb *req)
return true;
}
-static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
- bool poll_only)
+static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
__must_hold(&ctx->completion_lock)
{
- struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
+ struct io_kiocb *req;
+ if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY))
+ req = io_poll_file_find(ctx, cd);
+ else
+ req = io_poll_find(ctx, false, cd);
if (!req)
return -ENOENT;
io_poll_cancel_req(req);
@@ -6294,9 +7396,7 @@ static int io_poll_update_prep(struct io_kiocb *req,
struct io_poll_update *upd = &req->poll_update;
u32 flags;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
flags = READ_ONCE(sqe->len);
if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
@@ -6326,9 +7426,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
struct io_poll_iocb *poll = &req->poll;
u32 flags;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
+ if (sqe->buf_index || sqe->off || sqe->addr)
return -EINVAL;
flags = READ_ONCE(sqe->len);
if (flags & ~IORING_POLL_ADD_MULTI)
@@ -6358,13 +7456,14 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
{
+ struct io_cancel_data cd = { .data = req->poll_update.old_user_data, };
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *preq;
int ret2, ret = 0;
bool locked;
spin_lock(&ctx->completion_lock);
- preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
+ preq = io_poll_find(ctx, true, &cd);
if (!preq || !io_poll_disarm(preq)) {
spin_unlock(&ctx->completion_lock);
ret = preq ? -EALREADY : -ENOENT;
@@ -6380,7 +7479,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
preq->poll.events |= IO_POLL_UNMASK;
}
if (req->poll_update.update_user_data)
- preq->user_data = req->poll_update.new_user_data;
+ preq->cqe.user_data = req->poll_update.new_user_data;
ret2 = io_poll_add(preq, issue_flags);
/* successfully updated, don't complete poll request */
@@ -6389,7 +7488,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
}
req_set_fail(preq);
- preq->result = -ECANCELED;
+ preq->cqe.res = -ECANCELED;
locked = !(issue_flags & IO_URING_F_UNLOCKED);
io_req_task_complete(preq, &locked);
out:
@@ -6417,14 +7516,14 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
req_set_fail(req);
- req->result = -ETIME;
+ req->cqe.res = -ETIME;
req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req, false);
+ io_req_task_work_add(req);
return HRTIMER_NORESTART;
}
static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
- __u64 user_data)
+ struct io_cancel_data *cd)
__must_hold(&ctx->timeout_lock)
{
struct io_timeout_data *io;
@@ -6432,9 +7531,16 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
bool found = false;
list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
- found = user_data == req->user_data;
- if (found)
- break;
+ if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
+ cd->data != req->cqe.user_data)
+ continue;
+ if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
+ if (cd->seq == req->work.cancel_seq)
+ continue;
+ req->work.cancel_seq = cd->seq;
+ }
+ found = true;
+ break;
}
if (!found)
return ERR_PTR(-ENOENT);
@@ -6446,11 +7552,14 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
return req;
}
-static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
__must_hold(&ctx->completion_lock)
- __must_hold(&ctx->timeout_lock)
{
- struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+ struct io_kiocb *req;
+
+ spin_lock_irq(&ctx->timeout_lock);
+ req = io_timeout_extract(ctx, cd);
+ spin_unlock_irq(&ctx->timeout_lock);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -6483,7 +7592,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
bool found = false;
list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
- found = user_data == req->user_data;
+ found = user_data == req->cqe.user_data;
if (found)
break;
}
@@ -6503,7 +7612,8 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock)
{
- struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+ struct io_cancel_data cd = { .data = user_data, };
+ struct io_kiocb *req = io_timeout_extract(ctx, &cd);
struct io_timeout_data *data;
if (IS_ERR(req))
@@ -6523,11 +7633,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
{
struct io_timeout_rem *tr = &req->timeout_rem;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
return -EINVAL;
tr->ltimeout = false;
@@ -6568,10 +7676,10 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
int ret;
if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
+ struct io_cancel_data cd = { .data = tr->addr, };
+
spin_lock(&ctx->completion_lock);
- spin_lock_irq(&ctx->timeout_lock);
- ret = io_timeout_cancel(ctx, tr->addr);
- spin_unlock_irq(&ctx->timeout_lock);
+ ret = io_timeout_cancel(ctx, &cd);
spin_unlock(&ctx->completion_lock);
} else {
enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
@@ -6597,10 +7705,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
unsigned flags;
u32 off = READ_ONCE(sqe->off);
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
- sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
return -EINVAL;
if (off && is_timeout_link)
return -EINVAL;
@@ -6699,30 +7804,42 @@ add:
return 0;
}
-struct io_cancel_data {
- struct io_ring_ctx *ctx;
- u64 user_data;
-};
-
static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_cancel_data *cd = data;
- return req->ctx == cd->ctx && req->user_data == cd->user_data;
+ if (req->ctx != cd->ctx)
+ return false;
+ if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
+ ;
+ } else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
+ if (req->file != cd->file)
+ return false;
+ } else {
+ if (req->cqe.user_data != cd->data)
+ return false;
+ }
+ if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
+ if (cd->seq == req->work.cancel_seq)
+ return false;
+ req->work.cancel_seq = cd->seq;
+ }
+ return true;
}
-static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
- struct io_ring_ctx *ctx)
+static int io_async_cancel_one(struct io_uring_task *tctx,
+ struct io_cancel_data *cd)
{
- struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
enum io_wq_cancel cancel_ret;
int ret = 0;
+ bool all;
if (!tctx || !tctx->io_wq)
return -ENOENT;
- cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
+ all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
+ cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
@@ -6738,14 +7855,14 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
return ret;
}
-static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
+static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
- ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
+ ret = io_async_cancel_one(req->task->io_uring, cd);
/*
* Fall-through even for -EALREADY, as we may have poll armed
* that need unarming.
@@ -6754,56 +7871,98 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
return 0;
spin_lock(&ctx->completion_lock);
- ret = io_poll_cancel(ctx, sqe_addr, false);
+ ret = io_poll_cancel(ctx, cd);
if (ret != -ENOENT)
goto out;
-
- spin_lock_irq(&ctx->timeout_lock);
- ret = io_timeout_cancel(ctx, sqe_addr);
- spin_unlock_irq(&ctx->timeout_lock);
+ if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
+ ret = io_timeout_cancel(ctx, cd);
out:
spin_unlock(&ctx->completion_lock);
return ret;
}
+#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
+ IORING_ASYNC_CANCEL_ANY)
+
static int io_async_cancel_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
- sqe->splice_fd_in)
+ if (sqe->off || sqe->len || sqe->splice_fd_in)
return -EINVAL;
req->cancel.addr = READ_ONCE(sqe->addr);
+ req->cancel.flags = READ_ONCE(sqe->cancel_flags);
+ if (req->cancel.flags & ~CANCEL_FLAGS)
+ return -EINVAL;
+ if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) {
+ if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY)
+ return -EINVAL;
+ req->cancel.fd = READ_ONCE(sqe->fd);
+ }
+
return 0;
}
-static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req,
+ unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
- u64 sqe_addr = req->cancel.addr;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
+ bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
+ struct io_ring_ctx *ctx = cd->ctx;
struct io_tctx_node *node;
- int ret;
+ int ret, nr = 0;
- ret = io_try_cancel_userdata(req, sqe_addr);
- if (ret != -ENOENT)
- goto done;
+ do {
+ ret = io_try_cancel(req, cd);
+ if (ret == -ENOENT)
+ break;
+ if (!all)
+ return ret;
+ nr++;
+ } while (1);
/* slow path, try all io-wq's */
- io_ring_submit_lock(ctx, needs_lock);
+ io_ring_submit_lock(ctx, issue_flags);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
- ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
- if (ret != -ENOENT)
- break;
+ ret = io_async_cancel_one(tctx, cd);
+ if (ret != -ENOENT) {
+ if (!all)
+ break;
+ nr++;
+ }
}
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
+ return all ? nr : ret;
+}
+
+static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_cancel_data cd = {
+ .ctx = req->ctx,
+ .data = req->cancel.addr,
+ .flags = req->cancel.flags,
+ .seq = atomic_inc_return(&req->ctx->cancel_seq),
+ };
+ int ret;
+
+ if (cd.flags & IORING_ASYNC_CANCEL_FD) {
+ if (req->flags & REQ_F_FIXED_FILE)
+ req->file = io_file_get_fixed(req, req->cancel.fd,
+ issue_flags);
+ else
+ req->file = io_file_get_normal(req, req->cancel.fd);
+ if (!req->file) {
+ ret = -EBADF;
+ goto done;
+ }
+ cd.file = req->file;
+ }
+
+ ret = __io_async_cancel(&cd, req, issue_flags);
done:
if (ret < 0)
req_set_fail(req);
@@ -6816,7 +7975,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
{
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
+ if (sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
req->rsrc_update.offset = READ_ONCE(sqe->off);
@@ -6830,7 +7989,6 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_uring_rsrc_update2 up;
int ret;
@@ -6841,10 +7999,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
up.resv = 0;
up.resv2 = 0;
- io_ring_submit_lock(ctx, needs_lock);
+ io_ring_submit_lock(ctx, issue_flags);
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
&up, req->rsrc_update.nr_args);
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
if (ret < 0)
req_set_fail(req);
@@ -6856,7 +8014,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
switch (req->opcode) {
case IORING_OP_NOP:
- return 0;
+ return io_nop_prep(req, sqe);
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
@@ -6930,6 +8088,18 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_linkat_prep(req, sqe);
case IORING_OP_MSG_RING:
return io_msg_ring_prep(req, sqe);
+ case IORING_OP_FSETXATTR:
+ return io_fsetxattr_prep(req, sqe);
+ case IORING_OP_SETXATTR:
+ return io_setxattr_prep(req, sqe);
+ case IORING_OP_FGETXATTR:
+ return io_fgetxattr_prep(req, sqe);
+ case IORING_OP_GETXATTR:
+ return io_getxattr_prep(req, sqe);
+ case IORING_OP_SOCKET:
+ return io_socket_prep(req, sqe);
+ case IORING_OP_URING_CMD:
+ return io_uring_cmd_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6939,7 +8109,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static int io_req_prep_async(struct io_kiocb *req)
{
- if (!io_op_defs[req->opcode].needs_async_setup)
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+
+ /* assign early for deferred execution for non-fixed file */
+ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
+ req->file = io_file_get_normal(req, req->cqe.fd);
+ if (!def->needs_async_setup)
return 0;
if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
@@ -6957,6 +8132,8 @@ static int io_req_prep_async(struct io_kiocb *req)
return io_recvmsg_prep_async(req);
case IORING_OP_CONNECT:
return io_connect_prep_async(req);
+ case IORING_OP_URING_CMD:
+ return io_uring_cmd_prep_async(req);
}
printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
req->opcode);
@@ -6966,9 +8143,10 @@ static int io_req_prep_async(struct io_kiocb *req)
static u32 io_get_sequence(struct io_kiocb *req)
{
u32 seq = req->ctx->cached_sq_head;
+ struct io_kiocb *cur;
/* need original cached_sq_head, but it was increased for each req */
- io_for_each_link(req, req)
+ io_for_each_link(cur, req)
seq--;
return seq;
}
@@ -7011,7 +8189,7 @@ fail:
goto queue;
}
- trace_io_uring_defer(ctx, req, req->user_data, req->opcode);
+ trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
de->req = req;
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
@@ -7073,6 +8251,12 @@ static void io_clean_op(struct io_kiocb *req)
if (req->statx.filename)
putname(req->statx.filename);
break;
+ case IORING_OP_SETXATTR:
+ case IORING_OP_FSETXATTR:
+ case IORING_OP_GETXATTR:
+ case IORING_OP_FGETXATTR:
+ __io_xattr_finish(req);
+ break;
}
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
@@ -7095,15 +8279,11 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
return true;
if (req->flags & REQ_F_FIXED_FILE)
- req->file = io_file_get_fixed(req, req->fd, issue_flags);
+ req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
else
- req->file = io_file_get_normal(req, req->fd);
- if (req->file)
- return true;
+ req->file = io_file_get_normal(req, req->cqe.fd);
- req_set_fail(req);
- req->result = -EBADF;
- return false;
+ return !!req->file;
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
@@ -7233,6 +8413,24 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
case IORING_OP_MSG_RING:
ret = io_msg_ring(req, issue_flags);
break;
+ case IORING_OP_FSETXATTR:
+ ret = io_fsetxattr(req, issue_flags);
+ break;
+ case IORING_OP_SETXATTR:
+ ret = io_setxattr(req, issue_flags);
+ break;
+ case IORING_OP_FGETXATTR:
+ ret = io_fgetxattr(req, issue_flags);
+ break;
+ case IORING_OP_GETXATTR:
+ ret = io_getxattr(req, issue_flags);
+ break;
+ case IORING_OP_SOCKET:
+ ret = io_socket(req, issue_flags);
+ break;
+ case IORING_OP_URING_CMD:
+ ret = io_uring_cmd(req, issue_flags);
+ break;
default:
ret = -EINVAL;
break;
@@ -7266,7 +8464,6 @@ static void io_wq_submit_work(struct io_wq_work *work)
const struct io_op_def *def = &io_op_defs[req->opcode];
unsigned int issue_flags = IO_URING_F_UNLOCKED;
bool needs_poll = false;
- struct io_kiocb *timeout;
int ret = 0, err = -ECANCELED;
/* one will be dropped by ->io_free_work() after returning to io-wq */
@@ -7275,10 +8472,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
else
req_ref_get(req);
- timeout = io_prep_linked_timeout(req);
- if (timeout)
- io_queue_linked_timeout(timeout);
-
+ io_arm_ltimeout(req);
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL) {
@@ -7311,6 +8505,8 @@ fail:
* wait for request slots on the block side.
*/
if (!needs_poll) {
+ if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
+ break;
cond_resched();
continue;
}
@@ -7356,8 +8552,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
struct file *file = NULL;
unsigned long file_ptr;
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_lock(&ctx->uring_lock);
+ io_ring_submit_lock(ctx, issue_flags);
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
goto out;
@@ -7368,9 +8563,9 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
io_req_set_rsrc_node(req, ctx, 0);
+ WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap));
out:
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_unlock(&ctx->uring_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
return file;
}
@@ -7391,7 +8586,7 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{
struct file *file = fget(fd);
- trace_io_uring_file_get(req->ctx, req, req->user_data, fd);
+ trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
/* we don't allow fixed io_uring files */
if (file && file->f_op == &io_uring_fops)
@@ -7405,8 +8600,14 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
int ret = -ENOENT;
if (prev) {
- if (!(req->task->flags & PF_EXITING))
- ret = io_try_cancel_userdata(req, prev->user_data);
+ if (!(req->task->flags & PF_EXITING)) {
+ struct io_cancel_data cd = {
+ .ctx = req->ctx,
+ .data = prev->cqe.user_data,
+ };
+
+ ret = io_try_cancel(req, &cd);
+ }
io_req_complete_post(req, ret ?: -ETIME, 0);
io_put_req(prev);
} else {
@@ -7440,7 +8641,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
req->io_task_work.func = io_req_task_link_timeout;
- io_req_task_work_add(req, false);
+ io_req_task_work_add(req);
return HRTIMER_NORESTART;
}
@@ -7466,10 +8667,17 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
io_put_req(req);
}
-static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
+static void io_queue_async(struct io_kiocb *req, int ret)
__must_hold(&req->ctx->uring_lock)
{
- struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+ struct io_kiocb *linked_timeout;
+
+ if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
+ io_req_complete_failed(req, ret);
+ return;
+ }
+
+ linked_timeout = io_prep_linked_timeout(req);
switch (io_arm_poll_handler(req, 0)) {
case IO_APOLL_READY:
@@ -7480,7 +8688,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
*/
- io_queue_async_work(req, NULL);
+ io_queue_iowq(req, NULL);
break;
case IO_APOLL_OK:
break;
@@ -7490,10 +8698,9 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
io_queue_linked_timeout(linked_timeout);
}
-static inline void __io_queue_sqe(struct io_kiocb *req)
+static inline void io_queue_sqe(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
- struct io_kiocb *linked_timeout;
int ret;
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
@@ -7506,22 +8713,23 @@ static inline void __io_queue_sqe(struct io_kiocb *req)
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
- if (likely(!ret)) {
- linked_timeout = io_prep_linked_timeout(req);
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
- } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
- io_queue_sqe_arm_apoll(req);
- } else {
- io_req_complete_failed(req, ret);
- }
+ if (likely(!ret))
+ io_arm_ltimeout(req);
+ else
+ io_queue_async(req, ret);
}
static void io_queue_sqe_fallback(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
- if (req->flags & REQ_F_FAIL) {
- io_req_complete_fail_submit(req);
+ if (unlikely(req->flags & REQ_F_FAIL)) {
+ /*
+ * We don't submit, fail them all, for that replace hardlinks
+ * with normal links. Extra REQ_F_LINK is tolerated.
+ */
+ req->flags &= ~REQ_F_HARDLINK;
+ req->flags |= REQ_F_LINK;
+ io_req_complete_failed(req, req->cqe.res);
} else if (unlikely(req->ctx->drain_active)) {
io_drain_req(req);
} else {
@@ -7530,19 +8738,10 @@ static void io_queue_sqe_fallback(struct io_kiocb *req)
if (unlikely(ret))
io_req_complete_failed(req, ret);
else
- io_queue_async_work(req, NULL);
+ io_queue_iowq(req, NULL);
}
}
-static inline void io_queue_sqe(struct io_kiocb *req)
- __must_hold(&req->ctx->uring_lock)
-{
- if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
- __io_queue_sqe(req);
- else
- io_queue_sqe_fallback(req);
-}
-
/*
* Check SQE restrictions (opcode and flags).
*
@@ -7597,9 +8796,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
- req->user_data = READ_ONCE(sqe->user_data);
+ req->cqe.user_data = READ_ONCE(sqe->user_data);
req->file = NULL;
- req->fixed_rsrc_refs = NULL;
+ req->rsrc_node = NULL;
req->task = current;
if (unlikely(opcode >= IORING_OP_LAST)) {
@@ -7610,9 +8809,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* enforce forwards compatibility on users */
if (sqe_flags & ~SQE_VALID_FLAGS)
return -EINVAL;
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[opcode].buffer_select)
- return -EOPNOTSUPP;
+ if (sqe_flags & IOSQE_BUFFER_SELECT) {
+ if (!io_op_defs[opcode].buffer_select)
+ return -EOPNOTSUPP;
+ req->buf_index = READ_ONCE(sqe->buf_group);
+ }
if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
ctx->drain_disabled = true;
if (sqe_flags & IOSQE_IO_DRAIN) {
@@ -7635,10 +8836,15 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
}
+ if (!io_op_defs[opcode].ioprio && sqe->ioprio)
+ return -EINVAL;
+ if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
if (io_op_defs[opcode].needs_file) {
struct io_submit_state *state = &ctx->submit_state;
- req->fd = READ_ONCE(sqe->fd);
+ req->cqe.fd = READ_ONCE(sqe->fd);
/*
* Plug now if we have more than 2 IO left after this, and the
@@ -7670,7 +8876,44 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
return io_req_prep(req, sqe);
}
-static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
+ struct io_kiocb *req, int ret)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_submit_link *link = &ctx->submit_state.link;
+ struct io_kiocb *head = link->head;
+
+ trace_io_uring_req_failed(sqe, ctx, req, ret);
+
+ /*
+ * Avoid breaking links in the middle as it renders links with SQPOLL
+ * unusable. Instead of failing eagerly, continue assembling the link if
+ * applicable and mark the head with REQ_F_FAIL. The link flushing code
+ * should find the flag and handle the rest.
+ */
+ req_fail_link_node(req, ret);
+ if (head && !(head->flags & REQ_F_FAIL))
+ req_fail_link_node(head, -ECANCELED);
+
+ if (!(req->flags & IO_REQ_LINK_FLAGS)) {
+ if (head) {
+ link->last->link = req;
+ link->head = NULL;
+ req = head;
+ }
+ io_queue_sqe_fallback(req);
+ return ret;
+ }
+
+ if (head)
+ link->last->link = req;
+ else
+ link->head = req;
+ link->last = req;
+ return 0;
+}
+
+static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
@@ -7678,35 +8921,11 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
int ret;
ret = io_init_req(ctx, req, sqe);
- if (unlikely(ret)) {
- trace_io_uring_req_failed(sqe, ctx, req, ret);
-
- /* fail even hard links since we don't submit */
- if (link->head) {
- /*
- * we can judge a link req is failed or cancelled by if
- * REQ_F_FAIL is set, but the head is an exception since
- * it may be set REQ_F_FAIL because of other req's failure
- * so let's leverage req->result to distinguish if a head
- * is set REQ_F_FAIL because of its failure or other req's
- * failure so that we can set the correct ret code for it.
- * init result here to avoid affecting the normal path.
- */
- if (!(link->head->flags & REQ_F_FAIL))
- req_fail_link_node(link->head, -ECANCELED);
- } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- /*
- * the current req is a normal req, we should return
- * error and thus break the submittion loop.
- */
- io_req_complete_failed(req, ret);
- return ret;
- }
- req_fail_link_node(req, ret);
- }
+ if (unlikely(ret))
+ return io_submit_fail_init(sqe, req, ret);
/* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,
+ trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
req->flags, true,
ctx->flags & IORING_SETUP_SQPOLL);
@@ -7717,29 +8936,32 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
* submitted sync once the chain is complete. If none of those
* conditions are true (normal request), then just queue it.
*/
- if (link->head) {
- struct io_kiocb *head = link->head;
-
- if (!(req->flags & REQ_F_FAIL)) {
- ret = io_req_prep_async(req);
- if (unlikely(ret)) {
- req_fail_link_node(req, ret);
- if (!(head->flags & REQ_F_FAIL))
- req_fail_link_node(head, -ECANCELED);
- }
- }
- trace_io_uring_link(ctx, req, head);
+ if (unlikely(link->head)) {
+ ret = io_req_prep_async(req);
+ if (unlikely(ret))
+ return io_submit_fail_init(sqe, req, ret);
+
+ trace_io_uring_link(ctx, req, link->head);
link->last->link = req;
link->last = req;
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+ if (req->flags & IO_REQ_LINK_FLAGS)
return 0;
- /* last request of a link, enqueue the link */
+ /* last request of the link, flush it */
+ req = link->head;
link->head = NULL;
- req = head;
- } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- link->head = req;
- link->last = req;
+ if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
+ goto fallback;
+
+ } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
+ REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
+ if (req->flags & IO_REQ_LINK_FLAGS) {
+ link->head = req;
+ link->last = req;
+ } else {
+fallback:
+ io_queue_sqe_fallback(req);
+ }
return 0;
}
@@ -7754,8 +8976,8 @@ static void io_submit_state_end(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- if (state->link.head)
- io_queue_sqe(state->link.head);
+ if (unlikely(state->link.head))
+ io_queue_sqe_fallback(state->link.head);
/* flush only after queuing links as they can generate completions */
io_submit_flush_completions(ctx);
if (state->plug_started)
@@ -7809,8 +9031,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
* though the application is the one updating it.
*/
head = READ_ONCE(ctx->sq_array[sq_idx]);
- if (likely(head < ctx->sq_entries))
+ if (likely(head < ctx->sq_entries)) {
+ /* double index for 128-byte SQEs, twice as long */
+ if (ctx->flags & IORING_SETUP_SQE128)
+ head <<= 1;
return &ctx->sq_sqes[head];
+ }
/* drop invalid entries */
ctx->cq_extra--;
@@ -7823,54 +9049,52 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
unsigned int entries = io_sqring_entries(ctx);
- int submitted = 0;
+ unsigned int left;
+ int ret;
if (unlikely(!entries))
return 0;
/* make sure SQ entry isn't read before tail */
- nr = min3(nr, ctx->sq_entries, entries);
- io_get_task_refs(nr);
+ ret = left = min3(nr, ctx->sq_entries, entries);
+ io_get_task_refs(left);
+ io_submit_state_start(&ctx->submit_state, left);
- io_submit_state_start(&ctx->submit_state, nr);
do {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
- if (unlikely(!io_alloc_req_refill(ctx))) {
- if (!submitted)
- submitted = -EAGAIN;
+ if (unlikely(!io_alloc_req_refill(ctx)))
break;
- }
req = io_alloc_req(ctx);
sqe = io_get_sqe(ctx);
if (unlikely(!sqe)) {
- wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+ io_req_add_to_cache(req, ctx);
break;
}
- /* will complete beyond this point, count as submitted */
- submitted++;
- if (io_submit_sqe(ctx, req, sqe)) {
- /*
- * Continue submitting even for sqe failure if the
- * ring was setup with IORING_SETUP_SUBMIT_ALL
- */
- if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
- break;
- }
- } while (submitted < nr);
- if (unlikely(submitted != nr)) {
- int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
- int unused = nr - ref_used;
+ /*
+ * Continue submitting even for sqe failure if the
+ * ring was setup with IORING_SETUP_SUBMIT_ALL
+ */
+ if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
+ !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
+ left--;
+ break;
+ }
+ } while (--left);
- current->io_uring->cached_refs += unused;
+ if (unlikely(left)) {
+ ret -= left;
+ /* try again if it submitted nothing and can't allocate a req */
+ if (!ret && io_req_cache_empty(ctx))
+ ret = -EAGAIN;
+ current->io_uring->cached_refs += left;
}
io_submit_state_end(ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
-
- return submitted;
+ return ret;
}
static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
@@ -7878,23 +9102,6 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
return READ_ONCE(sqd->state);
}
-static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
-{
- /* Tell userspace we may need a wakeup call */
- spin_lock(&ctx->completion_lock);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
- spin_unlock(&ctx->completion_lock);
-}
-
-static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
-{
- spin_lock(&ctx->completion_lock);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
- spin_unlock(&ctx->completion_lock);
-}
-
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
unsigned int to_submit;
@@ -8010,8 +9217,8 @@ static int io_sq_thread(void *data)
bool needs_sched = true;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- io_ring_set_wakeup_flag(ctx);
-
+ atomic_or(IORING_SQ_NEED_WAKEUP,
+ &ctx->rings->sq_flags);
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
!wq_list_empty(&ctx->iopoll_list)) {
needs_sched = false;
@@ -8022,7 +9229,7 @@ static int io_sq_thread(void *data)
* Ensure the store of the wakeup flag is not
* reordered with the load of the SQ tail
*/
- smp_mb();
+ smp_mb__after_atomic();
if (io_sqring_entries(ctx)) {
needs_sched = false;
@@ -8036,7 +9243,8 @@ static int io_sq_thread(void *data)
mutex_lock(&sqd->lock);
}
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_ring_clear_wakeup_flag(ctx);
+ atomic_andnot(IORING_SQ_NEED_WAKEUP,
+ &ctx->rings->sq_flags);
}
finish_wait(&sqd->wait, &wait);
@@ -8046,7 +9254,7 @@ static int io_sq_thread(void *data)
io_uring_cancel_generic(true, sqd);
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_ring_set_wakeup_flag(ctx);
+ atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
io_run_task_work();
mutex_unlock(&sqd->lock);
@@ -8086,7 +9294,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
* Cannot safely flush overflowed CQEs from here, ensure we wake up
* the task, and the next invocation will do it.
*/
- if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
+ if (io_should_wake(iowq) ||
+ test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq))
return autoremove_wake_function(curr, mode, wake_flags, key);
return -1;
}
@@ -8108,15 +9317,18 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
ktime_t timeout)
{
int ret;
+ unsigned long check_cq;
/* make sure we run task_work before checking for signals */
ret = io_run_task_work_sig();
if (ret || io_should_wake(iowq))
return ret;
+ check_cq = READ_ONCE(ctx->check_cq);
/* let the caller flush overflows, retry */
- if (test_bit(0, &ctx->check_cq_overflow))
+ if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
return 1;
-
+ if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
+ return -EBADR;
if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
return -ETIME;
return 1;
@@ -8181,10 +9393,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
- finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
+ finish_wait(&ctx->cq_wait, &iowq.wq);
restore_saved_sigmask_unless(ret == -EINTR);
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
@@ -8422,17 +9634,57 @@ static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
table->files = kvcalloc(nr_files, sizeof(table->files[0]),
GFP_KERNEL_ACCOUNT);
- return !!table->files;
+ if (unlikely(!table->files))
+ return false;
+
+ table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
+ if (unlikely(!table->bitmap)) {
+ kvfree(table->files);
+ return false;
+ }
+
+ return true;
}
static void io_free_file_tables(struct io_file_table *table)
{
kvfree(table->files);
+ bitmap_free(table->bitmap);
table->files = NULL;
+ table->bitmap = NULL;
+}
+
+static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
+{
+ WARN_ON_ONCE(test_bit(bit, table->bitmap));
+ __set_bit(bit, table->bitmap);
+ if (bit == table->alloc_hint)
+ table->alloc_hint++;
+}
+
+static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
+{
+ __clear_bit(bit, table->bitmap);
+ table->alloc_hint = bit;
}
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
+#if !defined(IO_URING_SCM_ALL)
+ int i;
+
+ for (i = 0; i < ctx->nr_user_files; i++) {
+ struct file *file = io_file_from_index(ctx, i);
+
+ if (!file)
+ continue;
+ if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
+ continue;
+ io_file_bitmap_clear(&ctx->file_table, i);
+ fput(file);
+ }
+#endif
+
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
struct sock *sock = ctx->ring_sock->sk;
@@ -8441,16 +9693,6 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
kfree_skb(skb);
}
-#else
- int i;
-
- for (i = 0; i < ctx->nr_user_files; i++) {
- struct file *file;
-
- file = io_file_from_index(ctx, i);
- if (file)
- fput(file);
- }
#endif
io_free_file_tables(&ctx->file_table);
io_rsrc_data_free(ctx->file_data);
@@ -8595,107 +9837,66 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
return sqd;
}
-#if defined(CONFIG_UNIX)
/*
* Ensure the UNIX gc is aware of our file set, so we are certain that
* the io_uring can be safely unregistered on process exit, even if we have
- * loops in the file referencing.
+ * loops in the file referencing. We account only files that can hold other
+ * files because otherwise they can't form a loop and so are not interesting
+ * for GC.
*/
-static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
+static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
+#if defined(CONFIG_UNIX)
struct sock *sk = ctx->ring_sock->sk;
+ struct sk_buff_head *head = &sk->sk_receive_queue;
struct scm_fp_list *fpl;
struct sk_buff *skb;
- int i, nr_files;
- fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
- if (!fpl)
- return -ENOMEM;
-
- skb = alloc_skb(0, GFP_KERNEL);
- if (!skb) {
- kfree(fpl);
- return -ENOMEM;
- }
+ if (likely(!io_file_need_scm(file)))
+ return 0;
- skb->sk = sk;
+ /*
+ * See if we can merge this file into an existing skb SCM_RIGHTS
+ * file set. If there's no room, fall back to allocating a new skb
+ * and filling it in.
+ */
+ spin_lock_irq(&head->lock);
+ skb = skb_peek(head);
+ if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
+ __skb_unlink(skb, head);
+ else
+ skb = NULL;
+ spin_unlock_irq(&head->lock);
- nr_files = 0;
- fpl->user = get_uid(current_user());
- for (i = 0; i < nr; i++) {
- struct file *file = io_file_from_index(ctx, i + offset);
+ if (!skb) {
+ fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
+ if (!fpl)
+ return -ENOMEM;
- if (!file)
- continue;
- fpl->fp[nr_files] = get_file(file);
- unix_inflight(fpl->user, fpl->fp[nr_files]);
- nr_files++;
- }
+ skb = alloc_skb(0, GFP_KERNEL);
+ if (!skb) {
+ kfree(fpl);
+ return -ENOMEM;
+ }
- if (nr_files) {
+ fpl->user = get_uid(current_user());
fpl->max = SCM_MAX_FD;
- fpl->count = nr_files;
+ fpl->count = 0;
+
UNIXCB(skb).fp = fpl;
+ skb->sk = sk;
skb->destructor = unix_destruct_scm;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
- skb_queue_head(&sk->sk_receive_queue, skb);
-
- for (i = 0; i < nr; i++) {
- struct file *file = io_file_from_index(ctx, i + offset);
-
- if (file)
- fput(file);
- }
- } else {
- kfree_skb(skb);
- free_uid(fpl->user);
- kfree(fpl);
- }
-
- return 0;
-}
-
-/*
- * If UNIX sockets are enabled, fd passing can cause a reference cycle which
- * causes regular reference counting to break down. We rely on the UNIX
- * garbage collection to take care of this problem for us.
- */
-static int io_sqe_files_scm(struct io_ring_ctx *ctx)
-{
- unsigned left, total;
- int ret = 0;
-
- total = 0;
- left = ctx->nr_user_files;
- while (left) {
- unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
-
- ret = __io_sqe_files_scm(ctx, this_files, total);
- if (ret)
- break;
- left -= this_files;
- total += this_files;
- }
-
- if (!ret)
- return 0;
-
- while (total < ctx->nr_user_files) {
- struct file *file = io_file_from_index(ctx, total);
-
- if (file)
- fput(file);
- total++;
}
- return ret;
-}
-#else
-static int io_sqe_files_scm(struct io_ring_ctx *ctx)
-{
+ fpl = UNIXCB(skb).fp;
+ fpl->fp[fpl->count++] = get_file(file);
+ unix_inflight(fpl->user, file);
+ skb_queue_head(head, skb);
+ fput(file);
+#endif
return 0;
}
-#endif
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
@@ -8706,6 +9907,11 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
struct sk_buff *skb;
int i;
+ if (!io_file_need_scm(file)) {
+ fput(file);
+ return;
+ }
+
__skb_queue_head_init(&list);
/*
@@ -8770,15 +9976,17 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
list_del(&prsrc->list);
if (prsrc->tag) {
- bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
+ if (ctx->flags & IORING_SETUP_IOPOLL)
+ mutex_lock(&ctx->uring_lock);
- io_ring_submit_lock(ctx, lock_ring);
spin_lock(&ctx->completion_lock);
io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- io_ring_submit_unlock(ctx, lock_ring);
+
+ if (ctx->flags & IORING_SETUP_IOPOLL)
+ mutex_unlock(&ctx->uring_lock);
}
rsrc_data->do_put(ctx, prsrc);
@@ -8832,27 +10040,31 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
if (ret)
return ret;
- ret = -ENOMEM;
- if (!io_alloc_file_tables(&ctx->file_table, nr_args))
- goto out_free;
+ if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
+ io_rsrc_data_free(ctx->file_data);
+ ctx->file_data = NULL;
+ return -ENOMEM;
+ }
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
+ struct io_fixed_file *file_slot;
+
+ if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
ret = -EFAULT;
- goto out_fput;
+ goto fail;
}
/* allow sparse sets */
- if (fd == -1) {
+ if (!fds || fd == -1) {
ret = -EINVAL;
if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
- goto out_fput;
+ goto fail;
continue;
}
file = fget(fd);
ret = -EBADF;
if (unlikely(!file))
- goto out_fput;
+ goto fail;
/*
* Don't allow io_uring instances to be registered. If UNIX
@@ -8863,74 +10075,23 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
*/
if (file->f_op == &io_uring_fops) {
fput(file);
- goto out_fput;
+ goto fail;
}
- io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
- }
-
- ret = io_sqe_files_scm(ctx);
- if (ret) {
- __io_sqe_files_unregister(ctx);
- return ret;
- }
-
- io_rsrc_node_switch(ctx, NULL);
- return ret;
-out_fput:
- for (i = 0; i < ctx->nr_user_files; i++) {
- file = io_file_from_index(ctx, i);
- if (file)
+ ret = io_scm_file_account(ctx, file);
+ if (ret) {
fput(file);
- }
- io_free_file_tables(&ctx->file_table);
- ctx->nr_user_files = 0;
-out_free:
- io_rsrc_data_free(ctx->file_data);
- ctx->file_data = NULL;
- return ret;
-}
-
-static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
- int index)
-{
-#if defined(CONFIG_UNIX)
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff_head *head = &sock->sk_receive_queue;
- struct sk_buff *skb;
-
- /*
- * See if we can merge this file into an existing skb SCM_RIGHTS
- * file set. If there's no room, fall back to allocating a new skb
- * and filling it in.
- */
- spin_lock_irq(&head->lock);
- skb = skb_peek(head);
- if (skb) {
- struct scm_fp_list *fpl = UNIXCB(skb).fp;
-
- if (fpl->count < SCM_MAX_FD) {
- __skb_unlink(skb, head);
- spin_unlock_irq(&head->lock);
- fpl->fp[fpl->count] = get_file(file);
- unix_inflight(fpl->user, fpl->fp[fpl->count]);
- fpl->count++;
- spin_lock_irq(&head->lock);
- __skb_queue_head(head, skb);
- } else {
- skb = NULL;
+ goto fail;
}
- }
- spin_unlock_irq(&head->lock);
-
- if (skb) {
- fput(file);
- return 0;
+ file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, i);
}
- return __io_sqe_files_scm(ctx, 1, index);
-#else
+ io_rsrc_node_switch(ctx, NULL);
return 0;
-#endif
+fail:
+ __io_sqe_files_unregister(ctx);
+ return ret;
}
static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
@@ -8954,12 +10115,11 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index)
{
struct io_ring_ctx *ctx = req->ctx;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret = -EBADF;
- io_ring_submit_lock(ctx, needs_lock);
+ io_ring_submit_lock(ctx, issue_flags);
if (file->f_op == &io_uring_fops)
goto err;
ret = -ENXIO;
@@ -8985,22 +10145,20 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
if (ret)
goto err;
file_slot->file_ptr = 0;
+ io_file_bitmap_clear(&ctx->file_table, slot_index);
needs_switch = true;
}
- *io_get_tag_slot(ctx->file_data, slot_index) = 0;
- io_fixed_file_set(file_slot, file);
- ret = io_sqe_file_register(ctx, file, slot_index);
- if (ret) {
- file_slot->file_ptr = 0;
- goto err;
+ ret = io_scm_file_account(ctx, file);
+ if (!ret) {
+ *io_get_tag_slot(ctx->file_data, slot_index) = 0;
+ io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, slot_index);
}
-
- ret = 0;
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
if (ret)
fput(file);
return ret;
@@ -9010,12 +10168,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
unsigned int offset = req->close.file_slot - 1;
struct io_ring_ctx *ctx = req->ctx;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_fixed_file *file_slot;
struct file *file;
int ret;
- io_ring_submit_lock(ctx, needs_lock);
+ io_ring_submit_lock(ctx, issue_flags);
ret = -ENXIO;
if (unlikely(!ctx->file_data))
goto out;
@@ -9038,10 +10195,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
goto out;
file_slot->file_ptr = 0;
+ io_file_bitmap_clear(&ctx->file_table, offset);
io_rsrc_node_switch(ctx, ctx->file_data);
ret = 0;
out:
- io_ring_submit_unlock(ctx, needs_lock);
+ io_ring_submit_unlock(ctx, issue_flags);
return ret;
}
@@ -9087,6 +10245,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (err)
break;
file_slot->file_ptr = 0;
+ io_file_bitmap_clear(&ctx->file_table, i);
needs_switch = true;
}
if (fd != -1) {
@@ -9108,14 +10267,14 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- *io_get_tag_slot(data, i) = tag;
- io_fixed_file_set(file_slot, file);
- err = io_sqe_file_register(ctx, file, i);
+ err = io_scm_file_account(ctx, file);
if (err) {
- file_slot->file_ptr = 0;
fput(file);
break;
}
+ *io_get_tag_slot(data, i) = tag;
+ io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, i);
}
}
@@ -9195,7 +10354,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prior_task_list);
+ INIT_WQ_LIST(&tctx->prio_task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
@@ -9373,8 +10532,8 @@ static void *io_mem_alloc(size_t size)
return (void *) __get_free_pages(gfp, get_order(size));
}
-static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
- size_t *sq_offset)
+static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
+ unsigned int cq_entries, size_t *sq_offset)
{
struct io_rings *rings;
size_t off, sq_array_size;
@@ -9382,6 +10541,10 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
off = struct_size(rings, cqes, cq_entries);
if (off == SIZE_MAX)
return SIZE_MAX;
+ if (ctx->flags & IORING_SETUP_CQE32) {
+ if (check_shl_overflow(off, 1, &off))
+ return SIZE_MAX;
+ }
#ifdef CONFIG_SMP
off = ALIGN(off, SMP_CACHE_BYTES);
@@ -9543,30 +10706,18 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
- struct io_mapped_ubuf **pimu,
- struct page **last_hpage)
+static struct page **io_pin_pages(unsigned long ubuf, unsigned long len,
+ int *npages)
{
- struct io_mapped_ubuf *imu = NULL;
+ unsigned long start, end, nr_pages;
struct vm_area_struct **vmas = NULL;
struct page **pages = NULL;
- unsigned long off, start, end, ubuf;
- size_t size;
- int ret, pret, nr_pages, i;
+ int i, pret, ret = -ENOMEM;
- if (!iov->iov_base) {
- *pimu = ctx->dummy_ubuf;
- return 0;
- }
-
- ubuf = (unsigned long) iov->iov_base;
- end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
start = ubuf >> PAGE_SHIFT;
nr_pages = end - start;
- *pimu = NULL;
- ret = -ENOMEM;
-
pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
goto done;
@@ -9576,10 +10727,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
if (!vmas)
goto done;
- imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
- if (!imu)
- goto done;
-
ret = 0;
mmap_read_lock(current->mm);
pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
@@ -9597,6 +10744,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
break;
}
}
+ *npages = nr_pages;
} else {
ret = pret < 0 ? pret : -EFAULT;
}
@@ -9610,14 +10758,53 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
unpin_user_pages(pages, pret);
goto done;
}
+ ret = 0;
+done:
+ kvfree(vmas);
+ if (ret < 0) {
+ kvfree(pages);
+ pages = ERR_PTR(ret);
+ }
+ return pages;
+}
+
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
+ struct io_mapped_ubuf **pimu,
+ struct page **last_hpage)
+{
+ struct io_mapped_ubuf *imu = NULL;
+ struct page **pages = NULL;
+ unsigned long off;
+ size_t size;
+ int ret, nr_pages, i;
+
+ if (!iov->iov_base) {
+ *pimu = ctx->dummy_ubuf;
+ return 0;
+ }
+
+ *pimu = NULL;
+ ret = -ENOMEM;
+
+ pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
+ &nr_pages);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ pages = NULL;
+ goto done;
+ }
+
+ imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+ if (!imu)
+ goto done;
- ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
+ ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
if (ret) {
- unpin_user_pages(pages, pret);
+ unpin_user_pages(pages, nr_pages);
goto done;
}
- off = ubuf & ~PAGE_MASK;
+ off = (unsigned long) iov->iov_base & ~PAGE_MASK;
size = iov->iov_len;
for (i = 0; i < nr_pages; i++) {
size_t vec_len;
@@ -9630,8 +10817,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
size -= vec_len;
}
/* store original address for later verification */
- imu->ubuf = ubuf;
- imu->ubuf_end = ubuf + iov->iov_len;
+ imu->ubuf = (unsigned long) iov->iov_base;
+ imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
*pimu = imu;
ret = 0;
@@ -9639,7 +10826,6 @@ done:
if (ret)
kvfree(imu);
kvfree(pages);
- kvfree(vmas);
return ret;
}
@@ -9698,12 +10884,17 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
}
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
- ret = io_copy_iov(ctx, &iov, arg, i);
- if (ret)
- break;
- ret = io_buffer_validate(&iov);
- if (ret)
- break;
+ if (arg) {
+ ret = io_copy_iov(ctx, &iov, arg, i);
+ if (ret)
+ break;
+ ret = io_buffer_validate(&iov);
+ if (ret)
+ break;
+ } else {
+ memset(&iov, 0, sizeof(iov));
+ }
+
if (!iov.iov_base && *io_get_tag_slot(data, i)) {
ret = -EINVAL;
break;
@@ -9842,19 +11033,19 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
+ struct io_buffer_list *bl;
+ unsigned long index;
int i;
- for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
- struct list_head *list = &ctx->io_buffers[i];
-
- while (!list_empty(list)) {
- struct io_buffer_list *bl;
+ for (i = 0; i < BGID_ARRAY; i++) {
+ if (!ctx->io_bl)
+ break;
+ __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
+ }
- bl = list_first_entry(list, struct io_buffer_list, list);
- __io_remove_buffers(ctx, bl, -1U);
- list_del(&bl->list);
- kfree(bl);
- }
+ xa_for_each(&ctx->io_bl_xa, index, bl) {
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+ __io_remove_buffers(ctx, bl, -1U);
}
while (!list_empty(&ctx->io_buffers_pages)) {
@@ -9874,7 +11065,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
mutex_lock(&ctx->uring_lock);
io_flush_cached_locked_reqs(ctx, state);
- while (state->free_list.next) {
+ while (!io_req_cache_empty(ctx)) {
struct io_wq_work_node *node;
struct io_kiocb *req;
@@ -9963,7 +11154,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_hash);
kfree(ctx->dummy_ubuf);
- kfree(ctx->io_buffers);
+ kfree(ctx->io_bl);
+ xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
}
@@ -9994,7 +11186,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
* Users may get EPOLLIN meanwhile seeing nothing in cqring, this
* pushs them to do the flush.
*/
- if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
+ if (io_cqring_events(ctx) ||
+ test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
@@ -10126,8 +11319,7 @@ static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
}
}
spin_unlock_irq(&ctx->timeout_lock);
- if (canceled != 0)
- io_commit_cqring(ctx);
+ io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
if (canceled != 0)
io_cqring_ev_posted(ctx);
@@ -10147,11 +11339,13 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
io_unregister_personality(ctx, index);
mutex_unlock(&ctx->uring_lock);
- io_kill_timeouts(ctx, NULL, true);
- io_poll_remove_all(ctx, NULL, true);
-
- /* if we failed setting up the ctx, we might not have any rings */
- io_iopoll_try_reap_events(ctx);
+ /* failed during ring init, it couldn't have issued any requests */
+ if (ctx->rings) {
+ io_kill_timeouts(ctx, NULL, true);
+ io_poll_remove_all(ctx, NULL, true);
+ /* if we failed setting up the ctx, we might not have any rings */
+ io_iopoll_try_reap_events(ctx);
+ }
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
/*
@@ -10243,6 +11437,10 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
struct io_uring_task *tctx = task ? task->io_uring : NULL;
+ /* failed during ring init, it couldn't have issued any requests */
+ if (!ctx->rings)
+ return;
+
while (1) {
enum io_wq_cancel cret;
bool ret = false;
@@ -10585,7 +11783,7 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
ret = -EFAULT;
break;
}
- if (reg.resv || reg.offset >= IO_RINGFD_REG_MAX) {
+ if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
ret = -EINVAL;
break;
}
@@ -10688,6 +11886,19 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
return 0;
}
+static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
+{
+ if (flags & IORING_ENTER_EXT_ARG) {
+ struct io_uring_getevents_arg arg;
+
+ if (argsz != sizeof(arg))
+ return -EINVAL;
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
struct __kernel_timespec __user **ts,
const sigset_t __user **sig)
@@ -10725,7 +11936,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
size_t, argsz)
{
struct io_ring_ctx *ctx;
- int submitted = 0;
struct fd f;
long ret;
@@ -10788,39 +11998,64 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (ret)
goto out;
}
- submitted = to_submit;
+ ret = to_submit;
} else if (to_submit) {
ret = io_uring_add_tctx_node(ctx);
if (unlikely(ret))
goto out;
- mutex_lock(&ctx->uring_lock);
- submitted = io_submit_sqes(ctx, to_submit);
- mutex_unlock(&ctx->uring_lock);
- if (submitted != to_submit)
+ mutex_lock(&ctx->uring_lock);
+ ret = io_submit_sqes(ctx, to_submit);
+ if (ret != to_submit) {
+ mutex_unlock(&ctx->uring_lock);
goto out;
+ }
+ if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
+ goto iopoll_locked;
+ mutex_unlock(&ctx->uring_lock);
}
if (flags & IORING_ENTER_GETEVENTS) {
- const sigset_t __user *sig;
- struct __kernel_timespec __user *ts;
-
- ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
- if (unlikely(ret))
- goto out;
+ int ret2;
+ if (ctx->syscall_iopoll) {
+ /*
+ * We disallow the app entering submit/complete with
+ * polling, but we still need to lock the ring to
+ * prevent racing with polled issue that got punted to
+ * a workqueue.
+ */
+ mutex_lock(&ctx->uring_lock);
+iopoll_locked:
+ ret2 = io_validate_ext_arg(flags, argp, argsz);
+ if (likely(!ret2)) {
+ min_complete = min(min_complete,
+ ctx->cq_entries);
+ ret2 = io_iopoll_check(ctx, min_complete);
+ }
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ const sigset_t __user *sig;
+ struct __kernel_timespec __user *ts;
+
+ ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
+ if (likely(!ret2)) {
+ min_complete = min(min_complete,
+ ctx->cq_entries);
+ ret2 = io_cqring_wait(ctx, min_complete, sig,
+ argsz, ts);
+ }
+ }
- min_complete = min(min_complete, ctx->cq_entries);
+ if (!ret) {
+ ret = ret2;
- /*
- * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
- * space applications don't need to do io completion events
- * polling again, they can rely on io_sq_thread to do polling
- * work, which can reduce cpu usage and uring_lock contention.
- */
- if (ctx->flags & IORING_SETUP_IOPOLL &&
- !(ctx->flags & IORING_SETUP_SQPOLL)) {
- ret = io_iopoll_check(ctx, min_complete);
- } else {
- ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
+ /*
+ * EBADR indicates that one or more CQE were dropped.
+ * Once the user has been informed we can clear the bit
+ * as they are obviously ok with those drops.
+ */
+ if (unlikely(ret2 == -EBADR))
+ clear_bit(IO_CHECK_CQ_DROPPED_BIT,
+ &ctx->check_cq);
}
}
@@ -10829,7 +12064,7 @@ out:
out_fput:
if (!(flags & IORING_ENTER_REGISTERED_RING))
fdput(f);
- return submitted ? submitted : ret;
+ return ret;
}
#ifdef CONFIG_PROC_FS
@@ -10876,10 +12111,15 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
unsigned int sq_tail = READ_ONCE(r->sq.tail);
unsigned int cq_head = READ_ONCE(r->cq.head);
unsigned int cq_tail = READ_ONCE(r->cq.tail);
+ unsigned int cq_shift = 0;
unsigned int sq_entries, cq_entries;
bool has_lock;
+ bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
unsigned int i;
+ if (is_cqe32)
+ cq_shift = 1;
+
/*
* we may get imprecise sqe and cqe info if uring is actively running
* since we get cached_sq_head and cached_cq_tail without uring_lock
@@ -10912,11 +12152,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
for (i = 0; i < cq_entries; i++) {
unsigned int entry = i + cq_head;
- struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
+ struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
- seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
+ if (!is_cqe32) {
+ seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
entry & cq_mask, cqe->user_data, cqe->res,
cqe->flags);
+ } else {
+ seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
+ "extra1:%llu, extra2:%llu\n",
+ entry & cq_mask, cqe->user_data, cqe->res,
+ cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
+ }
}
/*
@@ -11019,7 +12266,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries;
- size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
+ size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
@@ -11034,7 +12281,10 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
rings->sq_ring_entries = p->sq_entries;
rings->cq_ring_entries = p->cq_entries;
- size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
+ if (p->flags & IORING_SETUP_SQE128)
+ size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
+ else
+ size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (size == SIZE_MAX) {
io_mem_free(ctx->rings);
ctx->rings = NULL;
@@ -11146,11 +12396,41 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx = io_ring_ctx_alloc(p);
if (!ctx)
return -ENOMEM;
+
+ /*
+ * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
+ * space applications don't need to do io completion events
+ * polling again, they can rely on io_sq_thread to do polling
+ * work, which can reduce cpu usage and uring_lock contention.
+ */
+ if (ctx->flags & IORING_SETUP_IOPOLL &&
+ !(ctx->flags & IORING_SETUP_SQPOLL))
+ ctx->syscall_iopoll = 1;
+
ctx->compat = in_compat_syscall();
if (!capable(CAP_IPC_LOCK))
ctx->user = get_uid(current_user());
/*
+ * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
+ * COOP_TASKRUN is set, then IPIs are never needed by the app.
+ */
+ ret = -EINVAL;
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ /* IPI related flags don't make sense with SQPOLL */
+ if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
+ IORING_SETUP_TASKRUN_FLAG))
+ goto err;
+ ctx->notify_method = TWA_SIGNAL_NO_IPI;
+ } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
+ ctx->notify_method = TWA_SIGNAL_NO_IPI;
+ } else {
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ goto err;
+ ctx->notify_method = TWA_SIGNAL;
+ }
+
+ /*
* This is just grabbed for accounting purposes. When a process exits,
* the mm is exited and dropped before the files, hence we need to hang
* on to this mm purely for the purposes of being able to unaccount
@@ -11247,10 +12527,12 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
+ IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
+ IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
+ IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
return -EINVAL;
- return io_uring_create(entries, &p, params);
+ return io_uring_create(entries, &p, params);
}
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
@@ -11463,14 +12745,20 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
memset(&rr, 0, sizeof(rr));
if (copy_from_user(&rr, arg, size))
return -EFAULT;
- if (!rr.nr || rr.resv || rr.resv2)
+ if (!rr.nr || rr.resv2)
+ return -EINVAL;
+ if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
return -EINVAL;
switch (type) {
case IORING_RSRC_FILE:
+ if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
+ break;
return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
rr.nr, u64_to_user_ptr(rr.tags));
case IORING_RSRC_BUFFER:
+ if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
+ break;
return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
rr.nr, u64_to_user_ptr(rr.tags));
}
@@ -11605,6 +12893,85 @@ err:
return ret;
}
+static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_buf_ring *br;
+ struct io_uring_buf_reg reg;
+ struct io_buffer_list *bl;
+ struct page **pages;
+ int nr_pages;
+
+ if (copy_from_user(&reg, arg, sizeof(reg)))
+ return -EFAULT;
+
+ if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
+ return -EINVAL;
+ if (!reg.ring_addr)
+ return -EFAULT;
+ if (reg.ring_addr & ~PAGE_MASK)
+ return -EINVAL;
+ if (!is_power_of_2(reg.ring_entries))
+ return -EINVAL;
+
+ if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
+ int ret = io_init_bl_list(ctx);
+ if (ret)
+ return ret;
+ }
+
+ bl = io_buffer_get_list(ctx, reg.bgid);
+ if (bl) {
+ /* if mapped buffer ring OR classic exists, don't allow */
+ if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
+ return -EEXIST;
+ } else {
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl)
+ return -ENOMEM;
+ }
+
+ pages = io_pin_pages(reg.ring_addr,
+ struct_size(br, bufs, reg.ring_entries),
+ &nr_pages);
+ if (IS_ERR(pages)) {
+ kfree(bl);
+ return PTR_ERR(pages);
+ }
+
+ br = page_address(pages[0]);
+ bl->buf_pages = pages;
+ bl->buf_nr_pages = nr_pages;
+ bl->nr_entries = reg.ring_entries;
+ bl->buf_ring = br;
+ bl->mask = reg.ring_entries - 1;
+ io_buffer_add_list(ctx, bl, reg.bgid);
+ return 0;
+}
+
+static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_buf_reg reg;
+ struct io_buffer_list *bl;
+
+ if (copy_from_user(&reg, arg, sizeof(reg)))
+ return -EFAULT;
+ if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
+ return -EINVAL;
+
+ bl = io_buffer_get_list(ctx, reg.bgid);
+ if (!bl)
+ return -ENOENT;
+ if (!bl->buf_nr_pages)
+ return -EINVAL;
+
+ __io_remove_buffers(ctx, bl, -1U);
+ if (bl->bgid >= BGID_ARRAY) {
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+ kfree(bl);
+ }
+ return 0;
+}
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -11630,6 +12997,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
switch (opcode) {
case IORING_REGISTER_BUFFERS:
+ ret = -EFAULT;
+ if (!arg)
+ break;
ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_BUFFERS:
@@ -11639,6 +13009,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_sqe_buffers_unregister(ctx);
break;
case IORING_REGISTER_FILES:
+ ret = -EFAULT;
+ if (!arg)
+ break;
ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_FILES:
@@ -11733,6 +13106,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
case IORING_UNREGISTER_RING_FDS:
ret = io_ringfd_unregister(ctx, arg, nr_args);
break;
+ case IORING_REGISTER_PBUF_RING:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_pbuf_ring(ctx, arg);
+ break;
+ case IORING_UNREGISTER_PBUF_RING:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_unregister_pbuf_ring(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
@@ -11809,6 +13194,7 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(42, __u16, personality);
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
BUILD_BUG_SQE_ELEM(44, __u32, file_index);
+ BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
sizeof(struct io_uring_rsrc_update));
@@ -11817,6 +13203,10 @@ static int __init io_uring_init(void)
/* ->buf_index is u16 */
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
+ BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE);
+ BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
+ BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
+ offsetof(struct io_uring_buf_ring, tail));
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
@@ -11826,6 +13216,10 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
+ BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
+
+ BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64);
+
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT);
return 0;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b08f5dc31780..80f9b047aa1b 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -56,7 +56,8 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter,
{
atomic_inc(&dio->ref);
- if (dio->iocb->ki_flags & IOCB_HIPRI) {
+ /* Sync dio can't be polled reliably */
+ if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) {
bio_set_polled(bio, dio->iocb);
dio->submit.poll_bio = bio;
}
@@ -265,8 +266,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* cache flushes on IO completion.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_FUA) &&
- blk_queue_fua(bdev_get_queue(iomap->bdev)))
+ (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev))
use_fua = true;
}
@@ -654,9 +654,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!READ_ONCE(dio->submit.waiter))
break;
- if (!dio->submit.poll_bio ||
- !bio_poll(dio->submit.poll_bio, NULL, 0))
- blk_io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 5b9408e3b370..ac7f067b7bdd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -488,7 +488,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_wait_updates(journal);
commit_transaction->t_state = T_SWITCH;
- write_unlock(&journal->j_state_lock);
J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
journal->j_max_transaction_buffers);
@@ -508,6 +507,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* has reserved. This is consistent with the existing behaviour
* that multiple jbd2_journal_get_write_access() calls to the same
* buffer are perfectly permissible.
+ * We use journal->j_state_lock here to serialize processing of
+ * t_reserved_list with eviction of buffers from journal_unmap_buffer().
*/
while (commit_transaction->t_reserved_list) {
jh = commit_transaction->t_reserved_list;
@@ -527,6 +528,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_refile_buffer(journal, jh);
}
+ write_unlock(&journal->j_state_lock);
/*
* Now try to drop any written-back buffers from the journal's
* checkpoint lists. We do this *before* commit because it potentially
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fcacafa4510d..c0cbeeaec2d1 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1762,7 +1762,6 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
unsigned long block, log_offset; /* logical */
unsigned long long phys_block, block_start, block_stop; /* physical */
loff_t byte_start, byte_stop, byte_count;
- struct request_queue *q = bdev_get_queue(journal->j_dev);
/* flags must be set to either discard or zeroout */
if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags ||
@@ -1770,10 +1769,8 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
(flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
return -EINVAL;
- if (!q)
- return -ENXIO;
-
- if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q))
+ if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ !bdev_max_discard_sectors(journal->j_dev))
return -EOPNOTSUPP;
/*
@@ -1828,7 +1825,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
err = blkdev_issue_discard(journal->j_dev,
byte_start >> SECTOR_SHIFT,
byte_count >> SECTOR_SHIFT,
- GFP_NOFS, 0);
+ GFP_NOFS);
} else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) {
err = blkdev_issue_zeroout(journal->j_dev,
byte_start >> SECTOR_SHIFT,
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 03a845ab4f00..1e7b177ece60 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -110,14 +110,13 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FITRIM:
{
struct super_block *sb = inode->i_sb;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
s64 ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q)) {
+ if (!bdev_max_discard_sectors(sb->s_bdev)) {
jfs_warn("FITRIM not supported on device");
return -EOPNOTSUPP;
}
@@ -127,7 +126,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -EFAULT;
range.minlen = max_t(unsigned int, range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(sb->s_bdev));
ret = jfs_ioc_trim(inode, &range);
if (ret < 0)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index f1a13a74cddf..85d4f44f2ac4 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -372,19 +372,16 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
}
case Opt_discard:
- {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
/* if set to 1, even copying files will cause
* trimming :O
* -> user has more control over the online trimming
*/
sbi->minblks_trim = 64;
- if (blk_queue_discard(q))
+ if (bdev_max_discard_sectors(sb->s_bdev))
*flag |= JFS_DISCARD;
else
pr_err("JFS: discard option not supported on device\n");
break;
- }
case Opt_nodiscard:
*flag &= ~JFS_DISCARD;
@@ -392,10 +389,9 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
case Opt_discard_minblk:
{
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
char *minblks_trim = args[0].from;
int rc;
- if (blk_queue_discard(q)) {
+ if (bdev_max_discard_sectors(sb->s_bdev)) {
*flag |= JFS_DISCARD;
rc = kstrtouint(minblks_trim, 0,
&sbi->minblks_trim);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 61a8edc4ba8b..e205fde7163a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1406,7 +1406,12 @@ static void __kernfs_remove(struct kernfs_node *kn)
*/
void kernfs_remove(struct kernfs_node *kn)
{
- struct kernfs_root *root = kernfs_root(kn);
+ struct kernfs_root *root;
+
+ if (!kn)
+ return;
+
+ root = kernfs_root(kn);
down_write(&root->kernfs_rwsem);
__kernfs_remove(kn);
diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c
index 60e7ac62c917..1e2076a53bed 100644
--- a/fs/ksmbd/misc.c
+++ b/fs/ksmbd/misc.c
@@ -158,19 +158,41 @@ out:
* Return : windows path string or error
*/
-char *convert_to_nt_pathname(char *filename)
+char *convert_to_nt_pathname(struct ksmbd_share_config *share,
+ struct path *path)
{
- char *ab_pathname;
+ char *pathname, *ab_pathname, *nt_pathname;
+ int share_path_len = share->path_sz;
- if (strlen(filename) == 0)
- filename = "\\";
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!pathname)
+ return ERR_PTR(-EACCES);
- ab_pathname = kstrdup(filename, GFP_KERNEL);
- if (!ab_pathname)
- return NULL;
+ ab_pathname = d_path(path, pathname, PATH_MAX);
+ if (IS_ERR(ab_pathname)) {
+ nt_pathname = ERR_PTR(-EACCES);
+ goto free_pathname;
+ }
+
+ if (strncmp(ab_pathname, share->path, share_path_len)) {
+ nt_pathname = ERR_PTR(-EACCES);
+ goto free_pathname;
+ }
+
+ nt_pathname = kzalloc(strlen(&ab_pathname[share_path_len]) + 2, GFP_KERNEL);
+ if (!nt_pathname) {
+ nt_pathname = ERR_PTR(-ENOMEM);
+ goto free_pathname;
+ }
+ if (ab_pathname[share_path_len] == '\0')
+ strcpy(nt_pathname, "/");
+ strcat(nt_pathname, &ab_pathname[share_path_len]);
+
+ ksmbd_conv_path_to_windows(nt_pathname);
- ksmbd_conv_path_to_windows(ab_pathname);
- return ab_pathname;
+free_pathname:
+ kfree(pathname);
+ return nt_pathname;
}
int get_nlink(struct kstat *st)
diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h
index 253366bd0951..aae2a252945f 100644
--- a/fs/ksmbd/misc.h
+++ b/fs/ksmbd/misc.h
@@ -14,7 +14,8 @@ struct ksmbd_file;
int match_pattern(const char *str, size_t len, const char *pattern);
int ksmbd_validate_filename(char *filename);
int parse_stream_name(char *filename, char **stream_name, int *s_type);
-char *convert_to_nt_pathname(char *filename);
+char *convert_to_nt_pathname(struct ksmbd_share_config *share,
+ struct path *path);
int get_nlink(struct kstat *st);
void ksmbd_conv_path_to_unix(char *path);
void ksmbd_strip_last_slash(char *path);
diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c
index 23871b18a429..8b5560574d4c 100644
--- a/fs/ksmbd/oplock.c
+++ b/fs/ksmbd/oplock.c
@@ -1694,33 +1694,3 @@ out:
read_unlock(&lease_list_lock);
return ret_op;
}
-
-int smb2_check_durable_oplock(struct ksmbd_file *fp,
- struct lease_ctx_info *lctx, char *name)
-{
- struct oplock_info *opinfo = opinfo_get(fp);
- int ret = 0;
-
- if (opinfo && opinfo->is_lease) {
- if (!lctx) {
- pr_err("open does not include lease\n");
- ret = -EBADF;
- goto out;
- }
- if (memcmp(opinfo->o_lease->lease_key, lctx->lease_key,
- SMB2_LEASE_KEY_SIZE)) {
- pr_err("invalid lease key\n");
- ret = -EBADF;
- goto out;
- }
- if (name && strcmp(fp->filename, name)) {
- pr_err("invalid name reconnect %s\n", name);
- ret = -EINVAL;
- goto out;
- }
- }
-out:
- if (opinfo)
- opinfo_put(opinfo);
- return ret;
-}
diff --git a/fs/ksmbd/oplock.h b/fs/ksmbd/oplock.h
index 0cf7a2b5bbc0..09753448f779 100644
--- a/fs/ksmbd/oplock.h
+++ b/fs/ksmbd/oplock.h
@@ -124,6 +124,4 @@ struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn,
int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci,
struct lease_ctx_info *lctx);
void destroy_lease_table(struct ksmbd_conn *conn);
-int smb2_check_durable_oplock(struct ksmbd_file *fp,
- struct lease_ctx_info *lctx, char *name);
#endif /* __KSMBD_OPLOCK_H */
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 3bf6c56c654c..16c803a9d996 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -11,6 +11,7 @@
#include <linux/statfs.h>
#include <linux/ethtool.h>
#include <linux/falloc.h>
+#include <linux/mount.h>
#include "glob.h"
#include "smbfsctl.h"
@@ -2918,7 +2919,6 @@ int smb2_open(struct ksmbd_work *work)
goto err_out;
}
- fp->filename = name;
fp->cdoption = req->CreateDisposition;
fp->daccess = daccess;
fp->saccess = req->ShareAccess;
@@ -3270,14 +3270,13 @@ err_out1:
if (!rsp->hdr.Status)
rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
- if (!fp || !fp->filename)
- kfree(name);
if (fp)
ksmbd_fd_put(work, fp);
smb2_set_err_rsp(work);
ksmbd_debug(SMB, "Error response: %x\n", rsp->hdr.Status);
}
+ kfree(name);
kfree(lc);
return 0;
@@ -3895,8 +3894,6 @@ int smb2_query_dir(struct ksmbd_work *work)
ksmbd_debug(SMB, "Search pattern is %s\n", srch_ptr);
}
- ksmbd_debug(SMB, "Directory name is %s\n", dir_fp->filename);
-
if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) {
ksmbd_debug(SMB, "Restart directory scan\n");
generic_file_llseek(dir_fp->filp, 0, SEEK_SET);
@@ -4390,9 +4387,9 @@ static int get_file_all_info(struct ksmbd_work *work,
return -EACCES;
}
- filename = convert_to_nt_pathname(fp->filename);
- if (!filename)
- return -ENOMEM;
+ filename = convert_to_nt_pathname(work->tcon->share_conf, &fp->filp->f_path);
+ if (IS_ERR(filename))
+ return PTR_ERR(filename);
inode = file_inode(fp->filp);
generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat);
@@ -4999,15 +4996,17 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
case FS_SECTOR_SIZE_INFORMATION:
{
struct smb3_fs_ss_info *info;
+ unsigned int sector_size =
+ min_t(unsigned int, path.mnt->mnt_sb->s_blocksize, 4096);
info = (struct smb3_fs_ss_info *)(rsp->Buffer);
- info->LogicalBytesPerSector = cpu_to_le32(stfs.f_bsize);
+ info->LogicalBytesPerSector = cpu_to_le32(sector_size);
info->PhysicalBytesPerSectorForAtomicity =
- cpu_to_le32(stfs.f_bsize);
- info->PhysicalBytesPerSectorForPerf = cpu_to_le32(stfs.f_bsize);
+ cpu_to_le32(sector_size);
+ info->PhysicalBytesPerSectorForPerf = cpu_to_le32(sector_size);
info->FSEffPhysicalBytesPerSectorForAtomicity =
- cpu_to_le32(stfs.f_bsize);
+ cpu_to_le32(sector_size);
info->Flags = cpu_to_le32(SSINFO_FLAGS_ALIGNED_DEVICE |
SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE);
info->ByteOffsetForSectorAlignment = 0;
@@ -5683,8 +5682,7 @@ static int set_file_allocation_info(struct ksmbd_work *work,
size = i_size_read(inode);
rc = ksmbd_vfs_truncate(work, fp, alloc_blks * 512);
if (rc) {
- pr_err("truncate failed! filename : %s, err %d\n",
- fp->filename, rc);
+ pr_err("truncate failed!, err %d\n", rc);
return rc;
}
if (size < alloc_blks * 512)
@@ -5714,12 +5712,10 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
* truncated range.
*/
if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) {
- ksmbd_debug(SMB, "filename : %s truncated to newsize %lld\n",
- fp->filename, newsize);
+ ksmbd_debug(SMB, "truncated to newsize %lld\n", newsize);
rc = ksmbd_vfs_truncate(work, fp, newsize);
if (rc) {
- ksmbd_debug(SMB, "truncate failed! filename : %s err %d\n",
- fp->filename, rc);
+ ksmbd_debug(SMB, "truncate failed!, err %d\n", rc);
if (rc != -EAGAIN)
rc = -EBADF;
return rc;
@@ -5765,8 +5761,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
if (parent_fp) {
if (parent_fp->daccess & FILE_DELETE_LE) {
pr_err("parent dir is opened with delete access\n");
+ ksmbd_fd_put(work, parent_fp);
return -ESHARE;
}
+ ksmbd_fd_put(work, parent_fp);
}
next:
return smb2_rename(work, fp, user_ns, rename_info,
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 9cebb6ba555b..dcdd07c6efff 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -398,8 +398,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count,
nbytes = kernel_read(filp, rbuf, count, pos);
if (nbytes < 0) {
- pr_err("smb read failed for (%s), err = %zd\n",
- fp->filename, nbytes);
+ pr_err("smb read failed, err = %zd\n", nbytes);
return nbytes;
}
@@ -875,8 +874,7 @@ int ksmbd_vfs_truncate(struct ksmbd_work *work,
err = vfs_truncate(&filp->f_path, size);
if (err)
- pr_err("truncate failed for filename : %s err %d\n",
- fp->filename, err);
+ pr_err("truncate failed, err %d\n", err);
return err;
}
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index 29c1db66bd0f..c4d59d2735f0 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -328,7 +328,6 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
kfree(smb_lock);
}
- kfree(fp->filename);
if (ksmbd_stream_fd(fp))
kfree(fp->stream.name);
kmem_cache_free(filp_cache, fp);
@@ -497,6 +496,7 @@ struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode)
list_for_each_entry(lfp, &ci->m_fp_list, node) {
if (inode == file_inode(lfp->filp)) {
atomic_dec(&ci->m_count);
+ lfp = ksmbd_fp_get(lfp);
read_unlock(&ci->m_lock);
return lfp;
}
diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h
index 36239ce31afd..fcb13413fa8d 100644
--- a/fs/ksmbd/vfs_cache.h
+++ b/fs/ksmbd/vfs_cache.h
@@ -62,7 +62,6 @@ struct ksmbd_inode {
struct ksmbd_file {
struct file *filp;
- char *filename;
u64 persistent_id;
u64 volatile_id;
diff --git a/fs/namespace.c b/fs/namespace.c
index a0a36bfa3aa0..afe2b64b14f1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4058,10 +4058,22 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
if (err) {
struct mount *p;
- for (p = mnt; p != m; p = next_mnt(p, mnt)) {
+ /*
+ * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
+ * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
+ * mounts and needs to take care to include the first mount.
+ */
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
/* If we had to hold writers unblock them. */
if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
mnt_unhold_writers(p);
+
+ /*
+ * We're done once the first mount we changed got
+ * MNT_WRITE_HOLD unset.
+ */
+ if (p == m)
+ break;
}
}
return err;
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index e2d59bb5e6bb..9a16897e8dc6 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -517,7 +517,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
if (result.negated)
ctx->flags &= ~NFS_MOUNT_SOFTREVAL;
else
- ctx->flags &= NFS_MOUNT_SOFTREVAL;
+ ctx->flags |= NFS_MOUNT_SOFTREVAL;
break;
case Opt_posix:
if (result.negated)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 16106f805ffa..a79f66432bd3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -363,6 +363,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start);
}
+static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version)
+{
+ if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) {
+ fattr->pre_change_attr = version;
+ fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+ }
+}
+
static void nfs4_test_and_free_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
const struct cred *cred)
@@ -6553,7 +6561,9 @@ static void nfs4_delegreturn_release(void *calldata)
pnfs_roc_release(&data->lr.arg, &data->lr.res,
data->res.lr_ret);
if (inode) {
- nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+ nfs4_fattr_set_prechange(&data->fattr,
+ inode_peek_iversion_raw(inode));
+ nfs_refresh_inode(inode, &data->fattr);
nfs_iput_and_deactive(inode);
}
kfree(calldata);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index fec194a666f4..87e1004b606d 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1052,20 +1052,20 @@ out:
static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
{
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
- struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
struct fstrim_range range;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(nilfs->ns_bdev))
return -EOPNOTSUPP;
if (copy_from_user(&range, argp, sizeof(range)))
return -EFAULT;
- range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity);
+ range.minlen = max_t(u64, range.minlen,
+ bdev_discard_granularity(nilfs->ns_bdev));
down_read(&nilfs->ns_segctor_sem);
ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index e385cca2004a..77ff8e95421f 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -1100,7 +1100,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (ret < 0) {
put_bh(su_bh);
goto out_sem;
@@ -1134,7 +1134,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (!ret)
ndiscarded += nblocks;
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index dd48a8f74d57..3b4a079c9617 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -672,7 +672,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (ret < 0)
return ret;
nblocks = 0;
@@ -682,7 +682,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
return ret;
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9b32b76a9c30..a792e21c5309 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1657,6 +1657,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
else
mnt = path.mnt;
+ /*
+ * FAN_RENAME is not allowed on non-dir (for now).
+ * We shouldn't have allowed setting any dirent events in mask of
+ * non-dir, but because we always allowed it, error only if group
+ * was initialized with the new flag FAN_REPORT_TARGET_FID.
+ */
+ ret = -ENOTDIR;
+ if (inode && !S_ISDIR(inode->i_mode) &&
+ ((mask & FAN_RENAME) ||
+ ((mask & FANOTIFY_DIRENT_EVENTS) &&
+ FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID))))
+ goto path_put_and_out;
+
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
if (mnt || !S_ISDIR(inode->i_mode)) {
mask &= ~FAN_EVENT_ON_CHILD;
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 787b53b984ee..15806eeae217 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -22,20 +22,20 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
{
struct fstrim_range __user *user_range;
struct fstrim_range range;
- struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
int err;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sbi->sb->s_bdev))
return -EOPNOTSUPP;
user_range = (struct fstrim_range __user *)arg;
if (copy_from_user(&range, user_range, sizeof(range)))
return -EFAULT;
- range.minlen = max_t(u32, range.minlen, q->limits.discard_granularity);
+ range.minlen = max_t(u32, range.minlen,
+ bdev_discard_granularity(sbi->sb->s_bdev));
err = ntfs_trim_fs(sbi, &range);
if (err < 0)
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 278dcf502410..5781b9e8e3d8 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -882,7 +882,6 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
int err;
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct block_device *bdev = sb->s_bdev;
- struct request_queue *rq;
struct inode *inode;
struct ntfs_inode *ni;
size_t i, tt;
@@ -912,15 +911,14 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
goto out;
}
- rq = bdev_get_queue(bdev);
- if (blk_queue_discard(rq) && rq->limits.discard_granularity) {
- sbi->discard_granularity = rq->limits.discard_granularity;
+ if (bdev_max_discard_sectors(bdev) && bdev_discard_granularity(bdev)) {
+ sbi->discard_granularity = bdev_discard_granularity(bdev);
sbi->discard_granularity_mask_inv =
~(u64)(sbi->discard_granularity - 1);
}
/* Parse boot. */
- err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512,
+ err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev),
bdev_nr_bytes(bdev));
if (err)
goto out;
@@ -1335,7 +1333,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
return 0;
err = blkdev_issue_discard(sb->s_bdev, start >> 9, (end - start) >> 9,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (err == -EOPNOTSUPP)
sbi->flags |= NTFS_FLAGS_NODISCARD;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f59461d85da4..afd54ec66103 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -903,20 +903,19 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FITRIM:
{
struct super_block *sb = inode->i_sb;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sb->s_bdev))
return -EOPNOTSUPP;
if (copy_from_user(&range, argp, sizeof(range)))
return -EFAULT;
- range.minlen = max_t(u64, q->limits.discard_granularity,
+ range.minlen = max_t(u64, bdev_discard_granularity(sb->s_bdev),
range.minlen);
ret = ocfs2_trim_fs(sb, &range);
if (ret < 0)
diff --git a/fs/pipe.c b/fs/pipe.c
index 9648ac15164a..e140ea150bbb 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -804,7 +804,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
goto out_revert_acct;
- pipe->bufs = kvcalloc(pipe_bufs, sizeof(struct pipe_buffer),
+ pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);
if (pipe->bufs) {
@@ -849,7 +849,7 @@ void free_pipe_info(struct pipe_inode_info *pipe)
#endif
if (pipe->tmp_page)
__free_page(pipe->tmp_page);
- kvfree(pipe->bufs);
+ kfree(pipe->bufs);
kfree(pipe);
}
@@ -1264,7 +1264,8 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
if (nr_slots < n)
return -EBUSY;
- bufs = kvcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT);
+ bufs = kcalloc(nr_slots, sizeof(*bufs),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs))
return -ENOMEM;
@@ -1291,7 +1292,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
head = n;
tail = 0;
- kvfree(pipe->bufs);
+ kfree(pipe->bufs);
pipe->bufs = bufs;
pipe->ring_size = nr_slots;
if (pipe->max_usage > nr_slots)
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 80acb6885cf9..962d32468eb4 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -759,9 +759,14 @@ static void posix_acl_fix_xattr_userns(
}
void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
+ struct inode *inode,
void *value, size_t size)
{
struct user_namespace *user_ns = current_user_ns();
+
+ /* Leave ids untouched on non-idmapped mounts. */
+ if (no_idmapping(mnt_userns, i_user_ns(inode)))
+ mnt_userns = &init_user_ns;
if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
return;
posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value,
@@ -769,9 +774,14 @@ void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
}
void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
+ struct inode *inode,
void *value, size_t size)
{
struct user_namespace *user_ns = current_user_ns();
+
+ /* Leave ids untouched on non-idmapped mounts. */
+ if (no_idmapping(mnt_userns, i_user_ns(inode)))
+ mnt_userns = &init_user_ns;
if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
return;
posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value,
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 419760fd77bd..f38bda5b83ec 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -5,14 +5,10 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-__weak void arch_freq_prepare_all(void)
-{
-}
-
extern const struct seq_operations cpuinfo_op;
+
static int cpuinfo_open(struct inode *inode, struct file *file)
{
- arch_freq_prepare_all();
return seq_open(file, &cpuinfo_op);
}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 172c86270b31..913bef0d2a36 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -72,7 +72,7 @@ out:
return 0;
}
-static int seq_fdinfo_open(struct inode *inode, struct file *file)
+static int proc_fdinfo_access_allowed(struct inode *inode)
{
bool allowed = false;
struct task_struct *task = get_proc_task(inode);
@@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
if (!allowed)
return -EACCES;
+ return 0;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
return single_open(file, seq_show, inode);
}
@@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
proc_fdinfo_instantiate);
}
+static int proc_open_fdinfo(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
+ .open = proc_open_fdinfo,
.read = generic_read_dir,
.iterate_shared = proc_readfdinfo,
.llseek = generic_file_llseek,
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 622c844f6d11..8879d052f96c 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -86,17 +86,10 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
int error, i;
struct bio *bio;
- if (page_count <= BIO_MAX_VECS) {
- bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO);
- } else {
- bio = bio_kmalloc(GFP_NOIO, page_count);
- bio_set_dev(bio, sb->s_bdev);
- bio->bi_opf = REQ_OP_READ;
- }
-
+ bio = bio_kmalloc(page_count, GFP_NOIO);
if (!bio)
return -ENOMEM;
-
+ bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ);
bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT);
for (i = 0; i < page_count; ++i) {
@@ -126,7 +119,8 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
out_free_bio:
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
return error;
}
@@ -190,7 +184,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
length |= data[0] << 8;
}
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
compressed = SQUASHFS_COMPRESSED(length);
length = SQUASHFS_COMPRESSED_SIZE(length);
@@ -224,7 +219,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
out_free_bio:
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
out:
if (res < 0) {
ERROR("Failed to read block 0x%llx: %d\n", index, res);
diff --git a/fs/super.c b/fs/super.c
index f1d4a193602d..60f57c7bc0a6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1204,7 +1204,7 @@ static int set_bdev_super(struct super_block *s, void *data)
s->s_dev = s->s_bdev->bd_dev;
s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
- if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue))
+ if (bdev_stable_writes(s->s_bdev))
s->s_iflags |= SB_I_STABLE_WRITES;
return 0;
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 0ed4861b038f..b3d5f97f16cd 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -75,11 +75,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
if (fileident) {
if (adinicb || (offset + lfi < 0)) {
- memcpy(udf_get_fi_ident(sfi), fileident, lfi);
+ memcpy(sfi->impUse + liu, fileident, lfi);
} else if (offset >= 0) {
memcpy(fibh->ebh->b_data + offset, fileident, lfi);
} else {
- memcpy(udf_get_fi_ident(sfi), fileident, -offset);
+ memcpy(sfi->impUse + liu, fileident, -offset);
memcpy(fibh->ebh->b_data, fileident - offset,
lfi + offset);
}
@@ -88,11 +88,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
offset += lfi;
if (adinicb || (offset + padlen < 0)) {
- memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen);
+ memset(sfi->impUse + liu + lfi, 0x00, padlen);
} else if (offset >= 0) {
memset(fibh->ebh->b_data + offset, 0x00, padlen);
} else {
- memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset);
+ memset(sfi->impUse + liu + lfi, 0x00, -offset);
memset(fibh->ebh->b_data, 0x00, padlen + offset);
}
diff --git a/fs/xattr.c b/fs/xattr.c
index 5c8c5175b385..e8dd03e4561e 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -25,6 +25,8 @@
#include <linux/uaccess.h>
+#include "internal.h"
+
static const char *
strcmp_prefix(const char *a, const char *a_prefix)
{
@@ -539,43 +541,76 @@ EXPORT_SYMBOL_GPL(vfs_removexattr);
/*
* Extended attribute SET operations
*/
-static long
-setxattr(struct user_namespace *mnt_userns, struct dentry *d,
- const char __user *name, const void __user *value, size_t size,
- int flags)
+
+int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
{
int error;
- void *kvalue = NULL;
- char kname[XATTR_NAME_MAX + 1];
- if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
+ if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
return -EINVAL;
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
+ error = strncpy_from_user(ctx->kname->name, name,
+ sizeof(ctx->kname->name));
+ if (error == 0 || error == sizeof(ctx->kname->name))
+ return -ERANGE;
if (error < 0)
return error;
- if (size) {
- if (size > XATTR_SIZE_MAX)
+ error = 0;
+ if (ctx->size) {
+ if (ctx->size > XATTR_SIZE_MAX)
return -E2BIG;
- kvalue = kvmalloc(size, GFP_KERNEL);
- if (!kvalue)
- return -ENOMEM;
- if (copy_from_user(kvalue, value, size)) {
- error = -EFAULT;
- goto out;
+
+ ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size);
+ if (IS_ERR(ctx->kvalue)) {
+ error = PTR_ERR(ctx->kvalue);
+ ctx->kvalue = NULL;
}
- if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
- (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
- posix_acl_fix_xattr_from_user(mnt_userns, kvalue, size);
}
- error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags);
-out:
- kvfree(kvalue);
+ return error;
+}
+
+static void setxattr_convert(struct user_namespace *mnt_userns,
+ struct dentry *d, struct xattr_ctx *ctx)
+{
+ if (ctx->size &&
+ ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
+ (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)))
+ posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d),
+ ctx->kvalue, ctx->size);
+}
+
+int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct xattr_ctx *ctx)
+{
+ setxattr_convert(mnt_userns, dentry, ctx);
+ return vfs_setxattr(mnt_userns, dentry, ctx->kname->name,
+ ctx->kvalue, ctx->size, ctx->flags);
+}
+
+static long
+setxattr(struct user_namespace *mnt_userns, struct dentry *d,
+ const char __user *name, const void __user *value, size_t size,
+ int flags)
+{
+ struct xattr_name kname;
+ struct xattr_ctx ctx = {
+ .cvalue = value,
+ .kvalue = NULL,
+ .size = size,
+ .kname = &kname,
+ .flags = flags,
+ };
+ int error;
+ error = setxattr_copy(name, &ctx);
+ if (error)
+ return error;
+
+ error = do_setxattr(mnt_userns, d, &ctx);
+
+ kvfree(ctx.kvalue);
return error;
}
@@ -641,43 +676,61 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
/*
* Extended attribute GET operations
*/
-static ssize_t
-getxattr(struct user_namespace *mnt_userns, struct dentry *d,
- const char __user *name, void __user *value, size_t size)
+ssize_t
+do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+ struct xattr_ctx *ctx)
{
ssize_t error;
- void *kvalue = NULL;
- char kname[XATTR_NAME_MAX + 1];
-
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
- if (error < 0)
- return error;
+ char *kname = ctx->kname->name;
- if (size) {
- if (size > XATTR_SIZE_MAX)
- size = XATTR_SIZE_MAX;
- kvalue = kvzalloc(size, GFP_KERNEL);
- if (!kvalue)
+ if (ctx->size) {
+ if (ctx->size > XATTR_SIZE_MAX)
+ ctx->size = XATTR_SIZE_MAX;
+ ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL);
+ if (!ctx->kvalue)
return -ENOMEM;
}
- error = vfs_getxattr(mnt_userns, d, kname, kvalue, size);
+ error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size);
if (error > 0) {
if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
- posix_acl_fix_xattr_to_user(mnt_userns, kvalue, error);
- if (size && copy_to_user(value, kvalue, error))
+ posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d),
+ ctx->kvalue, error);
+ if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
error = -EFAULT;
- } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
+ } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
/* The file system tried to returned a value bigger
than XATTR_SIZE_MAX bytes. Not possible. */
error = -E2BIG;
}
- kvfree(kvalue);
+ return error;
+}
+
+static ssize_t
+getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+ const char __user *name, void __user *value, size_t size)
+{
+ ssize_t error;
+ struct xattr_name kname;
+ struct xattr_ctx ctx = {
+ .value = value,
+ .kvalue = NULL,
+ .size = size,
+ .kname = &kname,
+ .flags = 0,
+ };
+
+ error = strncpy_from_user(kname.name, name, sizeof(kname.name));
+ if (error == 0 || error == sizeof(kname.name))
+ error = -ERANGE;
+ if (error < 0)
+ return error;
+
+ error = do_getxattr(mnt_userns, d, &ctx);
+ kvfree(ctx.kvalue);
return error;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index e1afb9e503e1..bf4e60871068 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -406,7 +406,7 @@ xfs_buf_alloc_pages(
STATIC int
_xfs_buf_map_pages(
struct xfs_buf *bp,
- uint flags)
+ xfs_buf_flags_t flags)
{
ASSERT(bp->b_flags & _XBF_PAGES);
if (bp->b_page_count == 1) {
@@ -868,7 +868,7 @@ xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
- int flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
@@ -903,7 +903,7 @@ int
xfs_buf_get_uncached(
struct xfs_buftarg *target,
size_t numblks,
- int flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
int error;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index edcb6254fa6a..1ee3056ff9cf 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -22,28 +22,28 @@ struct xfs_buf;
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
-#define XBF_READ (1 << 0) /* buffer intended for reading from device */
-#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
-#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
-#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
-#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
-#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
-#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
-#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */
+#define XBF_READ (1u << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
+#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
/* buffer type flags for write callbacks */
-#define _XBF_INODES (1 << 16)/* inode buffer */
-#define _XBF_DQUOTS (1 << 17)/* dquot buffer */
-#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */
+#define _XBF_INODES (1u << 16)/* inode buffer */
+#define _XBF_DQUOTS (1u << 17)/* dquot buffer */
+#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
/* flags used only internally */
-#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
-#define _XBF_KMEM (1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
+#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */
+#define _XBF_KMEM (1u << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
/* flags used only as arguments to access routines */
-#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */
-#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */
+#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
+#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
typedef unsigned int xfs_buf_flags_t;
@@ -58,7 +58,7 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
{ _XBF_INODES, "INODES" }, \
{ _XBF_DQUOTS, "DQUOTS" }, \
- { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
+ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
@@ -247,11 +247,11 @@ xfs_buf_readahead(
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags,
- struct xfs_buf **bpp);
+int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
+ xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
- size_t numblks, int flags, struct xfs_buf **bpp,
- const struct xfs_buf_ops *ops);
+ size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops);
int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
void xfs_buf_hold(struct xfs_buf *bp);
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 0191de8ce9ce..c6fe3f6ebb6b 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -114,7 +114,7 @@ xfs_trim_extents(
}
trace_xfs_discard_extent(mp, agno, fbno, flen);
- error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
+ error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
if (error)
goto out_del_cursor;
*blocks_trimmed += flen;
@@ -152,8 +152,8 @@ xfs_ioc_trim(
struct xfs_mount *mp,
struct fstrim_range __user *urange)
{
- struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
- unsigned int granularity = q->limits.discard_granularity;
+ unsigned int granularity =
+ bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
struct fstrim_range range;
xfs_daddr_t start, end, minlen;
xfs_agnumber_t start_agno, end_agno, agno;
@@ -162,7 +162,7 @@ xfs_ioc_trim(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev))
return -EOPNOTSUPP;
/*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9de6205fe134..39ae53efb3ab 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2594,14 +2594,13 @@ xfs_ifree_cluster(
}
/*
- * This is called to return an inode to the inode free list.
- * The inode should already be truncated to 0 length and have
- * no pages associated with it. This routine also assumes that
- * the inode is already a part of the transaction.
+ * This is called to return an inode to the inode free list. The inode should
+ * already be truncated to 0 length and have no pages associated with it. This
+ * routine also assumes that the inode is already a part of the transaction.
*
- * The on-disk copy of the inode will have been added to the list
- * of unlinked inodes in the AGI. We need to remove the inode from
- * that list atomically with respect to freeing it here.
+ * The on-disk copy of the inode will have been added to the list of unlinked
+ * inodes in the AGI. We need to remove the inode from that list atomically with
+ * respect to freeing it here.
*/
int
xfs_ifree(
@@ -2623,13 +2622,16 @@ xfs_ifree(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
/*
- * Pull the on-disk inode from the AGI unlinked list.
+ * Free the inode first so that we guarantee that the AGI lock is going
+ * to be taken before we remove the inode from the unlinked list. This
+ * makes the AGI lock -> unlinked list modification order the same as
+ * used in O_TMPFILE creation.
*/
- error = xfs_iunlink_remove(tp, pag, ip);
+ error = xfs_difree(tp, pag, ip->i_ino, &xic);
if (error)
- goto out;
+ return error;
- error = xfs_difree(tp, pag, ip->i_ino, &xic);
+ error = xfs_iunlink_remove(tp, pag, ip);
if (error)
goto out;
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ba57323bfdce..c9f55e4f0957 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -605,7 +605,7 @@ xlog_discard_busy_extents(
error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
- GFP_NOFS, 0, &bio);
+ GFP_NOFS, &bio);
if (error && error != -EOPNOTSUPP) {
xfs_info(mp,
"discard failed for extent [0x%llx,%u], error %d",
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 54be9d64093e..a276b8111f63 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1608,14 +1608,10 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
- if (xfs_has_discard(mp)) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
-
- if (!blk_queue_discard(q)) {
- xfs_warn(mp, "mounting with \"discard\" option, but "
- "the device does not support discard");
- mp->m_features &= ~XFS_FEAT_DISCARD;
- }
+ if (xfs_has_discard(mp) && !bdev_max_discard_sectors(sb->s_bdev)) {
+ xfs_warn(mp,
+ "mounting with \"discard\" option, but the device does not support discard");
+ mp->m_features &= ~XFS_FEAT_DISCARD;
}
if (xfs_has_reflink(mp)) {
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index de177842b951..0c82673238f4 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -175,7 +175,7 @@ xfs_trans_get_buf(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
int numblks,
- uint flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile
index 33c1a4f1132e..9fe54f5319f2 100644
--- a/fs/zonefs/Makefile
+++ b/fs/zonefs/Makefile
@@ -3,4 +3,4 @@ ccflags-y += -I$(src)
obj-$(CONFIG_ZONEFS_FS) += zonefs.o
-zonefs-y := super.o
+zonefs-y := super.o sysfs.o
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 3614c7834007..b3b0b71fdf6c 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -27,6 +27,39 @@
#define CREATE_TRACE_POINTS
#include "trace.h"
+/*
+ * Manage the active zone count. Called with zi->i_truncate_mutex held.
+ */
+static void zonefs_account_active(struct inode *inode)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+ lockdep_assert_held(&zi->i_truncate_mutex);
+
+ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+ return;
+
+ /*
+ * If the zone is active, that is, if it is explicitly open or
+ * partially written, check if it was already accounted as active.
+ */
+ if ((zi->i_flags & ZONEFS_ZONE_OPEN) ||
+ (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) {
+ if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) {
+ zi->i_flags |= ZONEFS_ZONE_ACTIVE;
+ atomic_inc(&sbi->s_active_seq_files);
+ }
+ return;
+ }
+
+ /* The zone is not active. If it was, update the active count */
+ if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
+ zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
+ atomic_dec(&sbi->s_active_seq_files);
+ }
+}
+
static inline int zonefs_zone_mgmt(struct inode *inode,
enum req_opf op)
{
@@ -35,6 +68,17 @@ static inline int zonefs_zone_mgmt(struct inode *inode,
lockdep_assert_held(&zi->i_truncate_mutex);
+ /*
+ * With ZNS drives, closing an explicitly open zone that has not been
+ * written will change the zone state to "closed", that is, the zone
+ * will remain active. Since this can then cause failure of explicit
+ * open operation on other zones if the drive active zone resources
+ * are exceeded, make sure that the zone does not remain active by
+ * resetting it.
+ */
+ if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset)
+ op = REQ_OP_ZONE_RESET;
+
trace_zonefs_zone_mgmt(inode, op);
ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
@@ -57,8 +101,13 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
* A full zone is no longer open/active and does not need
* explicit closing.
*/
- if (isize >= zi->i_max_size)
- zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+ if (isize >= zi->i_max_size) {
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+
+ if (zi->i_flags & ZONEFS_ZONE_ACTIVE)
+ atomic_dec(&sbi->s_active_seq_files);
+ zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
+ }
}
static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@ -386,6 +435,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
zonefs_update_stats(inode, data_size);
zonefs_i_size_write(inode, data_size);
zi->i_wpoffset = data_size;
+ zonefs_account_active(inode);
return 0;
}
@@ -497,6 +547,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
zonefs_update_stats(inode, isize);
truncate_setsize(inode, isize);
zi->i_wpoffset = isize;
+ zonefs_account_active(inode);
unlock:
mutex_unlock(&zi->i_truncate_mutex);
@@ -678,13 +729,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct block_device *bdev = inode->i_sb->s_bdev;
- unsigned int max;
+ unsigned int max = bdev_max_zone_append_sectors(bdev);
struct bio *bio;
ssize_t size;
int nr_pages;
ssize_t ret;
- max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
iov_iter_truncate(from, max);
@@ -855,8 +905,15 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
count = ret;
+
+ /*
+ * Update the zone write pointer offset assuming the write
+ * operation succeeded. If it did not, the error recovery path
+ * will correct it. Also do active seq file accounting.
+ */
mutex_lock(&zi->i_truncate_mutex);
zi->i_wpoffset += count;
+ zonefs_account_active(inode);
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -998,13 +1055,13 @@ inode_unlock:
return ret;
}
-static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file)
+/*
+ * Write open accounting is done only for sequential files.
+ */
+static inline bool zonefs_seq_file_need_wro(struct inode *inode,
+ struct file *file)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
-
- if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
- return false;
if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
return false;
@@ -1015,28 +1072,34 @@ static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *fi
return true;
}
-static int zonefs_open_zone(struct inode *inode)
+static int zonefs_seq_file_write_open(struct inode *inode)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
int ret = 0;
mutex_lock(&zi->i_truncate_mutex);
if (!zi->i_wr_refcnt) {
- if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
- atomic_dec(&sbi->s_open_zones);
- ret = -EBUSY;
- goto unlock;
- }
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+ unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
- if (i_size_read(inode) < zi->i_max_size) {
- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
- if (ret) {
- atomic_dec(&sbi->s_open_zones);
+ if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+
+ if (wro > sbi->s_max_wro_seq_files) {
+ atomic_dec(&sbi->s_wro_seq_files);
+ ret = -EBUSY;
goto unlock;
}
- zi->i_flags |= ZONEFS_ZONE_OPEN;
+
+ if (i_size_read(inode) < zi->i_max_size) {
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+ if (ret) {
+ atomic_dec(&sbi->s_wro_seq_files);
+ goto unlock;
+ }
+ zi->i_flags |= ZONEFS_ZONE_OPEN;
+ zonefs_account_active(inode);
+ }
}
}
@@ -1056,30 +1119,31 @@ static int zonefs_file_open(struct inode *inode, struct file *file)
if (ret)
return ret;
- if (zonefs_file_use_exp_open(inode, file))
- return zonefs_open_zone(inode);
+ if (zonefs_seq_file_need_wro(inode, file))
+ return zonefs_seq_file_write_open(inode);
return 0;
}
-static void zonefs_close_zone(struct inode *inode)
+static void zonefs_seq_file_write_close(struct inode *inode)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
int ret = 0;
mutex_lock(&zi->i_truncate_mutex);
- zi->i_wr_refcnt--;
- if (!zi->i_wr_refcnt) {
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
- struct super_block *sb = inode->i_sb;
- /*
- * If the file zone is full, it is not open anymore and we only
- * need to decrement the open count.
- */
- if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
- goto dec;
+ zi->i_wr_refcnt--;
+ if (zi->i_wr_refcnt)
+ goto unlock;
+ /*
+ * The file zone may not be open anymore (e.g. the file was truncated to
+ * its maximum size or it was fully written). For this case, we only
+ * need to decrement the write open count.
+ */
+ if (zi->i_flags & ZONEFS_ZONE_OPEN) {
ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
if (ret) {
__zonefs_io_error(inode, false);
@@ -1091,14 +1155,23 @@ static void zonefs_close_zone(struct inode *inode)
*/
if (zi->i_flags & ZONEFS_ZONE_OPEN &&
!(sb->s_flags & SB_RDONLY)) {
- zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
+ zonefs_warn(sb,
+ "closing zone at %llu failed %d\n",
+ zi->i_zsector, ret);
+ zonefs_warn(sb,
+ "remounting filesystem read-only\n");
sb->s_flags |= SB_RDONLY;
}
+ goto unlock;
}
+
zi->i_flags &= ~ZONEFS_ZONE_OPEN;
-dec:
- atomic_dec(&sbi->s_open_zones);
+ zonefs_account_active(inode);
}
+
+ atomic_dec(&sbi->s_wro_seq_files);
+
+unlock:
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -1110,8 +1183,8 @@ static int zonefs_file_release(struct inode *inode, struct file *file)
* the zone has gone offline or read-only). Make sure we don't fail the
* close(2) for user-space.
*/
- if (zonefs_file_use_exp_open(inode, file))
- zonefs_close_zone(inode);
+ if (zonefs_seq_file_need_wro(inode, file))
+ zonefs_seq_file_write_close(inode);
return 0;
}
@@ -1142,6 +1215,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
inode_init_once(&zi->i_vnode);
mutex_init(&zi->i_truncate_mutex);
zi->i_wr_refcnt = 0;
+ zi->i_flags = 0;
return &zi->i_vnode;
}
@@ -1293,12 +1367,13 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
inc_nlink(parent);
}
-static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
- enum zonefs_ztype type)
+static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
+ enum zonefs_ztype type)
{
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ int ret;
inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
inode->i_mode = S_IFREG | sbi->s_perm;
@@ -1323,6 +1398,29 @@ static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
+
+ mutex_lock(&zi->i_truncate_mutex);
+
+ /*
+ * For sequential zones, make sure that any open zone is closed first
+ * to ensure that the initial number of open zones is 0, in sync with
+ * the open zone accounting done when the mount option
+ * ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
+ */
+ if (type == ZONEFS_ZTYPE_SEQ &&
+ (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
+ zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+ if (ret)
+ goto unlock;
+ }
+
+ zonefs_account_active(inode);
+
+unlock:
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ return 0;
}
static struct dentry *zonefs_create_inode(struct dentry *parent,
@@ -1332,6 +1430,7 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
struct inode *dir = d_inode(parent);
struct dentry *dentry;
struct inode *inode;
+ int ret;
dentry = d_alloc_name(parent, name);
if (!dentry)
@@ -1342,10 +1441,16 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
goto dput;
inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
- if (zone)
- zonefs_init_file_inode(inode, zone, type);
- else
+ if (zone) {
+ ret = zonefs_init_file_inode(inode, zone, type);
+ if (ret) {
+ iput(inode);
+ goto dput;
+ }
+ } else {
zonefs_init_dir_inode(dir, inode, type);
+ }
+
d_add(dentry, inode);
dir->i_size++;
@@ -1652,14 +1757,18 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_gid = GLOBAL_ROOT_GID;
sbi->s_perm = 0640;
sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
- sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
- atomic_set(&sbi->s_open_zones, 0);
- if (!sbi->s_max_open_zones &&
+
+ atomic_set(&sbi->s_wro_seq_files, 0);
+ sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
+ if (!sbi->s_max_wro_seq_files &&
sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
}
+ atomic_set(&sbi->s_active_seq_files, 0);
+ sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
+
ret = zonefs_read_super(sb);
if (ret)
return ret;
@@ -1674,6 +1783,10 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
if (ret)
goto cleanup;
+ ret = zonefs_sysfs_register(sb);
+ if (ret)
+ goto cleanup;
+
zonefs_info(sb, "Mounting %u zones",
blkdev_nr_zones(sb->s_bdev->bd_disk));
@@ -1719,6 +1832,8 @@ static void zonefs_kill_super(struct super_block *sb)
if (sb->s_root)
d_genocide(sb->s_root);
+
+ zonefs_sysfs_unregister(sb);
kill_block_super(sb);
kfree(sbi);
}
@@ -1766,16 +1881,26 @@ static int __init zonefs_init(void)
return ret;
ret = register_filesystem(&zonefs_type);
- if (ret) {
- zonefs_destroy_inodecache();
- return ret;
- }
+ if (ret)
+ goto destroy_inodecache;
+
+ ret = zonefs_sysfs_init();
+ if (ret)
+ goto unregister_fs;
return 0;
+
+unregister_fs:
+ unregister_filesystem(&zonefs_type);
+destroy_inodecache:
+ zonefs_destroy_inodecache();
+
+ return ret;
}
static void __exit zonefs_exit(void)
{
+ zonefs_sysfs_exit();
zonefs_destroy_inodecache();
unregister_filesystem(&zonefs_type);
}
diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c
new file mode 100644
index 000000000000..9cb6755ce39a
--- /dev/null
+++ b/fs/zonefs/sysfs.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Simple file system for zoned block devices exposing zones as files.
+ *
+ * Copyright (C) 2022 Western Digital Corporation or its affiliates.
+ */
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+
+#include "zonefs.h"
+
+struct zonefs_sysfs_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf);
+};
+
+static inline struct zonefs_sysfs_attr *to_attr(struct attribute *attr)
+{
+ return container_of(attr, struct zonefs_sysfs_attr, attr);
+}
+
+#define ZONEFS_SYSFS_ATTR_RO(name) \
+static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name)
+
+#define ATTR_LIST(name) &zonefs_sysfs_attr_##name.attr
+
+static ssize_t zonefs_sysfs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct zonefs_sb_info *sbi =
+ container_of(kobj, struct zonefs_sb_info, s_kobj);
+ struct zonefs_sysfs_attr *zonefs_attr =
+ container_of(attr, struct zonefs_sysfs_attr, attr);
+
+ if (!zonefs_attr->show)
+ return 0;
+
+ return zonefs_attr->show(sbi, buf);
+}
+
+static ssize_t max_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", sbi->s_max_wro_seq_files);
+}
+ZONEFS_SYSFS_ATTR_RO(max_wro_seq_files);
+
+static ssize_t nr_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_wro_seq_files));
+}
+ZONEFS_SYSFS_ATTR_RO(nr_wro_seq_files);
+
+static ssize_t max_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", sbi->s_max_active_seq_files);
+}
+ZONEFS_SYSFS_ATTR_RO(max_active_seq_files);
+
+static ssize_t nr_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_active_seq_files));
+}
+ZONEFS_SYSFS_ATTR_RO(nr_active_seq_files);
+
+static struct attribute *zonefs_sysfs_attrs[] = {
+ ATTR_LIST(max_wro_seq_files),
+ ATTR_LIST(nr_wro_seq_files),
+ ATTR_LIST(max_active_seq_files),
+ ATTR_LIST(nr_active_seq_files),
+ NULL,
+};
+ATTRIBUTE_GROUPS(zonefs_sysfs);
+
+static void zonefs_sysfs_sb_release(struct kobject *kobj)
+{
+ struct zonefs_sb_info *sbi =
+ container_of(kobj, struct zonefs_sb_info, s_kobj);
+
+ complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops zonefs_sysfs_attr_ops = {
+ .show = zonefs_sysfs_attr_show,
+};
+
+static struct kobj_type zonefs_sb_ktype = {
+ .default_groups = zonefs_sysfs_groups,
+ .sysfs_ops = &zonefs_sysfs_attr_ops,
+ .release = zonefs_sysfs_sb_release,
+};
+
+static struct kobject *zonefs_sysfs_root;
+
+int zonefs_sysfs_register(struct super_block *sb)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ int ret;
+
+ init_completion(&sbi->s_kobj_unregister);
+ ret = kobject_init_and_add(&sbi->s_kobj, &zonefs_sb_ktype,
+ zonefs_sysfs_root, "%s", sb->s_id);
+ if (ret) {
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ return ret;
+ }
+
+ sbi->s_sysfs_registered = true;
+
+ return 0;
+}
+
+void zonefs_sysfs_unregister(struct super_block *sb)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+
+ if (!sbi || !sbi->s_sysfs_registered)
+ return;
+
+ kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+}
+
+int __init zonefs_sysfs_init(void)
+{
+ zonefs_sysfs_root = kobject_create_and_add("zonefs", fs_kobj);
+ if (!zonefs_sysfs_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void zonefs_sysfs_exit(void)
+{
+ kobject_put(zonefs_sysfs_root);
+ zonefs_sysfs_root = NULL;
+}
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 7b147907c328..4b3de66c3233 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -12,6 +12,7 @@
#include <linux/uuid.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
+#include <linux/kobject.h>
/*
* Maximum length of file names: this only needs to be large enough to fit
@@ -39,6 +40,7 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
}
#define ZONEFS_ZONE_OPEN (1 << 0)
+#define ZONEFS_ZONE_ACTIVE (1 << 1)
/*
* In-memory inode data.
@@ -182,8 +184,15 @@ struct zonefs_sb_info {
loff_t s_blocks;
loff_t s_used_blocks;
- unsigned int s_max_open_zones;
- atomic_t s_open_zones;
+ unsigned int s_max_wro_seq_files;
+ atomic_t s_wro_seq_files;
+
+ unsigned int s_max_active_seq_files;
+ atomic_t s_active_seq_files;
+
+ bool s_sysfs_registered;
+ struct kobject s_kobj;
+ struct completion s_kobj_unregister;
};
static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
@@ -198,4 +207,9 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
#define zonefs_warn(sb, format, args...) \
pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args)
+int zonefs_sysfs_register(struct super_block *sb);
+void zonefs_sysfs_unregister(struct super_block *sb);
+int zonefs_sysfs_init(void);
+void zonefs_sysfs_exit(void);
+
#endif