aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/compression.c7
-rw-r--r--fs/btrfs/ctree.c2
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/extent-tree.c27
-rw-r--r--fs/btrfs/extent_io.c6
-rw-r--r--fs/btrfs/file-item.c7
-rw-r--r--fs/btrfs/file.c4
-rw-r--r--fs/btrfs/inode.c18
-rw-r--r--fs/btrfs/ioctl.c26
-rw-r--r--fs/btrfs/qgroup.c4
-rw-r--r--fs/btrfs/relocation.c1
-rw-r--r--fs/btrfs/send.c6
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c4
-rw-r--r--fs/btrfs/tests/qgroup-tests.c4
-rw-r--r--fs/btrfs/tree-checker.c20
-rw-r--r--fs/btrfs/tree-log.c52
-rw-r--r--fs/btrfs/uuid-tree.c2
-rw-r--r--fs/btrfs/volumes.c4
-rw-r--r--fs/cifs/cifsglob.h1
-rw-r--r--fs/cifs/readdir.c63
-rw-r--r--fs/cifs/smb2file.c2
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/ext4/block_validity.c6
-rw-r--r--fs/ext4/dir.c6
-rw-r--r--fs/ext4/ialloc.c4
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/ext4/namei.c36
-rw-r--r--fs/ext4/super.c143
-rw-r--r--fs/file.c7
-rw-r--r--fs/hugetlbfs/inode.c4
-rw-r--r--fs/inode.c7
-rw-r--r--fs/io-wq.c12
-rw-r--r--fs/io-wq.h8
-rw-r--r--fs/io_uring.c1170
-rw-r--r--fs/locks.c2
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/notify/fsnotify.c4
-rw-r--r--fs/nsfs.c3
-rw-r--r--fs/ocfs2/dlmglue.c1
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/posix_acl.c7
-rw-r--r--fs/proc/stat.c4
-rw-r--r--fs/pstore/ram.c13
-rw-r--r--fs/pstore/ram_core.c2
-rw-r--r--fs/quota/dquot.c1
-rw-r--r--fs/super.c4
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c18
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c5
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c21
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h29
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c6
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c64
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h1
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c96
-rw-r--r--fs/xfs/xfs_bmap_util.c12
-rw-r--r--fs/xfs/xfs_buf_item.c2
-rw-r--r--fs/xfs/xfs_mount.c168
-rw-r--r--fs/xfs/xfs_trace.h21
60 files changed, 1427 insertions, 742 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ee834ef7beb4..43e1660f450f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -447,7 +447,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
if (blkcg_css) {
bio->bi_opf |= REQ_CGROUP_PUNT;
- bio_associate_blkg_from_css(bio, blkcg_css);
+ kthread_associate_blkcg(blkcg_css);
}
refcount_set(&cb->pending_bios, 1);
@@ -491,6 +491,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
+ if (blkcg_css)
+ bio->bi_opf |= REQ_CGROUP_PUNT;
bio_add_page(bio, page, PAGE_SIZE, 0);
}
if (bytes_left < PAGE_SIZE) {
@@ -517,6 +519,9 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_endio(bio);
}
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
+
return 0;
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5b6e86aaf2e1..24658b5a5787 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -379,7 +379,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
tm = rb_entry(node, struct tree_mod_elem, node);
- if (tm->seq > min_seq)
+ if (tm->seq >= min_seq)
continue;
rb_erase(node, tm_root);
kfree(tm);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b2e8fd8a8e59..54efb21c2727 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2787,7 +2787,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
/* file-item.c */
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
+ struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u8 *dst);
blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 153f71a5bba9..274318e9114e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1869,8 +1869,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
btrfs_pin_extent(fs_info, head->bytenr,
head->num_bytes, 1);
if (head->is_data) {
- ret = btrfs_del_csums(trans, fs_info, head->bytenr,
- head->num_bytes);
+ ret = btrfs_del_csums(trans, fs_info->csum_root,
+ head->bytenr, head->num_bytes);
}
}
@@ -3175,7 +3175,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (is_data) {
- ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
+ ret = btrfs_del_csums(trans, info->csum_root, bytenr,
+ num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3799,6 +3800,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 flags, int delalloc)
{
int ret = 0;
+ int cache_block_group_error = 0;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group *block_group = NULL;
struct find_free_extent_ctl ffe_ctl = {0};
@@ -3958,7 +3960,20 @@ have_block_group:
if (unlikely(!ffe_ctl.cached)) {
ffe_ctl.have_caching_bg = true;
ret = btrfs_cache_block_group(block_group, 0);
- BUG_ON(ret < 0);
+
+ /*
+ * If we get ENOMEM here or something else we want to
+ * try other block groups, because it may not be fatal.
+ * However if we can't find anything else we need to
+ * save our return here so that we return the actual
+ * error that caused problems, not ENOSPC.
+ */
+ if (ret < 0) {
+ if (!cache_block_group_error)
+ cache_block_group_error = ret;
+ ret = 0;
+ goto loop;
+ }
ret = 0;
}
@@ -4045,7 +4060,7 @@ loop:
if (ret > 0)
goto search;
- if (ret == -ENOSPC) {
+ if (ret == -ENOSPC && !cache_block_group_error) {
/*
* Use ffe_ctl->total_free_space as fallback if we can't find
* any contiguous hole.
@@ -4056,6 +4071,8 @@ loop:
space_info->max_extent_size = ffe_ctl.max_extent_size;
spin_unlock(&space_info->lock);
ins->offset = ffe_ctl.max_extent_size;
+ } else if (ret == -ENOSPC) {
+ ret = cache_block_group_error;
}
return ret;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb8bd0258360..2f4802f405a2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5074,12 +5074,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
eb = alloc_dummy_extent_buffer(fs_info, start);
if (!eb)
- return NULL;
+ return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
again:
ret = radix_tree_preload(GFP_NOFS);
- if (ret)
+ if (ret) {
+ exists = ERR_PTR(ret);
goto free_eb;
+ }
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
start >> PAGE_SHIFT, eb);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 3270a40b0777..b1bfdc5c1387 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -590,9 +590,9 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
* range of bytes.
*/
int btrfs_del_csums(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr, u64 len)
+ struct btrfs_root *root, u64 bytenr, u64 len)
{
- struct btrfs_root *root = fs_info->csum_root;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path;
struct btrfs_key key;
u64 end_byte = bytenr + len;
@@ -602,6 +602,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int blocksize_bits = fs_info->sb->s_blocksize_bits;
+ ASSERT(root == fs_info->csum_root ||
+ root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0cb43b682789..8d47c76b7bd1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2599,8 +2599,8 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
}
}
- if (clone_info) {
- u64 clone_len = drop_end - cur_offset;
+ if (clone_info && drop_end > clone_info->file_offset) {
+ u64 clone_len = drop_end - clone_info->file_offset;
ret = btrfs_insert_clone_extent(trans, inode, path,
clone_info, clone_len);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 56032c518b26..5509c41a4f43 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1479,10 +1479,10 @@ next_slot:
disk_num_bytes =
btrfs_file_extent_disk_num_bytes(leaf, fi);
/*
- * If extent we got ends before our range starts, skip
- * to next extent
+ * If the extent we got ends before our current offset,
+ * skip to the next extent.
*/
- if (extent_end <= start) {
+ if (extent_end <= cur_offset) {
path->slots[0]++;
goto next_slot;
}
@@ -5728,7 +5728,6 @@ static void inode_tree_add(struct inode *inode)
static void inode_tree_del(struct inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
int empty = 0;
@@ -5741,7 +5740,6 @@ static void inode_tree_del(struct inode *inode)
spin_unlock(&root->inode_lock);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
- synchronize_srcu(&fs_info->subvol_srcu);
spin_lock(&root->inode_lock);
empty = RB_EMPTY_ROOT(&root->inode_tree);
spin_unlock(&root->inode_lock);
@@ -9556,9 +9554,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
btrfs_init_log_ctx(&ctx_dest, new_inode);
/* close the race window with snapshot create/destroy ioctl */
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
- down_read(&fs_info->subvol_sem);
- if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&fs_info->subvol_sem);
/*
@@ -9792,9 +9789,8 @@ out_fail:
ret = ret ? ret : ret2;
}
out_notrans:
- if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
- up_read(&fs_info->subvol_sem);
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
+ old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
ASSERT(list_empty(&ctx_root.list));
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a1ee0b775e65..18e328ce4b54 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -704,11 +704,17 @@ static noinline int create_subvol(struct inode *dir,
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
ret = btrfs_update_inode(trans, root, dir);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
btrfs_ino(BTRFS_I(dir)), index, name, namelen);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
ret = btrfs_uuid_tree_add(trans, root_item->uuid,
BTRFS_UUID_KEY_SUBVOL, objectid);
@@ -3720,24 +3726,18 @@ process_slot:
ret = 0;
if (last_dest_end < destoff + len) {
- struct btrfs_clone_extent_info clone_info = { 0 };
/*
- * We have an implicit hole (NO_HOLES feature is enabled) that
- * fully or partially overlaps our cloning range at its end.
+ * We have an implicit hole that fully or partially overlaps our
+ * cloning range at its end. This means that we either have the
+ * NO_HOLES feature enabled or the implicit hole happened due to
+ * mixing buffered and direct IO writes against this file.
*/
btrfs_release_path(path);
path->leave_spinning = 0;
- /*
- * We are dealing with a hole and our clone_info already has a
- * disk_offset of 0, we only need to fill the data length and
- * file offset.
- */
- clone_info.data_len = destoff + len - last_dest_end;
- clone_info.file_offset = last_dest_end;
ret = btrfs_punch_hole_range(inode, path,
last_dest_end, destoff + len - 1,
- &clone_info, &trans);
+ NULL, &trans);
if (ret)
goto out;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 93aeb2e539a4..d4282e12f2a6 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3232,12 +3232,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
btrfs_warn(fs_info,
- "qgroup rescan init failed, qgroup is not enabled");
+ "qgroup rescan init failed, qgroup rescan is not queued");
ret = -EINVAL;
} else if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
btrfs_warn(fs_info,
- "qgroup rescan init failed, qgroup rescan is not queued");
+ "qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d897a8e5e430..c58245797f30 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4552,6 +4552,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
+ list_add_tail(&reloc_root->root_list, &reloc_roots);
goto out_free;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ae2db5eb1549..091e5bc8c7ea 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -7084,12 +7084,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
spin_unlock(&send_root->root_item_lock);
/*
- * This is done when we lookup the root, it should already be complete
- * by the time we get here.
- */
- WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
-
- /*
* Userspace tools do the checks and warn the user if it's
* not RO.
*/
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 1a846bf6e197..914eea5ba6a7 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -452,9 +452,9 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
root->fs_info->tree_root = root;
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
- if (!root->node) {
+ if (IS_ERR(root->node)) {
test_std_err(TEST_ALLOC_EXTENT_BUFFER);
- ret = -ENOMEM;
+ ret = PTR_ERR(root->node);
goto out;
}
btrfs_set_header_level(root->node, 0);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 09aaca1efd62..ac035a6fa003 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -484,9 +484,9 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
* *cough*backref walking code*cough*
*/
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
- if (!root->node) {
+ if (IS_ERR(root->node)) {
test_err("couldn't allocate dummy buffer");
- ret = -ENOMEM;
+ ret = PTR_ERR(root->node);
goto out;
}
btrfs_set_header_level(root->node, 0);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 493d4d9e0f79..97f3520b8d98 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -227,7 +227,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
*/
if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) {
file_extent_err(leaf, slot,
- "invalid item size, have %u expect [%lu, %u)",
+ "invalid item size, have %u expect [%zu, %u)",
item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START,
SZ_4K);
return -EUCLEAN;
@@ -332,7 +332,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
}
static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
- int slot)
+ int slot, struct btrfs_key *prev_key)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
u32 sectorsize = fs_info->sectorsize;
@@ -356,6 +356,20 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
btrfs_item_size_nr(leaf, slot), csumsize);
return -EUCLEAN;
}
+ if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) {
+ u64 prev_csum_end;
+ u32 prev_item_size;
+
+ prev_item_size = btrfs_item_size_nr(leaf, slot - 1);
+ prev_csum_end = (prev_item_size / csumsize) * sectorsize;
+ prev_csum_end += prev_key->offset;
+ if (prev_csum_end > key->offset) {
+ generic_err(leaf, slot - 1,
+"csum end range (%llu) goes beyond the start range (%llu) of the next csum item",
+ prev_csum_end, key->offset);
+ return -EUCLEAN;
+ }
+ }
return 0;
}
@@ -1355,7 +1369,7 @@ static int check_leaf_item(struct extent_buffer *leaf,
ret = check_extent_data_item(leaf, key, slot, prev_key);
break;
case BTRFS_EXTENT_CSUM_KEY:
- ret = check_csum_item(leaf, key, slot);
+ ret = check_csum_item(leaf, key, slot, prev_key);
break;
case BTRFS_DIR_ITEM_KEY:
case BTRFS_DIR_INDEX_KEY:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6f757361db53..d3f115909ff0 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -808,7 +808,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_del_csums(trans, fs_info,
+ ret = btrfs_del_csums(trans,
+ fs_info->csum_root,
sums->bytenr,
sums->len);
if (!ret)
@@ -3909,6 +3910,28 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
return 0;
}
+static int log_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_ordered_sum *sums)
+{
+ int ret;
+
+ /*
+ * Due to extent cloning, we might have logged a csum item that covers a
+ * subrange of a cloned extent, and later we can end up logging a csum
+ * item for a larger subrange of the same extent or the entire range.
+ * This would leave csum items in the log tree that cover the same range
+ * and break the searches for checksums in the log tree, resulting in
+ * some checksums missing in the fs/subvolume tree. So just delete (or
+ * trim and adjust) any existing csum items in the log for this range.
+ */
+ ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
+ if (ret)
+ return ret;
+
+ return btrfs_csum_file_blocks(trans, log_root, sums);
+}
+
static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *dst_path,
@@ -4054,7 +4077,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_csum_file_blocks(trans, log, sums);
+ ret = log_csums(trans, log, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4274,7 +4297,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_csum_file_blocks(trans, log_root, sums);
+ ret = log_csums(trans, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -6294,9 +6317,28 @@ again:
wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
+
+ /*
+ * We didn't find the subvol, likely because it was
+ * deleted. This is ok, simply skip this log and go to
+ * the next one.
+ *
+ * We need to exclude the root because we can't have
+ * other log replays overwriting this log as we'll read
+ * it back in a few more times. This will keep our
+ * block from being modified, and we'll just bail for
+ * each subsequent pass.
+ */
+ if (ret == -ENOENT)
+ ret = btrfs_pin_extent_for_log_replay(fs_info,
+ log->node->start,
+ log->node->len);
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
+
+ if (!ret)
+ goto next;
btrfs_handle_fs_error(fs_info, ret,
"Couldn't read target root for tree log recovery.");
goto error;
@@ -6328,7 +6370,6 @@ again:
&root->highest_objectid);
}
- key.offset = found_key.offset - 1;
wc.replay_dest->log_root = NULL;
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
@@ -6336,9 +6377,10 @@ again:
if (ret)
goto error;
-
+next:
if (found_key.offset == 0)
break;
+ key.offset = found_key.offset - 1;
}
btrfs_release_path(path);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 91caab63bdf5..76b84f2397b1 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -324,6 +324,8 @@ again_search_slot:
}
if (ret < 0 && ret != -ENOENT)
goto out;
+ key.offset++;
+ goto again_search_slot;
}
item_size -= sizeof(subid_le);
offset += sizeof(subid_le);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d8e5560db285..a6d3f08bfff3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -61,7 +61,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID1C3] = {
.sub_stripes = 1,
.dev_stripes = 1,
- .devs_max = 0,
+ .devs_max = 3,
.devs_min = 3,
.tolerated_failures = 2,
.devs_increment = 3,
@@ -73,7 +73,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID1C4] = {
.sub_stripes = 1,
.dev_stripes = 1,
- .devs_max = 0,
+ .devs_max = 4,
.devs_min = 4,
.tolerated_failures = 3,
.devs_increment = 4,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ce9bac756c2a..40705e862451 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1693,6 +1693,7 @@ struct cifs_fattr {
struct timespec64 cf_atime;
struct timespec64 cf_mtime;
struct timespec64 cf_ctime;
+ u32 cf_cifstag;
};
static inline void free_dfs_info_param(struct dfs_info3_param *param)
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 3925a7bfc74d..d17587c2c4ab 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -139,6 +139,28 @@ retry:
dput(dentry);
}
+static bool reparse_file_needs_reval(const struct cifs_fattr *fattr)
+{
+ if (!(fattr->cf_cifsattrs & ATTR_REPARSE))
+ return false;
+ /*
+ * The DFS tags should be only intepreted by server side as per
+ * MS-FSCC 2.1.2.1, but let's include them anyway.
+ *
+ * Besides, if cf_cifstag is unset (0), then we still need it to be
+ * revalidated to know exactly what reparse point it is.
+ */
+ switch (fattr->cf_cifstag) {
+ case IO_REPARSE_TAG_DFS:
+ case IO_REPARSE_TAG_DFSR:
+ case IO_REPARSE_TAG_SYMLINK:
+ case IO_REPARSE_TAG_NFS:
+ case 0:
+ return true;
+ }
+ return false;
+}
+
static void
cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
{
@@ -158,7 +180,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
* is a symbolic link, DFS referral or a reparse point with a direct
* access like junctions, deduplicated files, NFS symlinks.
*/
- if (fattr->cf_cifsattrs & ATTR_REPARSE)
+ if (reparse_file_needs_reval(fattr))
fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
/* non-unix readdir doesn't provide nlink */
@@ -194,19 +216,37 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
}
}
+static void __dir_info_to_fattr(struct cifs_fattr *fattr, const void *info)
+{
+ const FILE_DIRECTORY_INFO *fi = info;
+
+ memset(fattr, 0, sizeof(*fattr));
+ fattr->cf_cifsattrs = le32_to_cpu(fi->ExtFileAttributes);
+ fattr->cf_eof = le64_to_cpu(fi->EndOfFile);
+ fattr->cf_bytes = le64_to_cpu(fi->AllocationSize);
+ fattr->cf_createtime = le64_to_cpu(fi->CreationTime);
+ fattr->cf_atime = cifs_NTtimeToUnix(fi->LastAccessTime);
+ fattr->cf_ctime = cifs_NTtimeToUnix(fi->ChangeTime);
+ fattr->cf_mtime = cifs_NTtimeToUnix(fi->LastWriteTime);
+}
+
void
cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
struct cifs_sb_info *cifs_sb)
{
- memset(fattr, 0, sizeof(*fattr));
- fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes);
- fattr->cf_eof = le64_to_cpu(info->EndOfFile);
- fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
- fattr->cf_createtime = le64_to_cpu(info->CreationTime);
- fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
- fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
- fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
+ __dir_info_to_fattr(fattr, info);
+ cifs_fill_common_info(fattr, cifs_sb);
+}
+static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr,
+ SEARCH_ID_FULL_DIR_INFO *info,
+ struct cifs_sb_info *cifs_sb)
+{
+ __dir_info_to_fattr(fattr, info);
+
+ /* See MS-FSCC 2.4.18 FileIdFullDirectoryInformation */
+ if (fattr->cf_cifsattrs & ATTR_REPARSE)
+ fattr->cf_cifstag = le32_to_cpu(info->EaSize);
cifs_fill_common_info(fattr, cifs_sb);
}
@@ -755,6 +795,11 @@ static int cifs_filldir(char *find_entry, struct file *file,
(FIND_FILE_STANDARD_INFO *)find_entry,
cifs_sb);
break;
+ case SMB_FIND_FILE_ID_FULL_DIR_INFO:
+ cifs_fulldir_info_to_fattr(&fattr,
+ (SEARCH_ID_FULL_DIR_INFO *)find_entry,
+ cifs_sb);
+ break;
default:
cifs_dir_info_to_fattr(&fattr,
(FILE_DIRECTORY_INFO *)find_entry,
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 8b0b512c5792..afe1f03aabe3 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -67,7 +67,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
goto out;
- if (oparms->tcon->use_resilient) {
+ if (oparms->tcon->use_resilient) {
/* default timeout is 0, servers pick default (120 seconds) */
nr_ioctl_req.Timeout =
cpu_to_le32(oparms->tcon->handle_timeout);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0ec4f270139f..00b4d15bb811 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -39,6 +39,8 @@
#include <linux/atomic.h>
#include <linux/prefetch.h>
+#include "internal.h"
+
/*
* How many user pages to map in one call to get_user_pages(). This determines
* the size of a structure in the slab cache
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d31b6c72b476..dc1a1d5d825b 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -35,11 +35,11 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
- cond_resched();
invalidate_mapping_pages(inode->i_mapping, 0, -1);
iput(toput_inode);
toput_inode = inode;
+ cond_resched();
spin_lock(&sb->s_inode_list_lock);
}
spin_unlock(&sb->s_inode_list_lock);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index d4d4fdfac1a6..1ee04e76bbe0 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -133,10 +133,13 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
{
struct rb_node *node;
struct ext4_system_zone *entry;
+ struct ext4_system_blocks *system_blks;
int first = 1;
printk(KERN_INFO "System zones: ");
- node = rb_first(&sbi->system_blks->root);
+ rcu_read_lock();
+ system_blks = rcu_dereference(sbi->system_blks);
+ node = rb_first(&system_blks->root);
while (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ",
@@ -144,6 +147,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
first = 0;
node = rb_next(node);
}
+ rcu_read_unlock();
printk(KERN_CONT "\n");
}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9fdd2b269d61..9f00fc0bf21d 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -72,6 +72,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
const char *error_msg = NULL;
const int rlen = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
+ const int next_offset = ((char *) de - buf) + rlen;
if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
error_msg = "rec_len is smaller than minimal";
@@ -79,8 +80,11 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
error_msg = "rec_len % 4 != 0";
else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (unlikely(((char *) de - buf) + rlen > size))
+ else if (unlikely(next_offset > size))
error_msg = "directory entry overrun";
+ else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) &&
+ next_offset != size))
+ error_msg = "directory entry too close to block end";
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index dc333e8e51e8..8ca4a23129aa 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -921,8 +921,8 @@ repeat_in_this_group:
if (!handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
- handle_type, nblocks,
- 0, 0);
+ handle_type, nblocks, 0,
+ ext4_trans_default_revoke_credits(sb));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 28f28de0c1b6..629a25d999f0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5692,7 +5692,7 @@ int ext4_expand_extra_isize(struct inode *inode,
error = ext4_journal_get_write_access(handle, iloc->bh);
if (error) {
brelse(iloc->bh);
- goto out_stop;
+ goto out_unlock;
}
error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc,
@@ -5702,8 +5702,8 @@ int ext4_expand_extra_isize(struct inode *inode,
if (!error)
error = rc;
+out_unlock:
ext4_write_unlock_xattr(inode, &no_expand);
-out_stop:
ext4_journal_stop(handle);
return error;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a856997d87b5..1cb42d940784 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2164,7 +2164,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
struct buffer_head *bh = NULL;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
+#ifdef CONFIG_UNICODE
struct ext4_sb_info *sbi;
+#endif
struct ext4_filename fname;
int retval;
int dx_fallback=0;
@@ -2176,12 +2178,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
- sbi = EXT4_SB(sb);
blocksize = sb->s_blocksize;
if (!dentry->d_name.len)
return -EINVAL;
#ifdef CONFIG_UNICODE
+ sbi = EXT4_SB(sb);
if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name))
return -EINVAL;
@@ -2822,7 +2824,7 @@ bool ext4_empty_dir(struct inode *inode)
{
unsigned int offset;
struct buffer_head *bh;
- struct ext4_dir_entry_2 *de, *de1;
+ struct ext4_dir_entry_2 *de;
struct super_block *sb;
if (ext4_has_inline_data(inode)) {
@@ -2847,19 +2849,25 @@ bool ext4_empty_dir(struct inode *inode)
return true;
de = (struct ext4_dir_entry_2 *) bh->b_data;
- de1 = ext4_next_entry(de, sb->s_blocksize);
- if (le32_to_cpu(de->inode) != inode->i_ino ||
- le32_to_cpu(de1->inode) == 0 ||
- strcmp(".", de->name) || strcmp("..", de1->name)) {
- ext4_warning_inode(inode, "directory missing '.' and/or '..'");
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
+ 0) ||
+ le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
+ ext4_warning_inode(inode, "directory missing '.'");
+ brelse(bh);
+ return true;
+ }
+ offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+ de = ext4_next_entry(de, sb->s_blocksize);
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
+ offset) ||
+ le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
return true;
}
- offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
- ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
- de = ext4_next_entry(de1, sb->s_blocksize);
+ offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
while (offset < inode->i_size) {
- if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ if (!(offset & (sb->s_blocksize - 1))) {
unsigned int lblock;
brelse(bh);
lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
@@ -2870,12 +2878,11 @@ bool ext4_empty_dir(struct inode *inode)
}
if (IS_ERR(bh))
return true;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
}
+ de = (struct ext4_dir_entry_2 *) (bh->b_data +
+ (offset & (sb->s_blocksize - 1)));
if (ext4_check_dir_entry(inode, NULL, de, bh,
bh->b_data, bh->b_size, offset)) {
- de = (struct ext4_dir_entry_2 *)(bh->b_data +
- sb->s_blocksize);
offset = (offset | (sb->s_blocksize - 1)) + 1;
continue;
}
@@ -2884,7 +2891,6 @@ bool ext4_empty_dir(struct inode *inode)
return false;
}
offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
- de = ext4_next_entry(de, sb->s_blocksize);
}
brelse(bh);
return true;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1d82b56d9b11..2937a8873fe1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1900,6 +1900,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
sbi->s_commit_interval = HZ * arg;
} else if (token == Opt_debug_want_extra_isize) {
+ if ((arg & 1) ||
+ (arg < 4) ||
+ (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid want_extra_isize %d", arg);
+ return -1;
+ }
sbi->s_want_extra_isize = arg;
} else if (token == Opt_max_batch_time) {
sbi->s_max_batch_time = arg;
@@ -3554,40 +3561,6 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0;
}
-static void ext4_clamp_want_extra_isize(struct super_block *sb)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
- unsigned def_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
-
- if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) {
- sbi->s_want_extra_isize = 0;
- return;
- }
- if (sbi->s_want_extra_isize < 4) {
- sbi->s_want_extra_isize = def_extra_isize;
- if (ext4_has_feature_extra_isize(sb)) {
- if (sbi->s_want_extra_isize <
- le16_to_cpu(es->s_want_extra_isize))
- sbi->s_want_extra_isize =
- le16_to_cpu(es->s_want_extra_isize);
- if (sbi->s_want_extra_isize <
- le16_to_cpu(es->s_min_extra_isize))
- sbi->s_want_extra_isize =
- le16_to_cpu(es->s_min_extra_isize);
- }
- }
- /* Check if enough inode space is available */
- if ((sbi->s_want_extra_isize > sbi->s_inode_size) ||
- (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
- sbi->s_inode_size)) {
- sbi->s_want_extra_isize = def_extra_isize;
- ext4_msg(sb, KERN_INFO,
- "required extra inode space not available");
- }
-}
-
static void ext4_set_resv_clusters(struct super_block *sb)
{
ext4_fsblk_t resv_clusters;
@@ -3795,6 +3768,68 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
*/
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
+ sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
+ sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
+ } else {
+ sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+ sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+ if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
+ ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
+ sbi->s_first_ino);
+ goto failed_mount;
+ }
+ if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
+ (!is_power_of_2(sbi->s_inode_size)) ||
+ (sbi->s_inode_size > blocksize)) {
+ ext4_msg(sb, KERN_ERR,
+ "unsupported inode size: %d",
+ sbi->s_inode_size);
+ goto failed_mount;
+ }
+ /*
+ * i_atime_extra is the last extra field available for
+ * [acm]times in struct ext4_inode. Checking for that
+ * field should suffice to ensure we have extra space
+ * for all three.
+ */
+ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
+ sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
+ sb->s_time_gran = 1;
+ sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
+ } else {
+ sb->s_time_gran = NSEC_PER_SEC;
+ sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
+ }
+ sb->s_time_min = EXT4_TIMESTAMP_MIN;
+ }
+ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
+ sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+ EXT4_GOOD_OLD_INODE_SIZE;
+ if (ext4_has_feature_extra_isize(sb)) {
+ unsigned v, max = (sbi->s_inode_size -
+ EXT4_GOOD_OLD_INODE_SIZE);
+
+ v = le16_to_cpu(es->s_want_extra_isize);
+ if (v > max) {
+ ext4_msg(sb, KERN_ERR,
+ "bad s_want_extra_isize: %d", v);
+ goto failed_mount;
+ }
+ if (sbi->s_want_extra_isize < v)
+ sbi->s_want_extra_isize = v;
+
+ v = le16_to_cpu(es->s_min_extra_isize);
+ if (v > max) {
+ ext4_msg(sb, KERN_ERR,
+ "bad s_min_extra_isize: %d", v);
+ goto failed_mount;
+ }
+ if (sbi->s_want_extra_isize < v)
+ sbi->s_want_extra_isize = v;
+ }
+ }
+
if (sbi->s_es->s_mount_opts[0]) {
char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
sizeof(sbi->s_es->s_mount_opts),
@@ -4033,42 +4068,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
- if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
- sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
- sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
- } else {
- sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
- sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
- if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
- ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
- sbi->s_first_ino);
- goto failed_mount;
- }
- if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
- (!is_power_of_2(sbi->s_inode_size)) ||
- (sbi->s_inode_size > blocksize)) {
- ext4_msg(sb, KERN_ERR,
- "unsupported inode size: %d",
- sbi->s_inode_size);
- goto failed_mount;
- }
- /*
- * i_atime_extra is the last extra field available for [acm]times in
- * struct ext4_inode. Checking for that field should suffice to ensure
- * we have extra space for all three.
- */
- if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
- sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
- sb->s_time_gran = 1;
- sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
- } else {
- sb->s_time_gran = NSEC_PER_SEC;
- sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
- }
-
- sb->s_time_min = EXT4_TIMESTAMP_MIN;
- }
-
sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
if (ext4_has_feature_64bit(sb)) {
if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
@@ -4517,8 +4516,6 @@ no_journal:
} else if (ret)
goto failed_mount4a;
- ext4_clamp_want_extra_isize(sb);
-
ext4_set_resv_clusters(sb);
err = ext4_setup_system_zone(sb);
@@ -5306,8 +5303,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- ext4_clamp_want_extra_isize(sb);
-
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "changing journal_checksum "
diff --git a/fs/file.c b/fs/file.c
index 2f4fcf985079..3da91a112bab 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -960,7 +960,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
return ksys_dup3(oldfd, newfd, 0);
}
-SYSCALL_DEFINE1(dup, unsigned int, fildes)
+int ksys_dup(unsigned int fildes)
{
int ret = -EBADF;
struct file *file = fget_raw(fildes);
@@ -975,6 +975,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
return ret;
}
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
+{
+ return ksys_dup(fildes);
+}
+
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
int err;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d5c2a3158610..a66e425884d1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1498,8 +1498,10 @@ static int __init init_hugetlbfs_fs(void)
/* other hstates are optional */
i = 0;
for_each_hstate(h) {
- if (i == default_hstate_idx)
+ if (i == default_hstate_idx) {
+ i++;
continue;
+ }
mnt = mount_one_hugetlbfs(h);
if (IS_ERR(mnt))
diff --git a/fs/inode.c b/fs/inode.c
index fef457a42882..96d62d97694e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -676,6 +676,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
struct inode *inode, *next;
LIST_HEAD(dispose);
+again:
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
@@ -698,6 +699,12 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
+ if (need_resched()) {
+ spin_unlock(&sb->s_inode_list_lock);
+ cond_resched();
+ dispose_list(&dispose);
+ goto again;
+ }
}
spin_unlock(&sb->s_inode_list_lock);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 90c4978781fb..541c8a3e0bbb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -92,7 +92,6 @@ struct io_wqe {
struct io_wqe_acct acct[2];
struct hlist_nulls_head free_list;
- struct hlist_nulls_head busy_list;
struct list_head all_list;
struct io_wq *wq;
@@ -327,7 +326,6 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
if (worker->flags & IO_WORKER_F_FREE) {
worker->flags &= ~IO_WORKER_F_FREE;
hlist_nulls_del_init_rcu(&worker->nulls_node);
- hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list);
}
/*
@@ -365,7 +363,6 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
{
if (!(worker->flags & IO_WORKER_F_FREE)) {
worker->flags |= IO_WORKER_F_FREE;
- hlist_nulls_del_init_rcu(&worker->nulls_node);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
}
@@ -432,6 +429,8 @@ next:
if (signal_pending(current))
flush_signals(current);
+ cond_resched();
+
spin_lock_irq(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
@@ -798,10 +797,6 @@ void io_wq_cancel_all(struct io_wq *wq)
set_bit(IO_WQ_BIT_CANCEL, &wq->state);
- /*
- * Browse both lists, as there's a gap between handing work off
- * to a worker and the worker putting itself on the busy_list
- */
rcu_read_lock();
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
@@ -948,7 +943,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
/*
* Now check if a free (going busy) or busy worker has the work
* currently running. If we find it there, we'll return CANCEL_RUNNING
- * as an indication that we attempte to signal cancellation. The
+ * as an indication that we attempt to signal cancellation. The
* completion will run normally in this case.
*/
rcu_read_lock();
@@ -1049,7 +1044,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
spin_lock_init(&wqe->lock);
INIT_WQ_LIST(&wqe->work_list);
INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
- INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1);
INIT_LIST_HEAD(&wqe->all_list);
}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index fb993b2bd0ef..3f5e356de980 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -120,6 +120,10 @@ static inline void io_wq_worker_sleeping(struct task_struct *tsk)
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
-#endif /* CONFIG_IO_WQ */
+#endif
-#endif /* INTERNAL_IO_WQ_H */
+static inline bool io_wq_current_is_worker(void)
+{
+ return in_task() && (current->flags & PF_IO_WORKER);
+}
+#endif
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9b1833fedc5c..38b54051facd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -289,7 +289,10 @@ struct io_ring_ctx {
*/
struct io_poll_iocb {
struct file *file;
- struct wait_queue_head *head;
+ union {
+ struct wait_queue_head *head;
+ u64 addr;
+ };
__poll_t events;
bool done;
bool canceled;
@@ -304,6 +307,51 @@ struct io_timeout_data {
u32 seq_offset;
};
+struct io_accept {
+ struct file *file;
+ struct sockaddr __user *addr;
+ int __user *addr_len;
+ int flags;
+};
+
+struct io_sync {
+ struct file *file;
+ loff_t len;
+ loff_t off;
+ int flags;
+};
+
+struct io_cancel {
+ struct file *file;
+ u64 addr;
+};
+
+struct io_timeout {
+ struct file *file;
+ u64 addr;
+ int flags;
+ unsigned count;
+};
+
+struct io_rw {
+ /* NOTE: kiocb has the file as the first member, so don't do it here */
+ struct kiocb kiocb;
+ u64 addr;
+ u64 len;
+};
+
+struct io_connect {
+ struct file *file;
+ struct sockaddr __user *addr;
+ int addr_len;
+};
+
+struct io_sr_msg {
+ struct file *file;
+ struct user_msghdr __user *msg;
+ int msg_flags;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -323,7 +371,6 @@ struct io_async_rw {
};
struct io_async_ctx {
- struct io_uring_sqe sqe;
union {
struct io_async_rw rw;
struct io_async_msghdr msg;
@@ -341,17 +388,23 @@ struct io_async_ctx {
struct io_kiocb {
union {
struct file *file;
- struct kiocb rw;
+ struct io_rw rw;
struct io_poll_iocb poll;
+ struct io_accept accept;
+ struct io_sync sync;
+ struct io_cancel cancel;
+ struct io_timeout timeout;
+ struct io_connect connect;
+ struct io_sr_msg sr_msg;
};
- const struct io_uring_sqe *sqe;
struct io_async_ctx *io;
struct file *ring_file;
int ring_fd;
bool has_user;
bool in_async;
bool needs_fixed_file;
+ u8 opcode;
struct io_ring_ctx *ctx;
union {
@@ -564,12 +617,10 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
+static inline bool io_req_needs_user(struct io_kiocb *req)
{
- u8 opcode = READ_ONCE(sqe->opcode);
-
- return !(opcode == IORING_OP_READ_FIXED ||
- opcode == IORING_OP_WRITE_FIXED);
+ return !(req->opcode == IORING_OP_READ_FIXED ||
+ req->opcode == IORING_OP_WRITE_FIXED);
}
static inline bool io_prep_async_work(struct io_kiocb *req,
@@ -577,33 +628,31 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
{
bool do_hashed = false;
- if (req->sqe) {
- switch (req->sqe->opcode) {
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- /* only regular files should be hashed for writes */
- if (req->flags & REQ_F_ISREG)
- do_hashed = true;
- /* fall-through */
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_SENDMSG:
- case IORING_OP_RECVMSG:
- case IORING_OP_ACCEPT:
- case IORING_OP_POLL_ADD:
- case IORING_OP_CONNECT:
- /*
- * We know REQ_F_ISREG is not set on some of these
- * opcodes, but this enables us to keep the check in
- * just one place.
- */
- if (!(req->flags & REQ_F_ISREG))
- req->work.flags |= IO_WQ_WORK_UNBOUND;
- break;
- }
- if (io_sqe_needs_user(req->sqe))
- req->work.flags |= IO_WQ_WORK_NEEDS_USER;
+ switch (req->opcode) {
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ /* only regular files should be hashed for writes */
+ if (req->flags & REQ_F_ISREG)
+ do_hashed = true;
+ /* fall-through */
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_SENDMSG:
+ case IORING_OP_RECVMSG:
+ case IORING_OP_ACCEPT:
+ case IORING_OP_POLL_ADD:
+ case IORING_OP_CONNECT:
+ /*
+ * We know REQ_F_ISREG is not set on some of these
+ * opcodes, but this enables us to keep the check in
+ * just one place.
+ */
+ if (!(req->flags & REQ_F_ISREG))
+ req->work.flags |= IO_WQ_WORK_UNBOUND;
+ break;
}
+ if (io_req_needs_user(req))
+ req->work.flags |= IO_WQ_WORK_NEEDS_USER;
*link = io_prep_linked_timeout(req);
return do_hashed;
@@ -972,7 +1021,7 @@ static void io_fail_links(struct io_kiocb *req)
trace_io_uring_fail_link(req, link);
if ((req->flags & REQ_F_LINK_TIMEOUT) &&
- link->sqe->opcode == IORING_OP_LINK_TIMEOUT) {
+ link->opcode == IORING_OP_LINK_TIMEOUT) {
io_link_cancel_timeout(link);
} else {
io_cqring_fill_event(link, -ECANCELED);
@@ -1148,7 +1197,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
ret = 0;
list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
- struct kiocb *kiocb = &req->rw;
+ struct kiocb *kiocb = &req->rw.kiocb;
/*
* Move completed entries to our local list. If we find a
@@ -1178,7 +1227,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
/*
- * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
+ * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
* non-spinning poll check - we'll still enter the driver poll loop, but only
* as a non-spinning completion check.
*/
@@ -1303,7 +1352,7 @@ static inline void req_set_fail_links(struct io_kiocb *req)
static void io_complete_rw_common(struct kiocb *kiocb, long res)
{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
@@ -1315,7 +1364,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
io_complete_rw_common(kiocb, res);
io_put_req(req);
@@ -1323,7 +1372,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
struct io_kiocb *nxt = NULL;
io_complete_rw_common(kiocb, res);
@@ -1334,7 +1383,7 @@ static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
@@ -1368,7 +1417,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
list);
- if (list_req->rw.ki_filp != req->rw.ki_filp)
+ if (list_req->file != req->file)
ctx->poll_multi_file = true;
}
@@ -1439,11 +1488,11 @@ static bool io_file_supports_async(struct file *file)
return false;
}
-static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ bool force_nonblock)
{
- const struct io_uring_sqe *sqe = req->sqe;
struct io_ring_ctx *ctx = req->ctx;
- struct kiocb *kiocb = &req->rw;
+ struct kiocb *kiocb = &req->rw.kiocb;
unsigned ioprio;
int ret;
@@ -1492,6 +1541,12 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
return -EINVAL;
kiocb->ki_complete = io_complete_rw;
}
+
+ req->rw.addr = READ_ONCE(sqe->addr);
+ req->rw.len = READ_ONCE(sqe->len);
+ /* we own ->private, reuse it for the buffer index */
+ req->rw.kiocb.private = (void *) (unsigned long)
+ READ_ONCE(sqe->buf_index);
return 0;
}
@@ -1525,11 +1580,11 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
io_rw_done(kiocb, ret);
}
-static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
- const struct io_uring_sqe *sqe,
+static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
struct iov_iter *iter)
{
- size_t len = READ_ONCE(sqe->len);
+ struct io_ring_ctx *ctx = req->ctx;
+ size_t len = req->rw.len;
struct io_mapped_ubuf *imu;
unsigned index, buf_index;
size_t offset;
@@ -1539,13 +1594,13 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
if (unlikely(!ctx->user_bufs))
return -EFAULT;
- buf_index = READ_ONCE(sqe->buf_index);
+ buf_index = (unsigned long) req->rw.kiocb.private;
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
imu = &ctx->user_bufs[index];
- buf_addr = READ_ONCE(sqe->addr);
+ buf_addr = req->rw.addr;
/* overflow */
if (buf_addr + len < buf_addr)
@@ -1602,25 +1657,20 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter)
{
- const struct io_uring_sqe *sqe = req->sqe;
- void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
- size_t sqe_len = READ_ONCE(sqe->len);
+ void __user *buf = u64_to_user_ptr(req->rw.addr);
+ size_t sqe_len = req->rw.len;
u8 opcode;
- /*
- * We're reading ->opcode for the second time, but the first read
- * doesn't care whether it's _FIXED or not, so it doesn't matter
- * whether ->opcode changes concurrently. The first read does care
- * about whether it is a READ or a WRITE, so we don't trust this read
- * for that purpose and instead let the caller pass in the read/write
- * flag.
- */
- opcode = READ_ONCE(sqe->opcode);
+ opcode = req->opcode;
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
*iovec = NULL;
- return io_import_fixed(req->ctx, rw, sqe, iter);
+ return io_import_fixed(req, rw, iter);
}
+ /* buffer index only valid with fixed read/write */
+ if (req->rw.kiocb.private)
+ return -EINVAL;
+
if (req->io) {
struct io_async_rw *iorw = &req->io->rw;
@@ -1701,7 +1751,7 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return ret;
}
-static void io_req_map_io(struct io_kiocb *req, ssize_t io_size,
+static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
struct iovec *iovec, struct iovec *fast_iov,
struct iov_iter *iter)
{
@@ -1715,57 +1765,81 @@ static void io_req_map_io(struct io_kiocb *req, ssize_t io_size,
}
}
-static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size,
+static int io_alloc_async_ctx(struct io_kiocb *req)
+{
+ req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
+ return req->io == NULL;
+}
+
+static void io_rw_async(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ struct iovec *iov = NULL;
+
+ if (req->io->rw.iov != req->io->rw.fast_iov)
+ iov = req->io->rw.iov;
+ io_wq_submit_work(workptr);
+ kfree(iov);
+}
+
+static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
struct iovec *iovec, struct iovec *fast_iov,
struct iov_iter *iter)
{
- req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
- if (req->io) {
- io_req_map_io(req, io_size, iovec, fast_iov, iter);
- memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe));
- req->sqe = &req->io->sqe;
- return 0;
- }
+ if (!req->io && io_alloc_async_ctx(req))
+ return -ENOMEM;
- return -ENOMEM;
+ io_req_map_rw(req, io_size, iovec, fast_iov, iter);
+ req->work.func = io_rw_async;
+ return 0;
}
-static int io_read_prep(struct io_kiocb *req, struct iovec **iovec,
- struct iov_iter *iter, bool force_nonblock)
+static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ bool force_nonblock)
{
+ struct io_async_ctx *io;
+ struct iov_iter iter;
ssize_t ret;
- ret = io_prep_rw(req, force_nonblock);
+ ret = io_prep_rw(req, sqe, force_nonblock);
if (ret)
return ret;
if (unlikely(!(req->file->f_mode & FMODE_READ)))
return -EBADF;
- return io_import_iovec(READ, req, iovec, iter);
+ if (!req->io)
+ return 0;
+
+ io = req->io;
+ io->rw.iov = io->rw.fast_iov;
+ req->io = NULL;
+ ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
+ req->io = io;
+ if (ret < 0)
+ return ret;
+
+ io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+ return 0;
}
static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
- struct kiocb *kiocb = &req->rw;
+ struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter iter;
- struct file *file;
size_t iov_count;
ssize_t io_size, ret;
- if (!req->io) {
- ret = io_read_prep(req, &iovec, &iter, force_nonblock);
- if (ret < 0)
- return ret;
- } else {
- ret = io_import_iovec(READ, req, &iovec, &iter);
- if (ret < 0)
- return ret;
- }
+ ret = io_import_iovec(READ, req, &iovec, &iter);
+ if (ret < 0)
+ return ret;
+
+ /* Ensure we clear previously set non-block flag */
+ if (!force_nonblock)
+ req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
- file = req->file;
io_size = ret;
if (req->flags & REQ_F_LINK)
req->result = io_size;
@@ -1774,39 +1848,27 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
- if (force_nonblock && !io_file_supports_async(file)) {
+ if (force_nonblock && !io_file_supports_async(req->file)) {
req->flags |= REQ_F_MUST_PUNT;
goto copy_iov;
}
iov_count = iov_iter_count(&iter);
- ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
+ ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
if (!ret) {
ssize_t ret2;
- if (file->f_op->read_iter)
- ret2 = call_read_iter(file, kiocb, &iter);
+ if (req->file->f_op->read_iter)
+ ret2 = call_read_iter(req->file, kiocb, &iter);
else
- ret2 = loop_rw_iter(READ, file, kiocb, &iter);
+ ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
- /*
- * In case of a short read, punt to async. This can happen
- * if we have data partially cached. Alternatively we can
- * return the short read, in which case the application will
- * need to issue another SQE and wait for it. That SQE will
- * need async punt anyway, so it's more efficient to do it
- * here.
- */
- if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
- (req->flags & REQ_F_ISREG) &&
- ret2 > 0 && ret2 < io_size)
- ret2 = -EAGAIN;
/* Catch -EAGAIN return for forced non-blocking submission */
if (!force_nonblock || ret2 != -EAGAIN) {
kiocb_done(kiocb, ret2, nxt, req->in_async);
} else {
copy_iov:
- ret = io_setup_async_io(req, io_size, iovec,
+ ret = io_setup_async_rw(req, io_size, iovec,
inline_vecs, &iter);
if (ret)
goto out_free;
@@ -1814,46 +1876,57 @@ copy_iov:
}
}
out_free:
- kfree(iovec);
+ if (!io_wq_current_is_worker())
+ kfree(iovec);
return ret;
}
-static int io_write_prep(struct io_kiocb *req, struct iovec **iovec,
- struct iov_iter *iter, bool force_nonblock)
+static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ bool force_nonblock)
{
+ struct io_async_ctx *io;
+ struct iov_iter iter;
ssize_t ret;
- ret = io_prep_rw(req, force_nonblock);
+ ret = io_prep_rw(req, sqe, force_nonblock);
if (ret)
return ret;
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
- return io_import_iovec(WRITE, req, iovec, iter);
+ if (!req->io)
+ return 0;
+
+ io = req->io;
+ io->rw.iov = io->rw.fast_iov;
+ req->io = NULL;
+ ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
+ req->io = io;
+ if (ret < 0)
+ return ret;
+
+ io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+ return 0;
}
static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
- struct kiocb *kiocb = &req->rw;
+ struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter iter;
- struct file *file;
size_t iov_count;
ssize_t ret, io_size;
- if (!req->io) {
- ret = io_write_prep(req, &iovec, &iter, force_nonblock);
- if (ret < 0)
- return ret;
- } else {
- ret = io_import_iovec(WRITE, req, &iovec, &iter);
- if (ret < 0)
- return ret;
- }
+ ret = io_import_iovec(WRITE, req, &iovec, &iter);
+ if (ret < 0)
+ return ret;
+
+ /* Ensure we clear previously set non-block flag */
+ if (!force_nonblock)
+ req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
- file = kiocb->ki_filp;
io_size = ret;
if (req->flags & REQ_F_LINK)
req->result = io_size;
@@ -1873,7 +1946,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
goto copy_iov;
iov_count = iov_iter_count(&iter);
- ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
+ ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
if (!ret) {
ssize_t ret2;
@@ -1885,22 +1958,22 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
* we return to userspace.
*/
if (req->flags & REQ_F_ISREG) {
- __sb_start_write(file_inode(file)->i_sb,
+ __sb_start_write(file_inode(req->file)->i_sb,
SB_FREEZE_WRITE, true);
- __sb_writers_release(file_inode(file)->i_sb,
+ __sb_writers_release(file_inode(req->file)->i_sb,
SB_FREEZE_WRITE);
}
kiocb->ki_flags |= IOCB_WRITE;
- if (file->f_op->write_iter)
- ret2 = call_write_iter(file, kiocb, &iter);
+ if (req->file->f_op->write_iter)
+ ret2 = call_write_iter(req->file, kiocb, &iter);
else
- ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
+ ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
if (!force_nonblock || ret2 != -EAGAIN) {
kiocb_done(kiocb, ret2, nxt, req->in_async);
} else {
copy_iov:
- ret = io_setup_async_io(req, io_size, iovec,
+ ret = io_setup_async_rw(req, io_size, iovec,
inline_vecs, &iter);
if (ret)
goto out_free;
@@ -1908,7 +1981,8 @@ copy_iov:
}
}
out_free:
- kfree(iovec);
+ if (!io_wq_current_is_worker())
+ kfree(iovec);
return ret;
}
@@ -1939,45 +2013,70 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
return -EINVAL;
+ req->sync.flags = READ_ONCE(sqe->fsync_flags);
+ if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
+ return -EINVAL;
+
+ req->sync.off = READ_ONCE(sqe->off);
+ req->sync.len = READ_ONCE(sqe->len);
return 0;
}
-static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt, bool force_nonblock)
+static bool io_req_cancelled(struct io_kiocb *req)
{
- loff_t sqe_off = READ_ONCE(sqe->off);
- loff_t sqe_len = READ_ONCE(sqe->len);
- loff_t end = sqe_off + sqe_len;
- unsigned fsync_flags;
- int ret;
+ if (req->work.flags & IO_WQ_WORK_CANCEL) {
+ req_set_fail_links(req);
+ io_cqring_add_event(req, -ECANCELED);
+ io_put_req(req);
+ return true;
+ }
- fsync_flags = READ_ONCE(sqe->fsync_flags);
- if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
- return -EINVAL;
+ return false;
+}
- ret = io_prep_fsync(req, sqe);
- if (ret)
- return ret;
+static void io_fsync_finish(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ loff_t end = req->sync.off + req->sync.len;
+ struct io_kiocb *nxt = NULL;
+ int ret;
- /* fsync always requires a blocking context */
- if (force_nonblock)
- return -EAGAIN;
+ if (io_req_cancelled(req))
+ return;
- ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
+ ret = vfs_fsync_range(req->file, req->sync.off,
end > 0 ? end : LLONG_MAX,
- fsync_flags & IORING_FSYNC_DATASYNC);
-
+ req->sync.flags & IORING_FSYNC_DATASYNC);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req_find_next(req, &nxt);
+ if (nxt)
+ *workptr = &nxt->work;
+}
+
+static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+ struct io_wq_work *work, *old_work;
+
+ /* fsync always requires a blocking context */
+ if (force_nonblock) {
+ io_put_req(req);
+ req->work.func = io_fsync_finish;
+ return -EAGAIN;
+ }
+
+ work = old_work = &req->work;
+ io_fsync_finish(&work);
+ if (work && work != old_work)
+ *nxt = container_of(work, struct io_kiocb, work);
return 0;
}
static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- int ret = 0;
if (!req->file)
return -EBADF;
@@ -1987,60 +2086,88 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
return -EINVAL;
- return ret;
+ req->sync.off = READ_ONCE(sqe->off);
+ req->sync.len = READ_ONCE(sqe->len);
+ req->sync.flags = READ_ONCE(sqe->sync_range_flags);
+ return 0;
}
-static int io_sync_file_range(struct io_kiocb *req,
- const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt,
- bool force_nonblock)
+static void io_sync_file_range_finish(struct io_wq_work **workptr)
{
- loff_t sqe_off;
- loff_t sqe_len;
- unsigned flags;
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ struct io_kiocb *nxt = NULL;
int ret;
- ret = io_prep_sfr(req, sqe);
- if (ret)
- return ret;
+ if (io_req_cancelled(req))
+ return;
+
+ ret = sync_file_range(req->file, req->sync.off, req->sync.len,
+ req->sync.flags);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, &nxt);
+ if (nxt)
+ *workptr = &nxt->work;
+}
+
+static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+ struct io_wq_work *work, *old_work;
/* sync_file_range always requires a blocking context */
- if (force_nonblock)
+ if (force_nonblock) {
+ io_put_req(req);
+ req->work.func = io_sync_file_range_finish;
return -EAGAIN;
+ }
- sqe_off = READ_ONCE(sqe->off);
- sqe_len = READ_ONCE(sqe->len);
- flags = READ_ONCE(sqe->sync_range_flags);
+ work = old_work = &req->work;
+ io_sync_file_range_finish(&work);
+ if (work && work != old_work)
+ *nxt = container_of(work, struct io_kiocb, work);
+ return 0;
+}
- ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
+#if defined(CONFIG_NET)
+static void io_sendrecv_async(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ struct iovec *iov = NULL;
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
- return 0;
+ if (req->io->rw.iov != req->io->rw.fast_iov)
+ iov = req->io->msg.iov;
+ io_wq_submit_work(workptr);
+ kfree(iov);
}
+#endif
-static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
+static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
- const struct io_uring_sqe *sqe = req->sqe;
- struct user_msghdr __user *msg;
- unsigned flags;
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct io_async_ctx *io = req->io;
+
+ sr->msg_flags = READ_ONCE(sqe->msg_flags);
+ sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+
+ if (!io)
+ return 0;
- flags = READ_ONCE(sqe->msg_flags);
- msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
io->msg.iov = io->msg.fast_iov;
- return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov);
+ return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
+ &io->msg.iov);
#else
- return 0;
+ return -EOPNOTSUPP;
#endif
}
-static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt, bool force_nonblock)
+static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
{
#if defined(CONFIG_NET)
+ struct io_async_msghdr *kmsg = NULL;
struct socket *sock;
int ret;
@@ -2049,46 +2176,52 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
sock = sock_from_file(req->file, &ret);
if (sock) {
- struct io_async_ctx io, *copy;
+ struct io_async_ctx io;
struct sockaddr_storage addr;
- struct msghdr *kmsg;
unsigned flags;
- flags = READ_ONCE(sqe->msg_flags);
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
- flags |= MSG_DONTWAIT;
-
if (req->io) {
- kmsg = &req->io->msg.msg;
- kmsg->msg_name = &addr;
+ kmsg = &req->io->msg;
+ kmsg->msg.msg_name = &addr;
+ /* if iov is set, it's allocated already */
+ if (!kmsg->iov)
+ kmsg->iov = kmsg->fast_iov;
+ kmsg->msg.msg_iter.iov = kmsg->iov;
} else {
- kmsg = &io.msg.msg;
- kmsg->msg_name = &addr;
- ret = io_sendmsg_prep(req, &io);
+ struct io_sr_msg *sr = &req->sr_msg;
+
+ kmsg = &io.msg;
+ kmsg->msg.msg_name = &addr;
+
+ io.msg.iov = io.msg.fast_iov;
+ ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
+ sr->msg_flags, &io.msg.iov);
if (ret)
- goto out;
+ return ret;
}
- ret = __sys_sendmsg_sock(sock, kmsg, flags);
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
+
+ ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if (force_nonblock && ret == -EAGAIN) {
- copy = kmalloc(sizeof(*copy), GFP_KERNEL);
- if (!copy) {
- ret = -ENOMEM;
- goto out;
- }
- memcpy(&copy->msg, &io.msg, sizeof(copy->msg));
- req->io = copy;
- memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe));
- req->sqe = &req->io->sqe;
- return ret;
+ if (req->io)
+ return -EAGAIN;
+ if (io_alloc_async_ctx(req))
+ return -ENOMEM;
+ memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
+ req->work.func = io_sendrecv_async;
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
-out:
+ if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
+ kfree(kmsg->iov);
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
@@ -2099,27 +2232,32 @@ out:
#endif
}
-static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
+static int io_recvmsg_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
- const struct io_uring_sqe *sqe = req->sqe;
- struct user_msghdr __user *msg;
- unsigned flags;
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct io_async_ctx *io = req->io;
+
+ sr->msg_flags = READ_ONCE(sqe->msg_flags);
+ sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+
+ if (!io)
+ return 0;
- flags = READ_ONCE(sqe->msg_flags);
- msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
io->msg.iov = io->msg.fast_iov;
- return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr,
- &io->msg.iov);
+ return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
+ &io->msg.uaddr, &io->msg.iov);
#else
- return 0;
+ return -EOPNOTSUPP;
#endif
}
-static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt, bool force_nonblock)
+static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
{
#if defined(CONFIG_NET)
+ struct io_async_msghdr *kmsg = NULL;
struct socket *sock;
int ret;
@@ -2128,49 +2266,54 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
sock = sock_from_file(req->file, &ret);
if (sock) {
- struct user_msghdr __user *msg;
- struct io_async_ctx io, *copy;
+ struct io_async_ctx io;
struct sockaddr_storage addr;
- struct msghdr *kmsg;
unsigned flags;
- flags = READ_ONCE(sqe->msg_flags);
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
- flags |= MSG_DONTWAIT;
-
- msg = (struct user_msghdr __user *) (unsigned long)
- READ_ONCE(sqe->addr);
if (req->io) {
- kmsg = &req->io->msg.msg;
- kmsg->msg_name = &addr;
+ kmsg = &req->io->msg;
+ kmsg->msg.msg_name = &addr;
+ /* if iov is set, it's allocated already */
+ if (!kmsg->iov)
+ kmsg->iov = kmsg->fast_iov;
+ kmsg->msg.msg_iter.iov = kmsg->iov;
} else {
- kmsg = &io.msg.msg;
- kmsg->msg_name = &addr;
- ret = io_recvmsg_prep(req, &io);
+ struct io_sr_msg *sr = &req->sr_msg;
+
+ kmsg = &io.msg;
+ kmsg->msg.msg_name = &addr;
+
+ io.msg.iov = io.msg.fast_iov;
+ ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
+ sr->msg_flags, &io.msg.uaddr,
+ &io.msg.iov);
if (ret)
- goto out;
+ return ret;
}
- ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags);
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
+
+ ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
+ kmsg->uaddr, flags);
if (force_nonblock && ret == -EAGAIN) {
- copy = kmalloc(sizeof(*copy), GFP_KERNEL);
- if (!copy) {
- ret = -ENOMEM;
- goto out;
- }
- memcpy(copy, &io, sizeof(*copy));
- req->io = copy;
- memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe));
- req->sqe = &req->io->sqe;
- return ret;
+ if (req->io)
+ return -EAGAIN;
+ if (io_alloc_async_ctx(req))
+ return -ENOMEM;
+ memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
+ req->work.func = io_sendrecv_async;
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
-out:
+ if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
+ kfree(kmsg->iov);
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
@@ -2181,30 +2324,38 @@ out:
#endif
}
-static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt, bool force_nonblock)
+static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
- struct sockaddr __user *addr;
- int __user *addr_len;
- unsigned file_flags;
- int flags, ret;
+ struct io_accept *accept = &req->accept;
if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
return -EINVAL;
if (sqe->ioprio || sqe->len || sqe->buf_index)
return -EINVAL;
- addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
- addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2);
- flags = READ_ONCE(sqe->accept_flags);
- file_flags = force_nonblock ? O_NONBLOCK : 0;
+ accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ accept->flags = READ_ONCE(sqe->accept_flags);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
- ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags);
- if (ret == -EAGAIN && force_nonblock) {
- req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+#if defined(CONFIG_NET)
+static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+ struct io_accept *accept = &req->accept;
+ unsigned file_flags;
+ int ret;
+
+ file_flags = force_nonblock ? O_NONBLOCK : 0;
+ ret = __sys_accept4_file(req->file, file_flags, accept->addr,
+ accept->addr_len, accept->flags);
+ if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
- }
if (ret == -ERESTARTSYS)
ret = -EINTR;
if (ret < 0)
@@ -2212,63 +2363,95 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
-static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io)
+static void io_accept_finish(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ struct io_kiocb *nxt = NULL;
+
+ if (io_req_cancelled(req))
+ return;
+ __io_accept(req, &nxt, false);
+ if (nxt)
+ *workptr = &nxt->work;
+}
+#endif
+
+static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
{
#if defined(CONFIG_NET)
- const struct io_uring_sqe *sqe = req->sqe;
- struct sockaddr __user *addr;
- int addr_len;
+ int ret;
- addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
- addr_len = READ_ONCE(sqe->addr2);
- return move_addr_to_kernel(addr, addr_len, &io->connect.address);
-#else
+ ret = __io_accept(req, nxt, force_nonblock);
+ if (ret == -EAGAIN && force_nonblock) {
+ req->work.func = io_accept_finish;
+ req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+ io_put_req(req);
+ return -EAGAIN;
+ }
return 0;
+#else
+ return -EOPNOTSUPP;
#endif
}
-static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt, bool force_nonblock)
+static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
- struct io_async_ctx __io, *io;
- unsigned file_flags;
- int addr_len, ret;
+ struct io_connect *conn = &req->connect;
+ struct io_async_ctx *io = req->io;
if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
return -EINVAL;
if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
return -EINVAL;
- addr_len = READ_ONCE(sqe->addr2);
- file_flags = force_nonblock ? O_NONBLOCK : 0;
+ conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ conn->addr_len = READ_ONCE(sqe->addr2);
+
+ if (!io)
+ return 0;
+
+ return move_addr_to_kernel(conn->addr, conn->addr_len,
+ &io->connect.address);
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct io_async_ctx __io, *io;
+ unsigned file_flags;
+ int ret;
if (req->io) {
io = req->io;
} else {
- ret = io_connect_prep(req, &__io);
+ ret = move_addr_to_kernel(req->connect.addr,
+ req->connect.addr_len,
+ &__io.connect.address);
if (ret)
goto out;
io = &__io;
}
- ret = __sys_connect_file(req->file, &io->connect.address, addr_len,
- file_flags);
+ file_flags = force_nonblock ? O_NONBLOCK : 0;
+
+ ret = __sys_connect_file(req->file, &io->connect.address,
+ req->connect.addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- io = kmalloc(sizeof(*io), GFP_KERNEL);
- if (!io) {
+ if (req->io)
+ return -EAGAIN;
+ if (io_alloc_async_ctx(req)) {
ret = -ENOMEM;
goto out;
}
- memcpy(&io->connect, &__io.connect, sizeof(io->connect));
- req->io = io;
- memcpy(&io->sqe, req->sqe, sizeof(*req->sqe));
- req->sqe = &io->sqe;
+ memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
return -EAGAIN;
}
if (ret == -ERESTARTSYS)
@@ -2331,23 +2514,32 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
return -ENOENT;
}
+static int io_poll_remove_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+ sqe->poll_events)
+ return -EINVAL;
+
+ req->poll.addr = READ_ONCE(sqe->addr);
+ return 0;
+}
+
/*
* Find a running poll command that matches one specified in sqe->addr,
* and remove it if found.
*/
-static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_poll_remove(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ u64 addr;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
- sqe->poll_events)
- return -EINVAL;
-
+ addr = req->poll.addr;
spin_lock_irq(&ctx->completion_lock);
- ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr));
+ ret = io_poll_cancel(ctx, addr);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_add_event(req, ret);
@@ -2482,14 +2674,9 @@ static void io_poll_req_insert(struct io_kiocb *req)
hlist_add_head(&req->hash_node, list);
}
-static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt)
+static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_poll_iocb *poll = &req->poll;
- struct io_ring_ctx *ctx = req->ctx;
- struct io_poll_table ipt;
- bool cancel = false;
- __poll_t mask;
u16 events;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -2499,10 +2686,20 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (!poll->file)
return -EBADF;
- req->io = NULL;
- INIT_IO_WORK(&req->work, io_poll_complete_work);
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+ return 0;
+}
+
+static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
+{
+ struct io_poll_iocb *poll = &req->poll;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_poll_table ipt;
+ bool cancel = false;
+ __poll_t mask;
+
+ INIT_IO_WORK(&req->work, io_poll_complete_work);
INIT_HLIST_NODE(&req->hash_node);
poll->head = NULL;
@@ -2573,7 +2770,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
/*
* Adjust the reqs sequence before the current one because it
- * will consume a slot in the cq_ring and the the cq_tail
+ * will consume a slot in the cq_ring and the cq_tail
* pointer will be increased, otherwise other timeout reqs may
* return in advance without waiting for enough wait_nr.
*/
@@ -2619,26 +2816,32 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
return 0;
}
+static int io_timeout_remove_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
+ return -EINVAL;
+
+ req->timeout.addr = READ_ONCE(sqe->addr);
+ req->timeout.flags = READ_ONCE(sqe->timeout_flags);
+ if (req->timeout.flags)
+ return -EINVAL;
+
+ return 0;
+}
+
/*
* Remove or update an existing timeout command
*/
-static int io_timeout_remove(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_timeout_remove(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned flags;
int ret;
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
- return -EINVAL;
- flags = READ_ONCE(sqe->timeout_flags);
- if (flags)
- return -EINVAL;
-
spin_lock_irq(&ctx->completion_lock);
- ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr));
+ ret = io_timeout_cancel(ctx, req->timeout.addr);
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
@@ -2650,10 +2853,9 @@ static int io_timeout_remove(struct io_kiocb *req,
return 0;
}
-static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io,
+static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
bool is_timeout_link)
{
- const struct io_uring_sqe *sqe = req->sqe;
struct io_timeout_data *data;
unsigned flags;
@@ -2667,7 +2869,12 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io,
if (flags & ~IORING_TIMEOUT_ABS)
return -EINVAL;
- data = &io->timeout;
+ req->timeout.count = READ_ONCE(sqe->off);
+
+ if (!req->io && io_alloc_async_ctx(req))
+ return -ENOMEM;
+
+ data = &req->io->timeout;
data->req = req;
req->flags |= REQ_F_TIMEOUT;
@@ -2680,32 +2887,17 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io,
data->mode = HRTIMER_MODE_REL;
hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
- req->io = io;
return 0;
}
-static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_timeout(struct io_kiocb *req)
{
unsigned count;
struct io_ring_ctx *ctx = req->ctx;
struct io_timeout_data *data;
- struct io_async_ctx *io;
struct list_head *entry;
unsigned span = 0;
- io = req->io;
- if (!io) {
- int ret;
-
- io = kmalloc(sizeof(*io), GFP_KERNEL);
- if (!io)
- return -ENOMEM;
- ret = io_timeout_prep(req, io, false);
- if (ret) {
- kfree(io);
- return ret;
- }
- }
data = &req->io->timeout;
/*
@@ -2713,7 +2905,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
* timeout event to be satisfied. If it isn't set, then this is
* a pure timeout request, sequence isn't used.
*/
- count = READ_ONCE(sqe->off);
+ count = req->timeout.count;
if (!count) {
req->flags |= REQ_F_TIMEOUT_NOSEQ;
spin_lock_irq(&ctx->completion_lock);
@@ -2831,84 +3023,104 @@ done:
io_put_req_find_next(req, nxt);
}
-static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt)
+static int io_async_cancel_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
sqe->cancel_flags)
return -EINVAL;
- io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0);
+ req->cancel.addr = READ_ONCE(sqe->addr);
return 0;
}
-static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io)
+static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
- struct iov_iter iter;
- ssize_t ret;
+ struct io_ring_ctx *ctx = req->ctx;
- memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
- req->sqe = &io->sqe;
+ io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
+ return 0;
+}
- switch (io->sqe.opcode) {
+static int io_req_defer_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ ssize_t ret = 0;
+
+ switch (req->opcode) {
+ case IORING_OP_NOP:
+ break;
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
- ret = io_read_prep(req, &iovec, &iter, true);
+ ret = io_read_prep(req, sqe, true);
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
- ret = io_write_prep(req, &iovec, &iter, true);
+ ret = io_write_prep(req, sqe, true);
+ break;
+ case IORING_OP_POLL_ADD:
+ ret = io_poll_add_prep(req, sqe);
+ break;
+ case IORING_OP_POLL_REMOVE:
+ ret = io_poll_remove_prep(req, sqe);
+ break;
+ case IORING_OP_FSYNC:
+ ret = io_prep_fsync(req, sqe);
+ break;
+ case IORING_OP_SYNC_FILE_RANGE:
+ ret = io_prep_sfr(req, sqe);
break;
case IORING_OP_SENDMSG:
- ret = io_sendmsg_prep(req, io);
+ ret = io_sendmsg_prep(req, sqe);
break;
case IORING_OP_RECVMSG:
- ret = io_recvmsg_prep(req, io);
+ ret = io_recvmsg_prep(req, sqe);
break;
case IORING_OP_CONNECT:
- ret = io_connect_prep(req, io);
+ ret = io_connect_prep(req, sqe);
break;
case IORING_OP_TIMEOUT:
- return io_timeout_prep(req, io, false);
+ ret = io_timeout_prep(req, sqe, false);
+ break;
+ case IORING_OP_TIMEOUT_REMOVE:
+ ret = io_timeout_remove_prep(req, sqe);
+ break;
+ case IORING_OP_ASYNC_CANCEL:
+ ret = io_async_cancel_prep(req, sqe);
+ break;
case IORING_OP_LINK_TIMEOUT:
- return io_timeout_prep(req, io, true);
+ ret = io_timeout_prep(req, sqe, true);
+ break;
+ case IORING_OP_ACCEPT:
+ ret = io_accept_prep(req, sqe);
+ break;
default:
- req->io = io;
- return 0;
+ printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
+ req->opcode);
+ ret = -EINVAL;
+ break;
}
- if (ret < 0)
- return ret;
-
- req->io = io;
- io_req_map_io(req, ret, iovec, inline_vecs, &iter);
- return 0;
+ return ret;
}
-static int io_req_defer(struct io_kiocb *req)
+static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_async_ctx *io;
int ret;
/* Still need defer if there is pending req in defer list. */
if (!req_need_defer(req) && list_empty(&ctx->defer_list))
return 0;
- io = kmalloc(sizeof(*io), GFP_KERNEL);
- if (!io)
+ if (!req->io && io_alloc_async_ctx(req))
return -EAGAIN;
- ret = io_req_defer_prep(req, io);
- if (ret < 0) {
- kfree(io);
+ ret = io_req_defer_prep(req, sqe);
+ if (ret < 0)
return ret;
- }
spin_lock_irq(&ctx->completion_lock);
if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
@@ -2922,66 +3134,121 @@ static int io_req_defer(struct io_kiocb *req)
return -EIOCBQUEUED;
}
-__attribute__((nonnull))
-static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt, bool force_nonblock)
{
- int ret, opcode;
struct io_ring_ctx *ctx = req->ctx;
+ int ret;
- opcode = READ_ONCE(req->sqe->opcode);
- switch (opcode) {
+ switch (req->opcode) {
case IORING_OP_NOP:
ret = io_nop(req);
break;
case IORING_OP_READV:
- if (unlikely(req->sqe->buf_index))
- return -EINVAL;
- ret = io_read(req, nxt, force_nonblock);
- break;
- case IORING_OP_WRITEV:
- if (unlikely(req->sqe->buf_index))
- return -EINVAL;
- ret = io_write(req, nxt, force_nonblock);
- break;
case IORING_OP_READ_FIXED:
+ if (sqe) {
+ ret = io_read_prep(req, sqe, force_nonblock);
+ if (ret < 0)
+ break;
+ }
ret = io_read(req, nxt, force_nonblock);
break;
+ case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
+ if (sqe) {
+ ret = io_write_prep(req, sqe, force_nonblock);
+ if (ret < 0)
+ break;
+ }
ret = io_write(req, nxt, force_nonblock);
break;
case IORING_OP_FSYNC:
- ret = io_fsync(req, req->sqe, nxt, force_nonblock);
+ if (sqe) {
+ ret = io_prep_fsync(req, sqe);
+ if (ret < 0)
+ break;
+ }
+ ret = io_fsync(req, nxt, force_nonblock);
break;
case IORING_OP_POLL_ADD:
- ret = io_poll_add(req, req->sqe, nxt);
+ if (sqe) {
+ ret = io_poll_add_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_poll_add(req, nxt);
break;
case IORING_OP_POLL_REMOVE:
- ret = io_poll_remove(req, req->sqe);
+ if (sqe) {
+ ret = io_poll_remove_prep(req, sqe);
+ if (ret < 0)
+ break;
+ }
+ ret = io_poll_remove(req);
break;
case IORING_OP_SYNC_FILE_RANGE:
- ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock);
+ if (sqe) {
+ ret = io_prep_sfr(req, sqe);
+ if (ret < 0)
+ break;
+ }
+ ret = io_sync_file_range(req, nxt, force_nonblock);
break;
case IORING_OP_SENDMSG:
- ret = io_sendmsg(req, req->sqe, nxt, force_nonblock);
+ if (sqe) {
+ ret = io_sendmsg_prep(req, sqe);
+ if (ret < 0)
+ break;
+ }
+ ret = io_sendmsg(req, nxt, force_nonblock);
break;
case IORING_OP_RECVMSG:
- ret = io_recvmsg(req, req->sqe, nxt, force_nonblock);
+ if (sqe) {
+ ret = io_recvmsg_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_recvmsg(req, nxt, force_nonblock);
break;
case IORING_OP_TIMEOUT:
- ret = io_timeout(req, req->sqe);
+ if (sqe) {
+ ret = io_timeout_prep(req, sqe, false);
+ if (ret)
+ break;
+ }
+ ret = io_timeout(req);
break;
case IORING_OP_TIMEOUT_REMOVE:
- ret = io_timeout_remove(req, req->sqe);
+ if (sqe) {
+ ret = io_timeout_remove_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_timeout_remove(req);
break;
case IORING_OP_ACCEPT:
- ret = io_accept(req, req->sqe, nxt, force_nonblock);
+ if (sqe) {
+ ret = io_accept_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_accept(req, nxt, force_nonblock);
break;
case IORING_OP_CONNECT:
- ret = io_connect(req, req->sqe, nxt, force_nonblock);
+ if (sqe) {
+ ret = io_connect_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_connect(req, nxt, force_nonblock);
break;
case IORING_OP_ASYNC_CANCEL:
- ret = io_async_cancel(req, req->sqe, nxt);
+ if (sqe) {
+ ret = io_async_cancel_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_async_cancel(req, nxt);
break;
default:
ret = -EINVAL;
@@ -3017,9 +3284,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
struct io_kiocb *nxt = NULL;
int ret = 0;
- /* Ensure we clear previously set non-block flag */
- req->rw.ki_flags &= ~IOCB_NOWAIT;
-
if (work->flags & IO_WQ_WORK_CANCEL)
ret = -ECANCELED;
@@ -3027,7 +3291,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
req->in_async = true;
do {
- ret = io_issue_sqe(req, &nxt, false);
+ ret = io_issue_sqe(req, NULL, &nxt, false);
/*
* We can get EAGAIN for polled IO even though we're
* forcing a sync submission from here, since we can't
@@ -3067,11 +3331,9 @@ static bool io_req_op_valid(int op)
return op >= IORING_OP_NOP && op < IORING_OP_LAST;
}
-static int io_op_needs_file(const struct io_uring_sqe *sqe)
+static int io_req_needs_file(struct io_kiocb *req)
{
- int op = READ_ONCE(sqe->opcode);
-
- switch (op) {
+ switch (req->opcode) {
case IORING_OP_NOP:
case IORING_OP_POLL_REMOVE:
case IORING_OP_TIMEOUT:
@@ -3080,7 +3342,7 @@ static int io_op_needs_file(const struct io_uring_sqe *sqe)
case IORING_OP_LINK_TIMEOUT:
return 0;
default:
- if (io_req_op_valid(op))
+ if (io_req_op_valid(req->opcode))
return 1;
return -EINVAL;
}
@@ -3095,19 +3357,20 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
return table->files[index & IORING_FILE_TABLE_MASK];
}
-static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
+static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned flags;
int fd, ret;
- flags = READ_ONCE(req->sqe->flags);
- fd = READ_ONCE(req->sqe->fd);
+ flags = READ_ONCE(sqe->flags);
+ fd = READ_ONCE(sqe->fd);
if (flags & IOSQE_IO_DRAIN)
req->flags |= REQ_F_IO_DRAIN;
- ret = io_op_needs_file(req->sqe);
+ ret = io_req_needs_file(req);
if (ret <= 0)
return ret;
@@ -3227,14 +3490,14 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
link_list);
- if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT)
+ if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
return NULL;
req->flags |= REQ_F_LINK_TIMEOUT;
return nxt;
}
-static void __io_queue_sqe(struct io_kiocb *req)
+static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_kiocb *linked_timeout;
struct io_kiocb *nxt = NULL;
@@ -3243,7 +3506,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
again:
linked_timeout = io_prep_linked_timeout(req);
- ret = io_issue_sqe(req, &nxt, true);
+ ret = io_issue_sqe(req, sqe, &nxt, true);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -3290,7 +3553,7 @@ done_req:
}
}
-static void io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
int ret;
@@ -3300,7 +3563,7 @@ static void io_queue_sqe(struct io_kiocb *req)
}
req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
- ret = io_req_defer(req);
+ ret = io_req_defer(req, sqe);
if (ret) {
if (ret != -EIOCBQUEUED) {
io_cqring_add_event(req, ret);
@@ -3308,7 +3571,7 @@ static void io_queue_sqe(struct io_kiocb *req)
io_double_put_req(req);
}
} else
- __io_queue_sqe(req);
+ __io_queue_sqe(req, sqe);
}
static inline void io_queue_link_head(struct io_kiocb *req)
@@ -3317,27 +3580,25 @@ static inline void io_queue_link_head(struct io_kiocb *req)
io_cqring_add_event(req, -ECANCELED);
io_double_put_req(req);
} else
- io_queue_sqe(req);
+ io_queue_sqe(req, NULL);
}
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
IOSQE_IO_HARDLINK)
-static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
- struct io_kiocb **link)
+static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_submit_state *state, struct io_kiocb **link)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
- req->user_data = req->sqe->user_data;
-
/* enforce forwards compatibility on users */
- if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) {
+ if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) {
ret = -EINVAL;
goto err_req;
}
- ret = io_req_set_file(state, req);
+ ret = io_req_set_file(state, req, sqe);
if (unlikely(ret)) {
err_req:
io_cqring_add_event(req, ret);
@@ -3354,38 +3615,38 @@ err_req:
*/
if (*link) {
struct io_kiocb *prev = *link;
- struct io_async_ctx *io;
- if (req->sqe->flags & IOSQE_IO_DRAIN)
+ if (sqe->flags & IOSQE_IO_DRAIN)
(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
- if (req->sqe->flags & IOSQE_IO_HARDLINK)
+ if (sqe->flags & IOSQE_IO_HARDLINK)
req->flags |= REQ_F_HARDLINK;
- io = kmalloc(sizeof(*io), GFP_KERNEL);
- if (!io) {
+ if (io_alloc_async_ctx(req)) {
ret = -EAGAIN;
goto err_req;
}
- ret = io_req_defer_prep(req, io);
+ ret = io_req_defer_prep(req, sqe);
if (ret) {
- kfree(io);
/* fail even hard links since we don't submit */
prev->flags |= REQ_F_FAIL_LINK;
goto err_req;
}
trace_io_uring_link(ctx, req, prev);
list_add_tail(&req->link_list, &prev->link_list);
- } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
+ } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
req->flags |= REQ_F_LINK;
- if (req->sqe->flags & IOSQE_IO_HARDLINK)
+ if (sqe->flags & IOSQE_IO_HARDLINK)
req->flags |= REQ_F_HARDLINK;
INIT_LIST_HEAD(&req->link_list);
+ ret = io_req_defer_prep(req, sqe);
+ if (ret)
+ req->flags |= REQ_F_FAIL_LINK;
*link = req;
} else {
- io_queue_sqe(req);
+ io_queue_sqe(req, sqe);
}
return true;
@@ -3430,14 +3691,15 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
}
/*
- * Fetch an sqe, if one is available. Note that s->sqe will point to memory
+ * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
* that is mapped by userspace. This means that care needs to be taken to
* ensure that reads are stable, as we cannot rely on userspace always
* being a good citizen. If members of the sqe are validated and then later
* used, it's important that those reads are done through READ_ONCE() to
* prevent a re-load down the line.
*/
-static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req)
+static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe **sqe_ptr)
{
struct io_rings *rings = ctx->rings;
u32 *sq_array = ctx->sq_array;
@@ -3464,7 +3726,9 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req)
* link list.
*/
req->sequence = ctx->cached_sq_head;
- req->sqe = &ctx->sq_sqes[head];
+ *sqe_ptr = &ctx->sq_sqes[head];
+ req->opcode = READ_ONCE((*sqe_ptr)->opcode);
+ req->user_data = READ_ONCE((*sqe_ptr)->user_data);
ctx->cached_sq_head++;
return true;
}
@@ -3496,6 +3760,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
}
for (i = 0; i < nr; i++) {
+ const struct io_uring_sqe *sqe;
struct io_kiocb *req;
unsigned int sqe_flags;
@@ -3505,12 +3770,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
submitted = -EAGAIN;
break;
}
- if (!io_get_sqring(ctx, req)) {
+ if (!io_get_sqring(ctx, req, &sqe)) {
__io_free_req(req);
break;
}
- if (io_sqe_needs_user(req->sqe) && !*mm) {
+ if (io_req_needs_user(req) && !*mm) {
mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
if (!mm_fault) {
use_mm(ctx->sqo_mm);
@@ -3519,22 +3784,21 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
}
submitted++;
- sqe_flags = req->sqe->flags;
+ sqe_flags = sqe->flags;
req->ring_file = ring_file;
req->ring_fd = ring_fd;
req->has_user = *mm != NULL;
req->in_async = async;
req->needs_fixed_file = async;
- trace_io_uring_submit_sqe(ctx, req->sqe->user_data,
- true, async);
- if (!io_submit_sqe(req, statep, &link))
+ trace_io_uring_submit_sqe(ctx, req->user_data, true, async);
+ if (!io_submit_sqe(req, sqe, statep, &link))
break;
/*
* If previous wasn't linked and we have a linked command,
* that's the end of the chain. Submit the previous link.
*/
- if (!(sqe_flags & IOSQE_IO_LINK) && link) {
+ if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {
io_queue_link_head(link);
link = NULL;
}
@@ -3694,7 +3958,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
struct io_ring_ctx *ctx = iowq->ctx;
/*
- * Wake up if we have enough events, or if a timeout occured since we
+ * Wake up if we have enough events, or if a timeout occurred since we
* started waiting. For timeouts, we always want to return to userspace,
* regardless of event count.
*/
@@ -4446,7 +4710,7 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
return -EFAULT;
- dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
+ dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
dst->iov_len = ciov.iov_len;
return 0;
}
@@ -4884,6 +5148,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
&cur_mm, false);
mutex_unlock(&ctx->uring_lock);
+
+ if (submitted != to_submit)
+ goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {
unsigned nr_events = 0;
@@ -4897,6 +5164,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
}
}
+out:
percpu_ref_put(&ctx->refs);
out_fput:
fdput(f);
diff --git a/fs/locks.c b/fs/locks.c
index 6970f55daf54..44b6da032842 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2853,7 +2853,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
}
if (inode) {
/* userspace relies on this representation of dev_t */
- seq_printf(f, "%d %02x:%02x:%ld ", fl_pid,
+ seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
MAJOR(inode->i_sb->s_dev),
MINOR(inode->i_sb->s_dev), inode->i_ino);
} else {
diff --git a/fs/namespace.c b/fs/namespace.c
index be601d3a8008..5e1bf611a9eb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1728,7 +1728,7 @@ static bool is_mnt_ns_file(struct dentry *dentry)
dentry->d_fsdata == &mntns_operations;
}
-struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
+static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
return container_of(ns, struct mnt_namespace, ns);
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 3e77b728a22b..46f225580009 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -57,6 +57,9 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
* doing an __iget/iput with SB_ACTIVE clear would actually
* evict all inodes with zero i_count from icache which is
* unnecessarily violent and may in fact be illegal to do.
+ * However, we should have been called /after/ evict_inodes
+ * removed all zero refcount inodes, in any case. Test to
+ * be sure.
*/
if (!atomic_read(&inode->i_count)) {
spin_unlock(&inode->i_lock);
@@ -77,6 +80,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
iput_inode = inode;
+ cond_resched();
spin_lock(&sb->s_inode_list_lock);
}
spin_unlock(&sb->s_inode_list_lock);
diff --git a/fs/nsfs.c b/fs/nsfs.c
index a0431642c6b5..f75767bd623a 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -3,6 +3,7 @@
#include <linux/pseudo_fs.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/ktime.h>
@@ -11,6 +12,8 @@
#include <linux/nsfs.h>
#include <linux/uaccess.h>
+#include "internal.h"
+
static struct vfsmount *nsfs_mnt;
static long ns_ioctl(struct file *filp, unsigned int ioctl,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c4c51f3df60..cda1027d0819 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3282,6 +3282,7 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb)
debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root,
&dlm_debug->d_filter_secs);
+ ocfs2_get_dlm_debug(dlm_debug);
}
static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 1afe57f425a0..68ba354cf361 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1066,6 +1066,14 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
+ if (replayed) {
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/pipe.c b/fs/pipe.c
index 04d004ee2e8c..57502c3c0fba 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -581,7 +581,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
}
wait_event_interruptible(pipe->wait, pipe_writable(pipe));
__pipe_lock(pipe);
- was_empty = pipe_empty(head, pipe->tail);
+ was_empty = pipe_empty(pipe->head, pipe->tail);
}
out:
__pipe_unlock(pipe);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 84ad1c90d535..249672bf54fe 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -631,12 +631,15 @@ EXPORT_SYMBOL_GPL(posix_acl_create);
/**
* posix_acl_update_mode - update mode in set_acl
+ * @inode: target inode
+ * @mode_p: mode (pointer) for update
+ * @acl: acl pointer
*
* Update the file mode when setting an ACL: compute the new file permission
* bits based on the ACL. In addition, if the ACL is equivalent to the new
- * file mode, set *acl to NULL to indicate that no ACL should be set.
+ * file mode, set *@acl to NULL to indicate that no ACL should be set.
*
- * As with chmod, clear the setgit bit if the caller is not in the owning group
+ * As with chmod, clear the setgid bit if the caller is not in the owning group
* or capable of CAP_FSETID (see inode_change_ok).
*
* Called from set_acl inode operations.
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 37bdbec5b402..fd931d3e77be 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -134,7 +134,7 @@ static int show_stat(struct seq_file *p, void *v)
softirq += cpustat[CPUTIME_SOFTIRQ];
steal += cpustat[CPUTIME_STEAL];
guest += cpustat[CPUTIME_GUEST];
- guest_nice += cpustat[CPUTIME_USER];
+ guest_nice += cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
@@ -175,7 +175,7 @@ static int show_stat(struct seq_file *p, void *v)
softirq = cpustat[CPUTIME_SOFTIRQ];
steal = cpustat[CPUTIME_STEAL];
guest = cpustat[CPUTIME_GUEST];
- guest_nice = cpustat[CPUTIME_USER];
+ guest_nice = cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 8caff834f002..013486b5125e 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -407,6 +407,17 @@ static int notrace ramoops_pstore_write(struct pstore_record *record)
prz = cxt->dprzs[cxt->dump_write_cnt];
+ /*
+ * Since this is a new crash dump, we need to reset the buffer in
+ * case it still has an old dump present. Without this, the new dump
+ * will get appended, which would seriously confuse anything trying
+ * to check dump file contents. Specifically, ramoops_read_kmsg_hdr()
+ * expects to find a dump header in the beginning of buffer data, so
+ * we must to reset the buffer values, in order to ensure that the
+ * header will be written to the beginning of the buffer.
+ */
+ persistent_ram_zap(prz);
+
/* Build header and append record contents. */
hlen = ramoops_write_kmsg_hdr(prz, record);
if (!hlen)
@@ -572,6 +583,7 @@ static int ramoops_init_przs(const char *name,
prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig,
&cxt->ecc_info,
cxt->memtype, flags, label);
+ kfree(label);
if (IS_ERR(prz_ar[i])) {
err = PTR_ERR(prz_ar[i]);
dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n",
@@ -617,6 +629,7 @@ static int ramoops_init_prz(const char *name,
label = kasprintf(GFP_KERNEL, "ramoops:%s", name);
*prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info,
cxt->memtype, PRZ_FLAG_ZAP_OLD, label);
+ kfree(label);
if (IS_ERR(*prz)) {
int err = PTR_ERR(*prz);
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 8823f65888f0..1f4d8c06f9be 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -574,7 +574,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
/* Initialize general buffer state. */
raw_spin_lock_init(&prz->buffer_lock);
prz->flags = flags;
- prz->label = label;
+ prz->label = kstrdup(label, GFP_KERNEL);
ret = persistent_ram_buffer_map(start, size, prz, memtype);
if (ret)
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index b0688c02dc90..b6a4f692d345 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -984,6 +984,7 @@ static int add_dquot_ref(struct super_block *sb, int type)
* later.
*/
old_inode = inode;
+ cond_resched();
spin_lock(&sb->s_inode_list_lock);
}
spin_unlock(&sb->s_inode_list_lock);
diff --git a/fs/super.c b/fs/super.c
index cfadab2cbf35..cd352530eca9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -448,10 +448,12 @@ void generic_shutdown_super(struct super_block *sb)
sync_filesystem(sb);
sb->s_flags &= ~SB_ACTIVE;
- fsnotify_sb_delete(sb);
cgroup_writeback_umount();
+ /* evict all inodes with zero refcount */
evict_inodes(sb);
+ /* only nonzero refcount inodes can have marks */
+ fsnotify_sb_delete(sb);
if (sb->s_dio_done_wq) {
destroy_workqueue(sb->s_dio_done_wq);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index c284e10af491..fc93fd88ec89 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2248,24 +2248,32 @@ xfs_alloc_longest_free_extent(
return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}
+/*
+ * Compute the minimum length of the AGFL in the given AG. If @pag is NULL,
+ * return the largest possible minimum length.
+ */
unsigned int
xfs_alloc_min_freelist(
struct xfs_mount *mp,
struct xfs_perag *pag)
{
+ /* AG btrees have at least 1 level. */
+ static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
+ const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
unsigned int min_free;
+ ASSERT(mp->m_ag_maxlevels > 0);
+
/* space needed by-bno freespace btree */
- min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1,
+ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
mp->m_ag_maxlevels);
/* space needed by-size freespace btree */
- min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
+ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
mp->m_ag_maxlevels);
/* space needed reverse mapping used space btree */
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
- min_free += min_t(unsigned int,
- pag->pagf_levels[XFS_BTNUM_RMAPi] + 1,
- mp->m_rmap_maxlevels);
+ min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
+ mp->m_rmap_maxlevels);
return min_free;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index a9ad1f991ba3..4c2e046fbfad 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4561,7 +4561,7 @@ xfs_bmapi_convert_delalloc(
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmalloca bma = { NULL };
- u16 flags = 0;
+ uint16_t flags = 0;
struct xfs_trans *tp;
int error;
@@ -5972,8 +5972,7 @@ xfs_bmap_insert_extents(
goto del_cursor;
}
- if (XFS_IS_CORRUPT(mp,
- stop_fsb >= got.br_startoff + got.br_blockcount)) {
+ if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) {
error = -EFSCORRUPTED;
goto del_cursor;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 0aa87cbde49e..dd6fcaaea318 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -724,3 +724,24 @@ xfs_dir2_namecheck(
/* There shouldn't be any slashes or nulls here */
return !memchr(name, '/', length) && !memchr(name, 0, length);
}
+
+xfs_dahash_t
+xfs_dir2_hashname(
+ struct xfs_mount *mp,
+ struct xfs_name *name)
+{
+ if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb)))
+ return xfs_ascii_ci_hashname(name);
+ return xfs_da_hashname(name->name, name->len);
+}
+
+enum xfs_dacmp
+xfs_dir2_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb)))
+ return xfs_ascii_ci_compname(args, name, len);
+ return xfs_da_compname(args, name, len);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index c031c53d0f0d..01ee0b926572 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -175,6 +175,12 @@ extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip);
+int xfs_dir2_sf_entsize(struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr, int len);
+void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino);
+void xfs_dir2_sf_put_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep, uint8_t ftype);
/* xfs_dir2_readdir.c */
extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp,
@@ -194,25 +200,8 @@ xfs_dir2_data_entsize(
return round_up(len, XFS_DIR2_DATA_ALIGN);
}
-static inline xfs_dahash_t
-xfs_dir2_hashname(
- struct xfs_mount *mp,
- struct xfs_name *name)
-{
- if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb)))
- return xfs_ascii_ci_hashname(name);
- return xfs_da_hashname(name->name, name->len);
-}
-
-static inline enum xfs_dacmp
-xfs_dir2_compname(
- struct xfs_da_args *args,
- const unsigned char *name,
- int len)
-{
- if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb)))
- return xfs_ascii_ci_compname(args, name, len);
- return xfs_da_compname(args, name, len);
-}
+xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name);
+enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 8b94d33d232f..7b7f6fb2ea3b 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -37,7 +37,7 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args);
static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
-static int
+int
xfs_dir2_sf_entsize(
struct xfs_mount *mp,
struct xfs_dir2_sf_hdr *hdr,
@@ -84,7 +84,7 @@ xfs_dir2_sf_get_ino(
return get_unaligned_be64(from) & XFS_MAXINUMBER;
}
-static void
+void
xfs_dir2_sf_put_ino(
struct xfs_mount *mp,
struct xfs_dir2_sf_hdr *hdr,
@@ -145,7 +145,7 @@ xfs_dir2_sf_get_ftype(
return XFS_DIR3_FT_UNKNOWN;
}
-static void
+void
xfs_dir2_sf_put_ftype(
struct xfs_mount *mp,
struct xfs_dir2_sf_entry *sfep,
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 988cde7744e6..5b759af4d165 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2909,3 +2909,67 @@ xfs_ialloc_setup_geometry(
else
igeo->ialloc_align = 0;
}
+
+/* Compute the location of the root directory inode that is laid out by mkfs. */
+xfs_ino_t
+xfs_ialloc_calc_rootino(
+ struct xfs_mount *mp,
+ int sunit)
+{
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agblock_t first_bno;
+
+ /*
+ * Pre-calculate the geometry of AG 0. We know what it looks like
+ * because libxfs knows how to create allocation groups now.
+ *
+ * first_bno is the first block in which mkfs could possibly have
+ * allocated the root directory inode, once we factor in the metadata
+ * that mkfs formats before it. Namely, the four AG headers...
+ */
+ first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
+
+ /* ...the two free space btree roots... */
+ first_bno += 2;
+
+ /* ...the inode btree root... */
+ first_bno += 1;
+
+ /* ...the initial AGFL... */
+ first_bno += xfs_alloc_min_freelist(mp, NULL);
+
+ /* ...the free inode btree root... */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ first_bno++;
+
+ /* ...the reverse mapping btree root... */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ first_bno++;
+
+ /* ...the reference count btree... */
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ first_bno++;
+
+ /*
+ * ...and the log, if it is allocated in the first allocation group.
+ *
+ * This can happen with filesystems that only have a single
+ * allocation group, or very odd geometries created by old mkfs
+ * versions on very small filesystems.
+ */
+ if (mp->m_sb.sb_logstart &&
+ XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0)
+ first_bno += mp->m_sb.sb_logblocks;
+
+ /*
+ * Now round first_bno up to whatever allocation alignment is given
+ * by the filesystem or was passed in.
+ */
+ if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0)
+ first_bno = roundup(first_bno, sunit);
+ else if (xfs_sb_version_hasalign(&mp->m_sb) &&
+ mp->m_sb.sb_inoalignmt > 1)
+ first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
+
+ return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 323592d563d5..72b3468b97b1 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -152,5 +152,6 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
+xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index c55cd9a3dec9..7a9c04920505 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -197,6 +197,24 @@ xfs_calc_inode_chunk_res(
}
/*
+ * Per-extent log reservation for the btree changes involved in freeing or
+ * allocating a realtime extent. We have to be able to log as many rtbitmap
+ * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents,
+ * as well as the realtime summary block.
+ */
+static unsigned int
+xfs_rtalloc_log_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+ unsigned int rtbmp_bytes;
+
+ rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY;
+ return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
+}
+
+/*
* Various log reservation values.
*
* These are based on the size of the file system block because that is what
@@ -218,13 +236,21 @@ xfs_calc_inode_chunk_res(
/*
* In a write transaction we can allocate a maximum of 2
- * extents. This gives:
+ * extents. This gives (t1):
* the inode getting the new extents: inode size
* the inode's bmap btree: max depth * block size
* the agfs of the ags from which the extents are allocated: 2 * sector
* the superblock free block counter: sector size
* the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
+ * Or, if we're writing to a realtime file (t2):
+ * the inode getting the new extents: inode size
+ * the inode's bmap btree: max depth * block size
+ * the agfs of the ags from which the extents are allocated: 2 * sector
+ * the superblock free block counter: sector size
+ * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime summary: 1 block
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join (t3):
* the agfs of the ags containing the blocks: 2 * sector size
* the agfls of the ags containing the blocks: 2 * sector size
* the super block free block counter: sector size
@@ -234,40 +260,72 @@ STATIC uint
xfs_calc_write_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
- max((xfs_calc_inode_res(mp, 1) +
+ unsigned int t1, t2, t3;
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ t1 = xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) +
+ xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+
+ if (xfs_sb_version_hasrealtime(&mp->m_sb)) {
+ t2 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
- XFS_FSB_TO_B(mp, 1)) +
+ blksz) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
- XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
- XFS_FSB_TO_B(mp, 1))));
+ xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz);
+ } else {
+ t2 = 0;
+ }
+
+ t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
}
/*
- * In truncating a file we free up to two extents at once. We can modify:
+ * In truncating a file we free up to two extents at once. We can modify (t1):
* the inode being truncated: inode size
* the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * And the bmap_finish transaction can free the blocks and bmap blocks (t2):
* the agf for each of the ags: 4 * sector size
* the agfl for each of the ags: 4 * sector size
* the super block to reflect the freed blocks: sector size
* worst case split in allocation btrees per extent assuming 4 extents:
* 4 exts * 2 trees * (2 * max depth - 1) * block size
+ * Or, if it's a realtime file (t3):
+ * the agf for each of the ags: 2 * sector size
+ * the agfl for each of the ags: 2 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime summary: 2 exts * 1 block
+ * worst case split in allocation btrees per extent assuming 2 extents:
+ * 2 exts * 2 trees * (2 * max depth - 1) * block size
*/
STATIC uint
xfs_calc_itruncate_reservation(
struct xfs_mount *mp)
{
- return XFS_DQUOT_LOGRES(mp) +
- max((xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
- XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
- XFS_FSB_TO_B(mp, 1))));
+ unsigned int t1, t2, t3;
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ t1 = xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
+
+ t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz);
+
+ if (xfs_sb_version_hasrealtime(&mp->m_sb)) {
+ t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ } else {
+ t3 = 0;
+ }
+
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
}
/*
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 2efd78a9719e..e62fb5216341 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -992,6 +992,7 @@ xfs_prepare_shift(
struct xfs_inode *ip,
loff_t offset)
{
+ struct xfs_mount *mp = ip->i_mount;
int error;
/*
@@ -1005,6 +1006,17 @@ xfs_prepare_shift(
}
/*
+ * Shift operations must stabilize the start block offset boundary along
+ * with the full range of the operation. If we don't, a COW writeback
+ * completion could race with an insert, front merge with the start
+ * extent (after split) during the shift and corrupt the file. Start
+ * with the block just prior to the start to stabilize the boundary.
+ */
+ offset = round_down(offset, 1 << mp->m_sb.sb_blocklog);
+ if (offset)
+ offset -= (1 << mp->m_sb.sb_blocklog);
+
+ /*
* Writeback and invalidate cache for the remainder of the file as we're
* about to shift down every extent from offset to EOF.
*/
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3458a1264a3f..3984779e5911 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -956,7 +956,7 @@ xfs_buf_item_relse(
struct xfs_buf_log_item *bip = bp->b_log_item;
trace_xfs_buf_item_relse(bp, _RET_IP_);
- ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
bp->b_log_item = NULL;
if (list_empty(&bp->b_li_list))
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index fca65109cf24..56efe140c923 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -31,7 +31,7 @@
#include "xfs_reflink.h"
#include "xfs_extent_busy.h"
#include "xfs_health.h"
-
+#include "xfs_trace.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -360,66 +360,119 @@ release_buf:
}
/*
- * Update alignment values based on mount options and sb values
+ * If the sunit/swidth change would move the precomputed root inode value, we
+ * must reject the ondisk change because repair will stumble over that.
+ * However, we allow the mount to proceed because we never rejected this
+ * combination before. Returns true to update the sb, false otherwise.
+ */
+static inline int
+xfs_check_new_dalign(
+ struct xfs_mount *mp,
+ int new_dalign,
+ bool *update_sb)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ xfs_ino_t calc_ino;
+
+ calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign);
+ trace_xfs_check_new_dalign(mp, new_dalign, calc_ino);
+
+ if (sbp->sb_rootino == calc_ino) {
+ *update_sb = true;
+ return 0;
+ }
+
+ xfs_warn(mp,
+"Cannot change stripe alignment; would require moving root inode.");
+
+ /*
+ * XXX: Next time we add a new incompat feature, this should start
+ * returning -EINVAL to fail the mount. Until then, spit out a warning
+ * that we're ignoring the administrator's instructions.
+ */
+ xfs_warn(mp, "Skipping superblock stripe alignment update.");
+ *update_sb = false;
+ return 0;
+}
+
+/*
+ * If we were provided with new sunit/swidth values as mount options, make sure
+ * that they pass basic alignment and superblock feature checks, and convert
+ * them into the same units (FSB) that everything else expects. This step
+ * /must/ be done before computing the inode geometry.
*/
STATIC int
-xfs_update_alignment(xfs_mount_t *mp)
+xfs_validate_new_dalign(
+ struct xfs_mount *mp)
{
- xfs_sb_t *sbp = &(mp->m_sb);
+ if (mp->m_dalign == 0)
+ return 0;
- if (mp->m_dalign) {
+ /*
+ * If stripe unit and stripe width are not multiples
+ * of the fs blocksize turn off alignment.
+ */
+ if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
+ (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. blocksize(%d)",
+ mp->m_sb.sb_blocksize);
+ return -EINVAL;
+ } else {
/*
- * If stripe unit and stripe width are not multiples
- * of the fs blocksize turn off alignment.
+ * Convert the stripe unit and width to FSBs.
*/
- if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
- (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+ if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
xfs_warn(mp,
- "alignment check failed: sunit/swidth vs. blocksize(%d)",
- sbp->sb_blocksize);
+ "alignment check failed: sunit/swidth vs. agsize(%d)",
+ mp->m_sb.sb_agblocks);
return -EINVAL;
- } else {
- /*
- * Convert the stripe unit and width to FSBs.
- */
- mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
- if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
- xfs_warn(mp,
- "alignment check failed: sunit/swidth vs. agsize(%d)",
- sbp->sb_agblocks);
- return -EINVAL;
- } else if (mp->m_dalign) {
- mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
- } else {
- xfs_warn(mp,
- "alignment check failed: sunit(%d) less than bsize(%d)",
- mp->m_dalign, sbp->sb_blocksize);
- return -EINVAL;
- }
- }
-
- /*
- * Update superblock with new values
- * and log changes
- */
- if (xfs_sb_version_hasdalign(sbp)) {
- if (sbp->sb_unit != mp->m_dalign) {
- sbp->sb_unit = mp->m_dalign;
- mp->m_update_sb = true;
- }
- if (sbp->sb_width != mp->m_swidth) {
- sbp->sb_width = mp->m_swidth;
- mp->m_update_sb = true;
- }
+ } else if (mp->m_dalign) {
+ mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
} else {
xfs_warn(mp,
- "cannot change alignment: superblock does not support data alignment");
+ "alignment check failed: sunit(%d) less than bsize(%d)",
+ mp->m_dalign, mp->m_sb.sb_blocksize);
return -EINVAL;
}
+ }
+
+ if (!xfs_sb_version_hasdalign(&mp->m_sb)) {
+ xfs_warn(mp,
+"cannot change alignment: superblock does not support data alignment");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Update alignment values based on mount options and sb values. */
+STATIC int
+xfs_update_alignment(
+ struct xfs_mount *mp)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+
+ if (mp->m_dalign) {
+ bool update_sb;
+ int error;
+
+ if (sbp->sb_unit == mp->m_dalign &&
+ sbp->sb_width == mp->m_swidth)
+ return 0;
+
+ error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb);
+ if (error || !update_sb)
+ return error;
+
+ sbp->sb_unit = mp->m_dalign;
+ sbp->sb_width = mp->m_swidth;
+ mp->m_update_sb = true;
} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
xfs_sb_version_hasdalign(&mp->m_sb)) {
- mp->m_dalign = sbp->sb_unit;
- mp->m_swidth = sbp->sb_width;
+ mp->m_dalign = sbp->sb_unit;
+ mp->m_swidth = sbp->sb_width;
}
return 0;
@@ -648,12 +701,12 @@ xfs_mountfs(
}
/*
- * Check if sb_agblocks is aligned at stripe boundary
- * If sb_agblocks is NOT aligned turn off m_dalign since
- * allocator alignment is within an ag, therefore ag has
- * to be aligned at stripe boundary.
+ * If we were given new sunit/swidth options, do some basic validation
+ * checks and convert the incore dalign and swidth values to the
+ * same units (FSB) that everything else uses. This /must/ happen
+ * before computing the inode geometry.
*/
- error = xfs_update_alignment(mp);
+ error = xfs_validate_new_dalign(mp);
if (error)
goto out;
@@ -664,6 +717,17 @@ xfs_mountfs(
xfs_rmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
+ /*
+ * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
+ * is NOT aligned turn off m_dalign since allocator alignment is within
+ * an ag, therefore ag has to be aligned at stripe boundary. Note that
+ * we must compute the free space and rmap btree geometry before doing
+ * this.
+ */
+ error = xfs_update_alignment(mp);
+ if (error)
+ goto out;
+
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c13bb3655e48..a86be7f807ee 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3573,6 +3573,27 @@ DEFINE_KMEM_EVENT(kmem_alloc_large);
DEFINE_KMEM_EVENT(kmem_realloc);
DEFINE_KMEM_EVENT(kmem_zone_alloc);
+TRACE_EVENT(xfs_check_new_dalign,
+ TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
+ TP_ARGS(mp, new_dalign, calc_rootino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, new_dalign)
+ __field(xfs_ino_t, sb_rootino)
+ __field(xfs_ino_t, calc_rootino)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->new_dalign = new_dalign;
+ __entry->sb_rootino = mp->m_sb.sb_rootino;
+ __entry->calc_rootino = calc_rootino;
+ ),
+ TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->new_dalign, __entry->sb_rootino,
+ __entry->calc_rootino)
+)
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH