Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/disk-io.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 2
-rw-r--r--  fs/btrfs/extent_map.c | 11
-rw-r--r--  fs/btrfs/inode.c | 30
-rw-r--r--  fs/btrfs/ordered-data.c | 7
-rw-r--r--  fs/btrfs/qgroup.c | 13
-rw-r--r--  fs/btrfs/qgroup.h | 1
-rw-r--r--  fs/btrfs/ref-verify.c | 5
-rw-r--r--  fs/btrfs/super.c | 2
-rw-r--r--  fs/btrfs/sysfs.c | 17
-rw-r--r--  fs/btrfs/transaction.c | 2
-rw-r--r--  fs/btrfs/volumes.h | 1
-rw-r--r--  fs/ceph/file.c | 17
-rw-r--r--  fs/ceph/super.c | 129
-rw-r--r--  fs/ceph/super.h | 2
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 2
-rw-r--r--  fs/cifs/cifsacl.c | 4
-rw-r--r--  fs/cifs/cifsfs.c | 8
-rw-r--r--  fs/cifs/cifsglob.h | 7
-rw-r--r--  fs/cifs/cifsproto.h | 5
-rw-r--r--  fs/cifs/cifssmb.c | 3
-rw-r--r--  fs/cifs/connect.c | 2
-rw-r--r--  fs/cifs/file.c | 19
-rw-r--r--  fs/cifs/inode.c | 18
-rw-r--r--  fs/cifs/smb1ops.c | 2
-rw-r--r--  fs/cifs/smb2inode.c | 4
-rw-r--r--  fs/cifs/smb2ops.c | 39
-rw-r--r--  fs/cifs/smb2pdu.c | 1
-rw-r--r--  fs/crypto/keysetup.c | 9
-rw-r--r--  fs/dax.c | 11
-rw-r--r--  fs/debugfs/file.c | 17
-rw-r--r--  fs/ecryptfs/crypto.c | 6
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r--  fs/ecryptfs/keystore.c | 2
-rw-r--r--  fs/ecryptfs/main.c | 2
-rw-r--r--  fs/ecryptfs/messaging.c | 3
-rw-r--r--  fs/ext2/inode.c | 5
-rw-r--r--  fs/ext4/balloc.c | 14
-rw-r--r--  fs/ext4/block_validity.c | 1
-rw-r--r--  fs/ext4/dir.c | 14
-rw-r--r--  fs/ext4/ext4.h | 44
-rw-r--r--  fs/ext4/ialloc.c | 23
-rw-r--r--  fs/ext4/inode.c | 30
-rw-r--r--  fs/ext4/mballoc.c | 61
-rw-r--r--  fs/ext4/migrate.c | 27
-rw-r--r--  fs/ext4/mmp.c | 12
-rw-r--r--  fs/ext4/namei.c | 8
-rw-r--r--  fs/ext4/resize.c | 62
-rw-r--r--  fs/ext4/super.c | 153
-rw-r--r--  fs/fat/inode.c | 19
-rw-r--r--  fs/fcntl.c | 6
-rw-r--r--  fs/io-wq.c | 139
-rw-r--r--  fs/io-wq.h | 18
-rw-r--r--  fs/io_uring.c | 507
-rw-r--r--  fs/jbd2/commit.c | 46
-rw-r--r--  fs/jbd2/transaction.c | 26
-rw-r--r--  fs/locks.c | 14
-rw-r--r--  fs/nfs/delegation.c | 50
-rw-r--r--  fs/nfs/delegation.h | 1
-rw-r--r--  fs/nfs/dir.c | 126
-rw-r--r--  fs/nfs/inode.c | 1
-rw-r--r--  fs/nfs/nfs4file.c | 1
-rw-r--r--  fs/nfs/nfs4proc.c | 20
-rw-r--r--  fs/pipe.c | 18
-rw-r--r--  fs/xfs/xfs_aops.c | 2
-rw-r--r--  fs/zonefs/Kconfig | 1
-rw-r--r--  fs/zonefs/super.c | 8
67 files changed, 1197 insertions, 669 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7fa9bb79ad08..c6c9a6a8e6c8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3164,6 +3164,7 @@ int __cold open_ctree(struct super_block *sb,
/* do not make disk changes in broken FS or nologreplay is given */
if (btrfs_super_log_root(disk_super) != 0 &&
!btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+ btrfs_info(fs_info, "start tree-log replay");
ret = btrfs_replay_log(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -3199,6 +3200,7 @@ int __cold open_ctree(struct super_block *sb,
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
btrfs_warn(fs_info, "failed to read fs tree: %d", err);
+ fs_info->fs_root = NULL;
goto fail_qgroup;
}
@@ -4275,6 +4277,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
cond_resched();
spin_lock(&delayed_refs->lock);
}
+ btrfs_qgroup_destroy_extent_records(trans);
spin_unlock(&delayed_refs->lock);
@@ -4500,7 +4503,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
wake_up(&fs_info->transaction_wait);
btrfs_destroy_delayed_inodes(fs_info);
- btrfs_assert_delayed_root_empty(fs_info);
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
EXTENT_DIRTY);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0163fdd59f8f..a7bc66121330 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4430,6 +4430,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
offset, ins, 1);
+ if (ret)
+ btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1);
btrfs_put_block_group(block_group);
return ret;
}
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6f417ff68980..bd6229fb2b6f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -237,6 +237,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
struct extent_map *merge = NULL;
struct rb_node *rb;
+ /*
+ * We can't modify an extent map that is in the tree and that is being
+ * used by another task, as it can cause that other task to see it in
+ * inconsistent state during the merging. We always have 1 reference for
+ * the tree and 1 for this task (which is unpinning the extent map or
+ * clearing the logging flag), so anything > 2 means it's being used by
+ * other tasks too.
+ */
+ if (refcount_read(&em->refs) > 2)
+ return;
+
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
if (rb)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b3ec93ff911..27076ebadb36 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4085,6 +4085,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 bytes_deleted = 0;
bool be_nice = false;
bool should_throttle = false;
+ const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
+ struct extent_state *cached_state = NULL;
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
@@ -4101,6 +4103,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = READA_BACK;
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ &cached_state);
+
/*
* We want to drop from the next block forward in case this new size is
* not block aligned since we will be keeping the last block of the
@@ -4137,7 +4143,6 @@ search_again:
goto out;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
@@ -4289,7 +4294,6 @@ delete:
root == fs_info->tree_root)) {
struct btrfs_ref ref = { 0 };
- btrfs_set_path_blocking(path);
bytes_deleted += extent_num_bytes;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
@@ -4365,6 +4369,8 @@ out:
if (!ret && last_size > new_size)
last_size = new_size;
btrfs_ordered_update_i_size(inode, last_size, NULL);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
+ (u64)-1, &cached_state);
}
btrfs_free_path(path);
@@ -7777,6 +7783,7 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
{
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
+ u16 csum_size;
blk_status_t ret;
/*
@@ -7796,7 +7803,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
file_offset -= dip->logical_offset;
file_offset >>= inode->i_sb->s_blocksize_bits;
- io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
+ csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy);
+ io_bio->csum = orig_io_bio->csum + csum_size * file_offset;
return 0;
}
@@ -9818,6 +9826,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
+ u64 clear_offset = start;
u64 i_size;
u64 cur_bytes;
u64 last_alloc = (u64)-1;
@@ -9852,6 +9861,15 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_end_transaction(trans);
break;
}
+
+ /*
+ * We've reserved this space, and thus converted it from
+ * ->bytes_may_use to ->bytes_reserved. Any error that happens
+ * from here on out we will only need to clear our reservation
+ * for the remaining unreserved area, so advance our
+ * clear_offset by our extent size.
+ */
+ clear_offset += ins.offset;
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
@@ -9931,9 +9949,9 @@ next:
if (own_trans)
btrfs_end_transaction(trans);
}
- if (cur_offset < end)
- btrfs_free_reserved_data_space(inode, NULL, cur_offset,
- end - cur_offset + 1);
+ if (clear_offset < end)
+ btrfs_free_reserved_data_space(inode, NULL, clear_offset,
+ end - clear_offset + 1);
return ret;
}
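
The dio csum hunk above changes the stride of the per-block checksum lookup from a hard-coded sizeof(u32) to the superblock's csum_size; the old pointer math is only correct when the checksum is 4 bytes (crc32c). A minimal user-space sketch of the two computations, assuming a 32-byte checksum size purely for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t csums[256] = { 0 };	/* per-bio checksum array (contents irrelevant here) */
	uint16_t csum_size = 32;	/* assumed checksum size, e.g. a 32-byte digest */
	uint64_t file_offset = 3;	/* block index of the partial bio within the original bio */

	/* old: stride hard-coded to sizeof(u32) == 4 bytes */
	uint8_t *old_ptr = (uint8_t *)(((uint32_t *)csums) + file_offset);
	/* new: stride follows the configured checksum size */
	uint8_t *new_ptr = csums + (size_t)csum_size * file_offset;

	/* prints "old offset 12, new offset 96" */
	printf("old offset %td, new offset %td\n", old_ptr - csums, new_ptr - csums);
	return 0;
}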
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ecb9fb6a6fe0..a65f189a5b94 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -679,10 +679,15 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
}
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
+ /*
+ * If the ordered extent had an error save the error but don't
+ * exit without waiting first for all other ordered extents in
+ * the range to complete.
+ */
if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
ret = -EIO;
btrfs_put_ordered_extent(ordered);
- if (ret || end == 0 || end == start)
+ if (end == 0 || end == start)
break;
end--;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 98d9a50352d6..ff1870ff3474 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -4002,3 +4002,16 @@ out:
}
return ret;
}
+
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
+{
+ struct btrfs_qgroup_extent_record *entry;
+ struct btrfs_qgroup_extent_record *next;
+ struct rb_root *root;
+
+ root = &trans->delayed_refs.dirty_extent_root;
+ rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
+ ulist_free(entry->old_roots);
+ kfree(entry);
+ }
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 236f12224d52..1bc654459469 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -414,5 +414,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
u64 last_snapshot);
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
#endif
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index b57f3618e58e..454a1015d026 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -744,6 +744,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
*/
be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
if (IS_ERR(be)) {
+ kfree(ref);
kfree(ra);
ret = PTR_ERR(be);
goto out;
@@ -757,6 +758,8 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"re-allocated a block that still has references to it!");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
goto out_unlock;
}
@@ -819,6 +822,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"dropping a ref for a existing root that doesn't have a ref on the block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
kfree(ra);
goto out_unlock;
}
@@ -834,6 +838,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"attempting to add another ref for an existing ref on a tree block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
kfree(ra);
goto out_unlock;
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0616a5434793..67c63858812a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1834,6 +1834,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ btrfs_warn(fs_info,
+ "mount required to replay tree-log, cannot remount read-write");
ret = -EINVAL;
goto restore;
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 7436422194da..3c10e78924d0 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -901,6 +901,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
{
+ if (fs_devs->devinfo_kobj) {
+ kobject_del(fs_devs->devinfo_kobj);
+ kobject_put(fs_devs->devinfo_kobj);
+ fs_devs->devinfo_kobj = NULL;
+ }
+
if (fs_devs->devices_kobj) {
kobject_del(fs_devs->devices_kobj);
kobject_put(fs_devs->devices_kobj);
@@ -1289,7 +1295,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
init_completion(&dev->kobj_unregister);
error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype,
- fs_devices->devices_kobj, "%llu",
+ fs_devices->devinfo_kobj, "%llu",
dev->devid);
if (error) {
kobject_put(&dev->devid_kobj);
@@ -1369,6 +1375,15 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
return -ENOMEM;
}
+ fs_devs->devinfo_kobj = kobject_create_and_add("devinfo",
+ &fs_devs->fsid_kobj);
+ if (!fs_devs->devinfo_kobj) {
+ btrfs_err(fs_devs->fs_info,
+ "failed to init sysfs devinfo kobject");
+ btrfs_sysfs_remove_fsid(fs_devs);
+ return -ENOMEM;
+ }
+
return 0;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 33dcc88b428a..beb6c69cd1e5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -121,6 +121,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!RB_EMPTY_ROOT(
&transaction->delayed_refs.href_root.rb_root));
+ WARN_ON(!RB_EMPTY_ROOT(
+ &transaction->delayed_refs.dirty_extent_root));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
"pending csums is %llu",
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 409f4816fb89..f01552a0785e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -258,6 +258,7 @@ struct btrfs_fs_devices {
/* sysfs kobjects */
struct kobject fsid_kobj;
struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
struct completion kobj_unregister;
};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c3b8e8e0bf17..7e0190b1f821 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1418,6 +1418,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
int err, want, got;
+ bool direct_lock = false;
loff_t pos;
loff_t limit = max(i_size_read(inode), fsc->max_file_size);
@@ -1428,8 +1429,11 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!prealloc_cf)
return -ENOMEM;
+ if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT)
+ direct_lock = true;
+
retry_snap:
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_start_io_direct(inode);
else
ceph_start_io_write(inode);
@@ -1519,14 +1523,15 @@ retry_snap:
/* we might need to revert back to that point */
data = *from;
- if (iocb->ki_flags & IOCB_DIRECT) {
+ if (iocb->ki_flags & IOCB_DIRECT)
written = ceph_direct_read_write(iocb, &data, snapc,
&prealloc_cf);
- ceph_end_io_direct(inode);
- } else {
+ else
written = ceph_sync_write(iocb, &data, pos, snapc);
+ if (direct_lock)
+ ceph_end_io_direct(inode);
+ else
ceph_end_io_write(inode);
- }
if (written > 0)
iov_iter_advance(from, written);
ceph_put_snap_context(snapc);
@@ -1577,7 +1582,7 @@ retry_snap:
goto out_unlocked;
out:
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_end_io_direct(inode);
else
ceph_end_io_write(inode);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 1d9f083b8a11..c7f150686a53 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -203,6 +203,26 @@ struct ceph_parse_opts_ctx {
};
/*
+ * Remove adjacent slashes and then the trailing slash, unless it is
+ * the only remaining character.
+ *
+ * E.g. "//dir1////dir2///" --> "/dir1/dir2", "///" --> "/".
+ */
+static void canonicalize_path(char *path)
+{
+ int i, j = 0;
+
+ for (i = 0; path[i] != '\0'; i++) {
+ if (path[i] != '/' || j < 1 || path[j - 1] != '/')
+ path[j++] = path[i];
+ }
+
+ if (j > 1 && path[j - 1] == '/')
+ j--;
+ path[j] = '\0';
+}
+
+/*
* Parse the source parameter. Distinguish the server list from the path.
*
* The source will look like:
@@ -224,15 +244,16 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
dev_name_end = strchr(dev_name, '/');
if (dev_name_end) {
- kfree(fsopt->server_path);
-
/*
* The server_path will include the whole chars from userland
* including the leading '/'.
*/
+ kfree(fsopt->server_path);
fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
if (!fsopt->server_path)
return -ENOMEM;
+
+ canonicalize_path(fsopt->server_path);
} else {
dev_name_end = dev_name + strlen(dev_name);
}
@@ -456,73 +477,6 @@ static int strcmp_null(const char *s1, const char *s2)
return strcmp(s1, s2);
}
-/**
- * path_remove_extra_slash - Remove the extra slashes in the server path
- * @server_path: the server path and could be NULL
- *
- * Return NULL if the path is NULL or only consists of "/", or a string
- * without any extra slashes including the leading slash(es) and the
- * slash(es) at the end of the server path, such as:
- * "//dir1////dir2///" --> "dir1/dir2"
- */
-static char *path_remove_extra_slash(const char *server_path)
-{
- const char *path = server_path;
- const char *cur, *end;
- char *buf, *p;
- int len;
-
- /* if the server path is omitted */
- if (!path)
- return NULL;
-
- /* remove all the leading slashes */
- while (*path == '/')
- path++;
-
- /* if the server path only consists of slashes */
- if (*path == '\0')
- return NULL;
-
- len = strlen(path);
-
- buf = kmalloc(len + 1, GFP_KERNEL);
- if (!buf)
- return ERR_PTR(-ENOMEM);
-
- end = path + len;
- p = buf;
- do {
- cur = strchr(path, '/');
- if (!cur)
- cur = end;
-
- len = cur - path;
-
- /* including one '/' */
- if (cur != end)
- len += 1;
-
- memcpy(p, path, len);
- p += len;
-
- while (cur <= end && *cur == '/')
- cur++;
- path = cur;
- } while (path < end);
-
- *p = '\0';
-
- /*
- * remove the last slash if there has and just to make sure that
- * we will get something like "dir1/dir2"
- */
- if (*(--p) == '/')
- *p = '\0';
-
- return buf;
-}
-
static int compare_mount_options(struct ceph_mount_options *new_fsopt,
struct ceph_options *new_opt,
struct ceph_fs_client *fsc)
@@ -530,7 +484,6 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
struct ceph_mount_options *fsopt1 = new_fsopt;
struct ceph_mount_options *fsopt2 = fsc->mount_options;
int ofs = offsetof(struct ceph_mount_options, snapdir_name);
- char *p1, *p2;
int ret;
ret = memcmp(fsopt1, fsopt2, ofs);
@@ -540,21 +493,12 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
if (ret)
return ret;
+
ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
if (ret)
return ret;
- p1 = path_remove_extra_slash(fsopt1->server_path);
- if (IS_ERR(p1))
- return PTR_ERR(p1);
- p2 = path_remove_extra_slash(fsopt2->server_path);
- if (IS_ERR(p2)) {
- kfree(p1);
- return PTR_ERR(p2);
- }
- ret = strcmp_null(p1, p2);
- kfree(p1);
- kfree(p2);
+ ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
if (ret)
return ret;
@@ -957,7 +901,9 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
mutex_lock(&fsc->client->mount_mutex);
if (!fsc->sb->s_root) {
- const char *path, *p;
+ const char *path = fsc->mount_options->server_path ?
+ fsc->mount_options->server_path + 1 : "";
+
err = __ceph_open_session(fsc->client, started);
if (err < 0)
goto out;
@@ -969,22 +915,11 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
goto out;
}
- p = path_remove_extra_slash(fsc->mount_options->server_path);
- if (IS_ERR(p)) {
- err = PTR_ERR(p);
- goto out;
- }
- /* if the server path is omitted or just consists of '/' */
- if (!p)
- path = "";
- else
- path = p;
dout("mount opening path '%s'\n", path);
ceph_fs_debugfs_init(fsc);
root = open_root_dentry(fsc, path, started);
- kfree(p);
if (IS_ERR(root)) {
err = PTR_ERR(root);
goto out;
@@ -1097,10 +1032,6 @@ static int ceph_get_tree(struct fs_context *fc)
if (!fc->source)
return invalfc(fc, "No source");
-#ifdef CONFIG_CEPH_FS_POSIX_ACL
- fc->sb_flags |= SB_POSIXACL;
-#endif
-
/* create client (which we may/may not use) */
fsc = create_fs_client(pctx->opts, pctx->copts);
pctx->opts = NULL;
@@ -1223,6 +1154,10 @@ static int ceph_init_fs_context(struct fs_context *fc)
fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
fsopt->congestion_kb = default_congestion_kb();
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ fc->sb_flags |= SB_POSIXACL;
+#endif
+
fc->fs_private = pctx;
fc->ops = &ceph_context_ops;
return 0;
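
The canonicalize_path() helper added in the fs/ceph/super.c hunk above is small enough to exercise on its own. The following user-space sketch mirrors the in-kernel algorithm and checks the two examples given in its comment ("//dir1////dir2///" -> "/dir1/dir2", "///" -> "/"); it is an illustration, not part of the patch:

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Collapse adjacent slashes, then drop a trailing slash unless it is the
 * only remaining character (mirrors the in-kernel helper above). */
static void canonicalize_path(char *path)
{
	int i, j = 0;

	for (i = 0; path[i] != '\0'; i++) {
		if (path[i] != '/' || j < 1 || path[j - 1] != '/')
			path[j++] = path[i];
	}

	if (j > 1 && path[j - 1] == '/')
		j--;
	path[j] = '\0';
}

int main(void)
{
	char a[] = "//dir1////dir2///";
	char b[] = "///";

	canonicalize_path(a);
	canonicalize_path(b);
	assert(strcmp(a, "/dir1/dir2") == 0);
	assert(strcmp(b, "/") == 0);
	printf("'%s' '%s'\n", a, b);
	return 0;
}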
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1e456a9011bb..037cdfb2ad4f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -91,7 +91,7 @@ struct ceph_mount_options {
char *snapdir_name; /* default ".snap" */
char *mds_namespace; /* default NULL */
- char *server_path; /* default "/" */
+ char *server_path; /* default NULL (means "/") */
char *fscache_uniq; /* default NULL */
};
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 606f26d862dc..cc3ada12848d 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -324,6 +324,8 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
if (full_path == NULL)
goto cdda_exit;
+ convert_delimiter(full_path, '\\');
+
cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
if (!cifs_sb_master_tlink(cifs_sb)) {
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 440828afcdde..716574aab3b6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -601,7 +601,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
*pmode |= (S_IXUGO & (*pbits_to_set));
- cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode);
+ cifs_dbg(NOISY, "access flags 0x%x mode now %04o\n", flags, *pmode);
return;
}
@@ -630,7 +630,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
if (mode & S_IXUGO)
*pace_flags |= SET_FILE_EXEC_RIGHTS;
- cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n",
+ cifs_dbg(NOISY, "mode: %04o, access flags now 0x%x\n",
mode, *pace_flags);
return;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index febab27cd838..fa77fe5258b0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -414,7 +414,7 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
seq_puts(s, "ntlm");
break;
case Kerberos:
- seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns,ses->cred_uid));
+ seq_puts(s, "krb5");
break;
case RawNTLMSSP:
seq_puts(s, "ntlmssp");
@@ -427,6 +427,10 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
if (ses->sign)
seq_puts(s, "i");
+
+ if (ses->sectype == Kerberos)
+ seq_printf(s, ",cruid=%u",
+ from_kuid_munged(&init_user_ns, ses->cred_uid));
}
static void
@@ -526,6 +530,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
if (tcon->seal)
seq_puts(s, ",seal");
+ else if (tcon->ses->server->ignore_signature)
+ seq_puts(s, ",signloosely");
if (tcon->nocase)
seq_puts(s, ",nocase");
if (tcon->local_lease)
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index de82cfa44b1a..0d956360e984 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1281,6 +1281,7 @@ struct cifs_fid {
__u64 volatile_fid; /* volatile file id for smb2 */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
__u8 create_guid[16];
+ __u32 access;
struct cifs_pending_open *pending_open;
unsigned int epoch;
#ifdef CONFIG_CIFS_DEBUG2
@@ -1741,6 +1742,12 @@ static inline bool is_retryable_error(int error)
return false;
}
+
+/* cifs_get_writable_file() flags */
+#define FIND_WR_ANY 0
+#define FIND_WR_FSUID_ONLY 1
+#define FIND_WR_WITH_DELETE 2
+
#define MID_FREE 0
#define MID_REQUEST_ALLOCATED 1
#define MID_REQUEST_SUBMITTED 2
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 89eaaf46d1ca..e5cb681ec138 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -134,11 +134,12 @@ extern bool backup_cred(struct cifs_sb_info *);
extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
-extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
+extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int);
extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
- bool fsuid_only,
+ int flags,
struct cifsFileInfo **ret_file);
extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
+ int flags,
struct cifsFileInfo **ret_file);
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 3c89569e7210..6f6fb3606a5d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1492,6 +1492,7 @@ openRetry:
*oplock = rsp->OplockLevel;
/* cifs fid stays in le */
oparms->fid->netfid = rsp->Fid;
+ oparms->fid->access = desired_access;
/* Let caller know file was created so we can set the mode. */
/* Do we care about the CreateAction in any other cases? */
@@ -2115,7 +2116,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
wdata2->tailsz = tailsz;
wdata2->bytes = cur_len;
- rc = cifs_get_writable_file(CIFS_I(inode), false,
+ rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY,
&wdata2->cfile);
if (!wdata2->cfile) {
cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a941ac7a659d..4804d1df8c1c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -4151,7 +4151,7 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_gid = pvolume_info->linux_gid;
cifs_sb->mnt_file_mode = pvolume_info->file_mode;
cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
- cifs_dbg(FYI, "file mode: 0x%hx dir mode: 0x%hx\n",
+ cifs_dbg(FYI, "file mode: %04ho dir mode: %04ho\n",
cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
cifs_sb->actimeo = pvolume_info->actimeo;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index bc9516ab4b34..3b942ecdd4be 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1958,7 +1958,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
/* Return -EBADF if no handle is found and general rc otherwise */
int
-cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
+cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags,
struct cifsFileInfo **ret_file)
{
struct cifsFileInfo *open_file, *inv_file = NULL;
@@ -1966,7 +1966,8 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
bool any_available = false;
int rc = -EBADF;
unsigned int refind = 0;
-
+ bool fsuid_only = flags & FIND_WR_FSUID_ONLY;
+ bool with_delete = flags & FIND_WR_WITH_DELETE;
*ret_file = NULL;
/*
@@ -1998,6 +1999,8 @@ refind_writable:
continue;
if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
continue;
+ if (with_delete && !(open_file->fid.access & DELETE))
+ continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
if (!open_file->invalidHandle) {
/* found a good writable file */
@@ -2045,12 +2048,12 @@ refind_writable:
}
struct cifsFileInfo *
-find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only)
+find_writable_file(struct cifsInodeInfo *cifs_inode, int flags)
{
struct cifsFileInfo *cfile;
int rc;
- rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile);
+ rc = cifs_get_writable_file(cifs_inode, flags, &cfile);
if (rc)
cifs_dbg(FYI, "couldn't find writable handle rc=%d", rc);
@@ -2059,6 +2062,7 @@ find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only)
int
cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
+ int flags,
struct cifsFileInfo **ret_file)
{
struct list_head *tmp;
@@ -2085,7 +2089,7 @@ cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
kfree(full_path);
cinode = CIFS_I(d_inode(cfile->dentry));
spin_unlock(&tcon->open_file_lock);
- return cifs_get_writable_file(cinode, 0, ret_file);
+ return cifs_get_writable_file(cinode, flags, ret_file);
}
spin_unlock(&tcon->open_file_lock);
@@ -2162,7 +2166,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
if (mapping->host->i_size - offset < (loff_t)to)
to = (unsigned)(mapping->host->i_size - offset);
- rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file);
+ rc = cifs_get_writable_file(CIFS_I(mapping->host), FIND_WR_ANY,
+ &open_file);
if (!rc) {
bytes_written = cifs_write(open_file, open_file->pid,
write_data, to - from, &offset);
@@ -2355,7 +2360,7 @@ retry:
if (cfile)
cifsFileInfo_put(cfile);
- rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile);
+ rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile);
/* in case of an error store it to return later */
if (rc)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9ba623b601ec..1e8a4b1579db 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -653,8 +653,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
*/
if ((fattr->cf_nlink < 1) && !tcon->unix_ext &&
!info->DeletePending) {
- cifs_dbg(1, "bogus file nlink value %u\n",
- fattr->cf_nlink);
+ cifs_dbg(VFS, "bogus file nlink value %u\n",
+ fattr->cf_nlink);
fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
}
}
@@ -1648,7 +1648,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
struct TCP_Server_Info *server;
char *full_path;
- cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n",
+ cifs_dbg(FYI, "In cifs_mkdir, mode = %04ho inode = 0x%p\n",
mode, inode);
cifs_sb = CIFS_SB(inode->i_sb);
@@ -2073,6 +2073,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
struct inode *inode = d_inode(dentry);
struct super_block *sb = dentry->d_sb;
char *full_path = NULL;
+ int count = 0;
if (inode == NULL)
return -ENOENT;
@@ -2094,15 +2095,18 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
full_path, inode, inode->i_count.counter,
dentry, cifs_get_time(dentry), jiffies);
+again:
if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
else
rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
xid, NULL);
-
+ if (rc == -EAGAIN && count++ < 10)
+ goto again;
out:
kfree(full_path);
free_xid(xid);
+
return rc;
}
@@ -2278,7 +2282,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
* writebehind data than the SMB timeout for the SetPathInfo
* request would allow
*/
- open_file = find_writable_file(cifsInode, true);
+ open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY);
if (open_file) {
tcon = tlink_tcon(open_file->tlink);
server = tcon->ses->server;
@@ -2428,7 +2432,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
args->ctime = NO_CHANGE_64;
args->device = 0;
- open_file = find_writable_file(cifsInode, true);
+ open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY);
if (open_file) {
u16 nfid = open_file->fid.netfid;
u32 npid = open_file->pid;
@@ -2531,7 +2535,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
rc = 0;
if (attrs->ia_valid & ATTR_MTIME) {
- rc = cifs_get_writable_file(cifsInode, false, &wfile);
+ rc = cifs_get_writable_file(cifsInode, FIND_WR_ANY, &wfile);
if (!rc) {
tcon = tlink_tcon(wfile->tlink);
rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index eb994e313c6a..b130efaf8feb 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -766,7 +766,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
struct cifs_tcon *tcon;
/* if the file is already open for write, just use that fileid */
- open_file = find_writable_file(cinode, true);
+ open_file = find_writable_file(cinode, FIND_WR_FSUID_ONLY);
if (open_file) {
fid.netfid = open_file->fid.netfid;
netpid = open_file->pid;
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1cf207564ff9..a8c301ae00ed 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -521,7 +521,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
cifs_i = CIFS_I(inode);
dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
data.Attributes = cpu_to_le32(dosattrs);
- cifs_get_writable_path(tcon, name, &cfile);
+ cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile);
tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, ACL_NO_MODE,
@@ -577,7 +577,7 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
{
struct cifsFileInfo *cfile;
- cifs_get_writable_path(tcon, from_name, &cfile);
+ cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile);
return smb2_set_path_attr(xid, tcon, from_name, to_name,
cifs_sb, DELETE, SMB2_OP_RENAME, cfile);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index baa825f4cec0..c31e84ee3c39 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1116,7 +1116,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
void *data[1];
struct smb2_file_full_ea_info *ea = NULL;
struct kvec close_iov[1];
- int rc;
+ struct smb2_query_info_rsp *rsp;
+ int rc, used_len = 0;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -1139,6 +1140,38 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
cifs_sb);
if (rc == -ENODATA)
goto sea_exit;
+ } else {
+ /* If we are adding an attribute we should first check
+ * if there will be enough space available to store
+ * the new EA. If not we should not add it since we
+ * would not be able to even read the EAs back.
+ */
+ rc = smb2_query_info_compound(xid, tcon, utf16_path,
+ FILE_READ_EA,
+ FILE_FULL_EA_INFORMATION,
+ SMB2_O_INFO_FILE,
+ CIFSMaxBufSize -
+ MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE,
+ &rsp_iov[1], &resp_buftype[1], cifs_sb);
+ if (rc == 0) {
+ rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ used_len = le32_to_cpu(rsp->OutputBufferLength);
+ }
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(&rsp_iov[1], 0, sizeof(rsp_iov[1]));
+ rc = 0;
+
+ /* Use a fudge factor of 256 bytes in case we collide
+ * with a different set_EAs command.
+ */
+ if(CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 <
+ used_len + ea_name_len + ea_value_len + 1) {
+ rc = -ENOSPC;
+ goto sea_exit;
+ }
}
}
@@ -1331,6 +1364,7 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
cfile->fid.persistent_fid = fid->persistent_fid;
cfile->fid.volatile_fid = fid->volatile_fid;
+ cfile->fid.access = fid->access;
#ifdef CONFIG_CIFS_DEBUG2
cfile->fid.mid = fid->mid;
#endif /* CIFS_DEBUG2 */
@@ -3294,7 +3328,7 @@ static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offs
* some servers (Windows2016) will not reflect recent writes in
* QUERY_ALLOCATED_RANGES until SMB2_flush is called.
*/
- wrcfile = find_writable_file(cifsi, false);
+ wrcfile = find_writable_file(cifsi, FIND_WR_ANY);
if (wrcfile) {
filemap_write_and_wait(inode->i_mapping);
smb2_flush_file(xid, tcon, &wrcfile->fid);
@@ -4795,6 +4829,7 @@ struct smb_version_operations smb21_operations = {
.wp_retry_size = smb2_wp_retry_size,
.dir_needs_close = smb2_dir_needs_close,
.enum_snapshots = smb3_enum_snapshots,
+ .notify = smb3_notify,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
#ifdef CONFIG_CIFS_XATTR
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 1234f9ccab03..28c0be5e69b7 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2771,6 +2771,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
atomic_inc(&tcon->num_remote_opens);
oparms->fid->persistent_fid = rsp->PersistentFileId;
oparms->fid->volatile_fid = rsp->VolatileFileId;
+ oparms->fid->access = oparms->desired_access;
#ifdef CONFIG_CIFS_DEBUG2
oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId);
#endif /* CIFS_DEBUG2 */
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 65cb09fa6ead..08c9f216a54d 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -539,6 +539,15 @@ int fscrypt_drop_inode(struct inode *inode)
mk = ci->ci_master_key->payload.data[0];
/*
+ * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes
+ * protected by the key were cleaned by sync_filesystem(). But if
+ * userspace is still using the files, inodes can be dirtied between
+ * then and now. We mustn't lose any writes, so skip dirty inodes here.
+ */
+ if (inode->i_state & I_DIRTY_ALL)
+ return 0;
+
+ /*
* Note: since we aren't holding ->mk_secret_sem, the result here can
* immediately become outdated. But there's no correctness problem with
* unnecessarily evicting. Nor is there a correctness problem with not
diff --git a/fs/dax.c b/fs/dax.c
index 1f1f0201cad1..35da144375a0 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -937,12 +937,11 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
* on persistent storage prior to completion of the operation.
*/
int dax_writeback_mapping_range(struct address_space *mapping,
- struct block_device *bdev, struct writeback_control *wbc)
+ struct dax_device *dax_dev, struct writeback_control *wbc)
{
XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
struct inode *inode = mapping->host;
pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
- struct dax_device *dax_dev;
void *entry;
int ret = 0;
unsigned int scanned = 0;
@@ -953,10 +952,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
return 0;
- dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
- if (!dax_dev)
- return -EIO;
-
trace_dax_writeback_range(inode, xas.xa_index, end_index);
tag_pages_for_writeback(mapping, xas.xa_index, end_index);
@@ -977,7 +972,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
xas_lock_irq(&xas);
}
xas_unlock_irq(&xas);
- put_dax(dax_dev);
trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
return ret;
}
@@ -1207,6 +1201,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
lockdep_assert_held(&inode->i_rwsem);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags |= IOMAP_NOWAIT;
+
while (iov_iter_count(iter)) {
ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
iter, dax_iomap_actor);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 634b09d18b77..db987b5110a9 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -1090,21 +1090,12 @@ static const struct file_operations fops_regset32 = {
* This function creates a file in debugfs with the given name that reports
* the names and values of a set of 32-bit registers. If the @mode variable
* is so set it can be read from. Writing is not supported.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
- struct dentry *parent,
- struct debugfs_regset32 *regset)
+void debugfs_create_regset32(const char *name, umode_t mode,
+ struct dentry *parent,
+ struct debugfs_regset32 *regset)
{
- return debugfs_create_file(name, mode, parent, regset, &fops_regset32);
+ debugfs_create_file(name, mode, parent, regset, &fops_regset32);
}
EXPORT_SYMBOL_GPL(debugfs_create_regset32);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index db1ef144c63a..2c449aed1b92 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -311,8 +311,10 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
struct extent_crypt_result ecr;
int rc = 0;
- BUG_ON(!crypt_stat || !crypt_stat->tfm
- || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
+ if (!crypt_stat || !crypt_stat->tfm
+ || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
+ return -EINVAL;
+
if (unlikely(ecryptfs_verbosity > 0)) {
ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
crypt_stat->key_size);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 1c1a56be7ea2..e6ac78c62ca4 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -8,7 +8,7 @@
* Copyright (C) 2004-2008 International Business Machines Corp.
* Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
* Trevor S. Highland <trevor.highland@gmail.com>
- * Tyler Hicks <tyhicks@ou.edu>
+ * Tyler Hicks <code@tyhicks.com>
*/
#ifndef ECRYPTFS_KERNEL_H
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 7d326aa0308e..af3eb02bbca1 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1304,7 +1304,7 @@ parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat,
printk(KERN_WARNING "Tag 1 packet contains key larger "
"than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
rc = -EINVAL;
- goto out;
+ goto out_free;
}
memcpy((*new_auth_tok)->session_key.encrypted_key,
&data[(*packet_size)], (body_size - (ECRYPTFS_SIG_SIZE + 2)));
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index b8a7ce379ffe..e63259fdef28 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -7,7 +7,7 @@
* Copyright (C) 2004-2007 International Business Machines Corp.
* Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
* Michael C. Thompson <mcthomps@us.ibm.com>
- * Tyler Hicks <tyhicks@ou.edu>
+ * Tyler Hicks <code@tyhicks.com>
*/
#include <linux/dcache.h>
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index d668e60b85b5..8646ba76def3 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -4,7 +4,7 @@
*
* Copyright (C) 2004-2008 International Business Machines Corp.
* Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
- * Tyler Hicks <tyhicks@ou.edu>
+ * Tyler Hicks <code@tyhicks.com>
*/
#include <linux/sched.h>
#include <linux/slab.h>
@@ -379,6 +379,7 @@ int __init ecryptfs_init_messaging(void)
* ecryptfs_message_buf_len),
GFP_KERNEL);
if (!ecryptfs_msg_ctx_arr) {
+ kfree(ecryptfs_daemon_hash);
rc = -ENOMEM;
goto out;
}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 119667e65890..c885cf7d724b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -960,8 +960,9 @@ ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
static int
ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
- return dax_writeback_mapping_range(mapping,
- mapping->host->i_sb->s_bdev, wbc);
+ struct ext2_sb_info *sbi = EXT2_SB(mapping->host->i_sb);
+
+ return dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
}
const struct address_space_operations ext2_aops = {
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 5f993a411251..8fd0b3cdab4c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -270,6 +270,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh_p;
if (block_group >= ngroups) {
ext4_error(sb, "block_group >= groups_count - block_group = %u,"
@@ -280,7 +281,14 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- if (!sbi->s_group_desc[group_desc]) {
+ bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc);
+ /*
+ * sbi_array_rcu_deref returns with rcu unlocked, this is ok since
+ * the pointer being dereferenced won't be dereferenced again. By
+ * looking at the usage in add_new_gdb() the value isn't modified,
+ * just the pointer, and so it remains valid.
+ */
+ if (!bh_p) {
ext4_error(sb, "Group descriptor not loaded - "
"block_group = %u, group_desc = %u, desc = %u",
block_group, group_desc, offset);
@@ -288,10 +296,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
}
desc = (struct ext4_group_desc *)(
- (__u8 *)sbi->s_group_desc[group_desc]->b_data +
+ (__u8 *)bh_p->b_data +
offset * EXT4_DESC_SIZE(sb));
if (bh)
- *bh = sbi->s_group_desc[group_desc];
+ *bh = bh_p;
return desc;
}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 1ee04e76bbe0..0a734ffb4310 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -207,6 +207,7 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
return PTR_ERR(inode);
num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
while (i < num) {
+ cond_resched();
map.m_lblk = i;
map.m_len = num - i;
n = ext4_map_blocks(NULL, inode, &map, 0);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 1f340743c9a8..9aa1f75409b0 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,12 +129,14 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
if (err != ERR_BAD_DX_DIR) {
return err;
}
- /*
- * We don't set the inode dirty flag since it's not
- * critical that it get flushed back to the disk.
- */
- ext4_clear_inode_flag(file_inode(file),
- EXT4_INODE_INDEX);
+ /* Can we just clear INDEX flag to ignore htree information? */
+ if (!ext4_has_metadata_csum(sb)) {
+ /*
+ * We don't set the inode dirty flag since it's not
+ * critical that it gets flushed back to the disk.
+ */
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
}
if (ext4_has_inline_data(inode)) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9a2ee2428ecc..61b37a052052 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1400,7 +1400,7 @@ struct ext4_sb_info {
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
- struct buffer_head **s_group_desc;
+ struct buffer_head * __rcu *s_group_desc;
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
unsigned int s_mount_flags;
@@ -1462,7 +1462,7 @@ struct ext4_sb_info {
#endif
/* for buddy allocator */
- struct ext4_group_info ***s_group_info;
+ struct ext4_group_info ** __rcu *s_group_info;
struct inode *s_buddy_cache;
spinlock_t s_md_lock;
unsigned short *s_mb_offsets;
@@ -1512,7 +1512,7 @@ struct ext4_sb_info {
unsigned int s_extent_max_zeroout_kb;
unsigned int s_log_groups_per_flex;
- struct flex_groups *s_flex_groups;
+ struct flex_groups * __rcu *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
/* workqueue for reserved extent conversions (buffered io) */
@@ -1552,8 +1552,11 @@ struct ext4_sb_info {
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
- /* Barrier between changing inodes' journal flags and writepages ops. */
- struct percpu_rw_semaphore s_journal_flag_rwsem;
+ /*
+ * Barrier between writepages ops and changing any inode's JOURNAL_DATA
+ * or EXTENTS flag.
+ */
+ struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
#ifdef CONFIG_EXT4_DEBUG
unsigned long s_simulate_fail;
@@ -1577,6 +1580,23 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
}
/*
+ * Returns: sbi->field[index]
+ * Used to access an array element from the following sbi fields which require
+ * rcu protection to avoid dereferencing an invalid pointer due to reassignment
+ * - s_group_desc
+ * - s_group_info
+ * - s_flex_group
+ */
+#define sbi_array_rcu_deref(sbi, field, index) \
+({ \
+ typeof(*((sbi)->field)) _v; \
+ rcu_read_lock(); \
+ _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \
+ rcu_read_unlock(); \
+ _v; \
+})
+
+/*
* Simulate_fail codes
*/
#define EXT4_SIM_BBITMAP_EIO 1
@@ -2544,8 +2564,11 @@ void ext4_insert_dentry(struct inode *inode,
struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
- if (!ext4_has_feature_dir_index(inode->i_sb))
+ if (!ext4_has_feature_dir_index(inode->i_sb)) {
+ /* ext4_iget() should have caught this... */
+ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
}
static const unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
@@ -2727,6 +2750,7 @@ extern int ext4_generic_delete_entry(handle_t *handle,
extern bool ext4_empty_dir(struct inode *inode);
/* resize.c */
+extern void ext4_kvfree_array_rcu(void *to_free);
extern int ext4_group_add(struct super_block *sb,
struct ext4_new_group_data *input);
extern int ext4_group_extend(struct super_block *sb,
@@ -2973,13 +2997,13 @@ static inline
struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
ext4_group_t group)
{
- struct ext4_group_info ***grp_info;
+ struct ext4_group_info **grp_info;
long indexv, indexh;
BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
- grp_info = EXT4_SB(sb)->s_group_info;
indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
- return grp_info[indexv][indexh];
+ grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+ return grp_info[indexh];
}
/*
@@ -3029,7 +3053,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
!inode_is_locked(inode));
down_write(&EXT4_I(inode)->i_data_sem);
if (newsize > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = newsize;
+ WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
up_write(&EXT4_I(inode)->i_data_sem);
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c66e8f9451a2..f95ee99091e4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -328,11 +328,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
percpu_counter_inc(&sbi->s_freeinodes_counter);
if (sbi->s_log_groups_per_flex) {
- ext4_group_t f = ext4_flex_group(sbi, block_group);
+ struct flex_groups *fg;
- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ fg = sbi_array_rcu_deref(sbi, s_flex_groups,
+ ext4_flex_group(sbi, block_group));
+ atomic_inc(&fg->free_inodes);
if (is_directory)
- atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+ atomic_dec(&fg->used_dirs);
}
BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
@@ -368,12 +370,13 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
int flex_size, struct orlov_stats *stats)
{
struct ext4_group_desc *desc;
- struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
if (flex_size > 1) {
- stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
- stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
- stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+ struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb),
+ s_flex_groups, g);
+ stats->free_inodes = atomic_read(&fg->free_inodes);
+ stats->free_clusters = atomic64_read(&fg->free_clusters);
+ stats->used_dirs = atomic_read(&fg->used_dirs);
return;
}
@@ -1054,7 +1057,8 @@ got:
if (sbi->s_log_groups_per_flex) {
ext4_group_t f = ext4_flex_group(sbi, group);
- atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+ atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
+ f)->used_dirs);
}
}
if (ext4_has_group_desc_csum(sb)) {
@@ -1077,7 +1081,8 @@ got:
if (sbi->s_log_groups_per_flex) {
flex_group = ext4_flex_group(sbi, group);
- atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
+ atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_inodes);
}
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3313168b680f..fa0ff78dc033 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2465,7 +2465,7 @@ update_disksize:
* truncate are avoided by checking i_size under i_data_sem.
*/
disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
- if (disksize > EXT4_I(inode)->i_disksize) {
+ if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
int err2;
loff_t i_size;
@@ -2628,7 +2628,7 @@ static int ext4_writepages(struct address_space *mapping,
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- percpu_down_read(&sbi->s_journal_flag_rwsem);
+ percpu_down_read(&sbi->s_writepages_rwsem);
trace_ext4_writepages(inode, wbc);
/*
@@ -2849,7 +2849,7 @@ unplug:
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_journal_flag_rwsem);
+ percpu_up_read(&sbi->s_writepages_rwsem);
return ret;
}
@@ -2864,13 +2864,13 @@ static int ext4_dax_writepages(struct address_space *mapping,
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- percpu_down_read(&sbi->s_journal_flag_rwsem);
+ percpu_down_read(&sbi->s_writepages_rwsem);
trace_ext4_writepages(inode, wbc);
- ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
+ ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_journal_flag_rwsem);
+ percpu_up_read(&sbi->s_writepages_rwsem);
return ret;
}
@@ -4644,6 +4644,18 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
+ /*
+ * If dir_index is not enabled but there's dir with INDEX flag set,
+ * we'd normally treat htree data as empty space. But with metadata
+ * checksumming that corrupts checksums so forbid that.
+ */
+ if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
+ ext4_error_inode(inode, function, line, 0,
+ "iget: Dir with htree data on filesystem without dir_index feature.");
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
ei->i_disksize = inode->i_size;
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
@@ -5849,7 +5861,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
}
}
- percpu_down_write(&sbi->s_journal_flag_rwsem);
+ percpu_down_write(&sbi->s_writepages_rwsem);
jbd2_journal_lock_updates(journal);
/*
@@ -5866,7 +5878,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
err = jbd2_journal_flush(journal);
if (err < 0) {
jbd2_journal_unlock_updates(journal);
- percpu_up_write(&sbi->s_journal_flag_rwsem);
+ percpu_up_write(&sbi->s_writepages_rwsem);
return err;
}
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
@@ -5874,7 +5886,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
- percpu_up_write(&sbi->s_journal_flag_rwsem);
+ percpu_up_write(&sbi->s_writepages_rwsem);
if (val)
up_write(&EXT4_I(inode)->i_mmap_sem);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f64838187559..51a78eb65f3c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2356,7 +2356,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned size;
- struct ext4_group_info ***new_groupinfo;
+ struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
EXT4_DESC_PER_BLOCK_BITS(sb);
@@ -2369,13 +2369,16 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
return -ENOMEM;
}
- if (sbi->s_group_info) {
- memcpy(new_groupinfo, sbi->s_group_info,
+ rcu_read_lock();
+ old_groupinfo = rcu_dereference(sbi->s_group_info);
+ if (old_groupinfo)
+ memcpy(new_groupinfo, old_groupinfo,
sbi->s_group_info_size * sizeof(*sbi->s_group_info));
- kvfree(sbi->s_group_info);
- }
- sbi->s_group_info = new_groupinfo;
+ rcu_read_unlock();
+ rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+ if (old_groupinfo)
+ ext4_kvfree_array_rcu(old_groupinfo);
ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
sbi->s_group_info_size);
return 0;
@@ -2387,6 +2390,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
{
int i;
int metalen = 0;
+ int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_info **meta_group_info;
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
@@ -2405,12 +2409,12 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
"for a buddy group");
goto exit_meta_group_info;
}
- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
- meta_group_info;
+ rcu_read_lock();
+ rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
+ rcu_read_unlock();
}
- meta_group_info =
- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+ meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
@@ -2458,8 +2462,13 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
exit_group_info:
/* If a meta_group_info table has been allocated, release it now */
if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
- kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+ struct ext4_group_info ***group_info;
+
+ rcu_read_lock();
+ group_info = rcu_dereference(sbi->s_group_info);
+ kfree(group_info[idx]);
+ group_info[idx] = NULL;
+ rcu_read_unlock();
}
exit_meta_group_info:
return -ENOMEM;
@@ -2472,6 +2481,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err;
struct ext4_group_desc *desc;
+ struct ext4_group_info ***group_info;
struct kmem_cache *cachep;
err = ext4_mb_alloc_groupinfo(sb, ngroups);
@@ -2507,11 +2517,16 @@ err_freebuddy:
while (i-- > 0)
kmem_cache_free(cachep, ext4_get_group_info(sb, i));
i = sbi->s_group_info_size;
+ rcu_read_lock();
+ group_info = rcu_dereference(sbi->s_group_info);
while (i-- > 0)
- kfree(sbi->s_group_info[i]);
+ kfree(group_info[i]);
+ rcu_read_unlock();
iput(sbi->s_buddy_cache);
err_freesgi:
- kvfree(sbi->s_group_info);
+ rcu_read_lock();
+ kvfree(rcu_dereference(sbi->s_group_info));
+ rcu_read_unlock();
return -ENOMEM;
}
@@ -2700,7 +2715,7 @@ int ext4_mb_release(struct super_block *sb)
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
int num_meta_group_infos;
- struct ext4_group_info *grinfo;
+ struct ext4_group_info *grinfo, ***group_info;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
@@ -2719,9 +2734,12 @@ int ext4_mb_release(struct super_block *sb)
num_meta_group_infos = (ngroups +
EXT4_DESC_PER_BLOCK(sb) - 1) >>
EXT4_DESC_PER_BLOCK_BITS(sb);
+ rcu_read_lock();
+ group_info = rcu_dereference(sbi->s_group_info);
for (i = 0; i < num_meta_group_infos; i++)
- kfree(sbi->s_group_info[i]);
- kvfree(sbi->s_group_info);
+ kfree(group_info[i]);
+ kvfree(group_info);
+ rcu_read_unlock();
}
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
@@ -3020,7 +3038,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
atomic64_sub(ac->ac_b_ex.fe_len,
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_clusters);
}
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4918,7 +4937,8 @@ do_more:
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
atomic64_add(count_clusters,
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_clusters);
}
/*
@@ -5075,7 +5095,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
atomic64_add(clusters_freed,
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_clusters);
}
ext4_mb_unload_buddy(&e4b);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 89725fa42573..fb6520f37135 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -407,6 +407,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
int ext4_ext_migrate(struct inode *inode)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
int retval = 0, i;
__le32 *i_data;
@@ -431,6 +432,8 @@ int ext4_ext_migrate(struct inode *inode)
*/
return retval;
+ percpu_down_write(&sbi->s_writepages_rwsem);
+
/*
* Worst case we can touch the allocation bitmaps, a bgd
* block, and a block to link in the orphan list. We do need
@@ -441,7 +444,7 @@ int ext4_ext_migrate(struct inode *inode)
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
- return retval;
+ goto out_unlock;
}
goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
@@ -452,7 +455,7 @@ int ext4_ext_migrate(struct inode *inode)
if (IS_ERR(tmp_inode)) {
retval = PTR_ERR(tmp_inode);
ext4_journal_stop(handle);
- return retval;
+ goto out_unlock;
}
i_size_write(tmp_inode, i_size_read(inode));
/*
@@ -494,7 +497,7 @@ int ext4_ext_migrate(struct inode *inode)
*/
ext4_orphan_del(NULL, tmp_inode);
retval = PTR_ERR(handle);
- goto out;
+ goto out_tmp_inode;
}
ei = EXT4_I(inode);
@@ -576,10 +579,11 @@ err_out:
ext4_ext_tree_init(handle, tmp_inode);
out_stop:
ext4_journal_stop(handle);
-out:
+out_tmp_inode:
unlock_new_inode(tmp_inode);
iput(tmp_inode);
-
+out_unlock:
+ percpu_up_write(&sbi->s_writepages_rwsem);
return retval;
}
@@ -589,7 +593,8 @@ out:
int ext4_ind_migrate(struct inode *inode)
{
struct ext4_extent_header *eh;
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_super_block *es = sbi->s_es;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_extent *ex;
unsigned int i, len;
@@ -613,9 +618,13 @@ int ext4_ind_migrate(struct inode *inode)
if (test_opt(inode->i_sb, DELALLOC))
ext4_alloc_da_blocks(inode);
+ percpu_down_write(&sbi->s_writepages_rwsem);
+
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_unlock;
+ }
down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_ext_check_inode(inode);
@@ -650,5 +659,7 @@ int ext4_ind_migrate(struct inode *inode)
errout:
ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem);
+out_unlock:
+ percpu_up_write(&sbi->s_writepages_rwsem);
return ret;
}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 1c44b1a32001..87f7551c5132 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -120,10 +120,10 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
{
__ext4_warning(sb, function, line, "%s", msg);
__ext4_warning(sb, function, line,
- "MMP failure info: last update time: %llu, last update "
- "node: %s, last update device: %s",
- (long long unsigned int) le64_to_cpu(mmp->mmp_time),
- mmp->mmp_nodename, mmp->mmp_bdevname);
+ "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s",
+ (unsigned long long)le64_to_cpu(mmp->mmp_time),
+ (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename,
+ (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname);
}
/*
@@ -154,6 +154,7 @@ static int kmmpd(void *data)
mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+ BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
bdevname(bh->b_bdev, mmp->mmp_bdevname);
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
@@ -379,7 +380,8 @@ skip:
/*
* Start a kernel thread to update the MMP block periodically.
*/
- EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s",
+ (int)sizeof(mmp->mmp_bdevname),
bdevname(bh->b_bdev,
mmp->mmp_bdevname));
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 129d2ebae00d..b05ea72f38fd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1511,6 +1511,7 @@ restart:
/*
* We deal with the read-ahead logic here.
*/
+ cond_resched();
if (ra_ptr >= ra_max) {
/* Refill the readahead buffer */
ra_ptr = 0;
@@ -2213,6 +2214,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
retval = ext4_dx_add_entry(handle, &fname, dir, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
+ /* Can we just ignore htree data? */
+ if (ext4_has_metadata_csum(sb)) {
+ EXT4_ERROR_INODE(dir,
+ "Directory has corrupted htree index.");
+ retval = -EFSCORRUPTED;
+ goto out;
+ }
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 86a2500ed292..a50b51270ea9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -17,6 +17,33 @@
#include "ext4_jbd2.h"
+struct ext4_rcu_ptr {
+ struct rcu_head rcu;
+ void *ptr;
+};
+
+static void ext4_rcu_ptr_callback(struct rcu_head *head)
+{
+ struct ext4_rcu_ptr *ptr;
+
+ ptr = container_of(head, struct ext4_rcu_ptr, rcu);
+ kvfree(ptr->ptr);
+ kfree(ptr);
+}
+
+void ext4_kvfree_array_rcu(void *to_free)
+{
+ struct ext4_rcu_ptr *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+
+ if (ptr) {
+ ptr->ptr = to_free;
+ call_rcu(&ptr->rcu, ext4_rcu_ptr_callback);
+ return;
+ }
+ synchronize_rcu();
+ kvfree(to_free);
+}
+
int ext4_resize_begin(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -542,8 +569,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
brelse(gdb);
goto out;
}
- memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
- gdb->b_size);
+ memcpy(gdb->b_data, sbi_array_rcu_deref(sbi,
+ s_group_desc, j)->b_data, gdb->b_size);
set_buffer_uptodate(gdb);
err = ext4_handle_dirty_metadata(handle, NULL, gdb);
@@ -860,13 +887,15 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
}
brelse(dind);
- o_group_desc = EXT4_SB(sb)->s_group_desc;
+ rcu_read_lock();
+ o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+ rcu_read_unlock();
n_group_desc[gdb_num] = gdb_bh;
- EXT4_SB(sb)->s_group_desc = n_group_desc;
+ rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
EXT4_SB(sb)->s_gdb_count++;
- kvfree(o_group_desc);
+ ext4_kvfree_array_rcu(o_group_desc);
le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
err = ext4_handle_dirty_super(handle, sb);
@@ -909,9 +938,11 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
return err;
}
- o_group_desc = EXT4_SB(sb)->s_group_desc;
+ rcu_read_lock();
+ o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+ rcu_read_unlock();
n_group_desc[gdb_num] = gdb_bh;
BUFFER_TRACE(gdb_bh, "get_write_access");
@@ -922,9 +953,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
return err;
}
- EXT4_SB(sb)->s_group_desc = n_group_desc;
+ rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
EXT4_SB(sb)->s_gdb_count++;
- kvfree(o_group_desc);
+ ext4_kvfree_array_rcu(o_group_desc);
return err;
}
@@ -1188,7 +1219,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
* use non-sparse filesystems anymore. This is already checked above.
*/
if (gdb_off) {
- gdb_bh = sbi->s_group_desc[gdb_num];
+ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+ gdb_num);
BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
@@ -1270,7 +1302,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
/*
* get_write_access() has been called on gdb_bh by ext4_add_new_desc().
*/
- gdb_bh = sbi->s_group_desc[gdb_num];
+ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num);
/* Update group descriptor block for new group */
gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
gdb_off * EXT4_DESC_SIZE(sb));
@@ -1398,11 +1430,14 @@ static void ext4_update_super(struct super_block *sb,
percpu_counter_read(&sbi->s_freeclusters_counter));
if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
+ struct flex_groups *fg;
+
flex_group = ext4_flex_group(sbi, group_data[0].group);
+ fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &fg->free_clusters);
atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
- &sbi->s_flex_groups[flex_group].free_inodes);
+ &fg->free_inodes);
}
/*
@@ -1497,7 +1532,8 @@ exit_journal:
for (; gdb_num <= gdb_num_end; gdb_num++) {
struct buffer_head *gdb_bh;
- gdb_bh = sbi->s_group_desc[gdb_num];
+ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+ gdb_num);
if (old_gdb == gdb_bh->b_blocknr)
continue;
update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8434217549b3..0c7c4adb664e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1014,6 +1014,8 @@ static void ext4_put_super(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
+ struct buffer_head **group_desc;
+ struct flex_groups **flex_groups;
int aborted = 0;
int i, err;
@@ -1046,15 +1048,23 @@ static void ext4_put_super(struct super_block *sb)
if (!sb_rdonly(sb))
ext4_commit_super(sb, 1);
+ rcu_read_lock();
+ group_desc = rcu_dereference(sbi->s_group_desc);
for (i = 0; i < sbi->s_gdb_count; i++)
- brelse(sbi->s_group_desc[i]);
- kvfree(sbi->s_group_desc);
- kvfree(sbi->s_flex_groups);
+ brelse(group_desc[i]);
+ kvfree(group_desc);
+ flex_groups = rcu_dereference(sbi->s_flex_groups);
+ if (flex_groups) {
+ for (i = 0; i < sbi->s_flex_groups_allocated; i++)
+ kvfree(flex_groups[i]);
+ kvfree(flex_groups);
+ }
+ rcu_read_unlock();
percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
- percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
+ percpu_free_rwsem(&sbi->s_writepages_rwsem);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
@@ -2380,8 +2390,8 @@ done:
int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct flex_groups *new_groups;
- int size;
+ struct flex_groups **old_groups, **new_groups;
+ int size, i, j;
if (!sbi->s_log_groups_per_flex)
return 0;
@@ -2390,22 +2400,37 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
if (size <= sbi->s_flex_groups_allocated)
return 0;
- size = roundup_pow_of_two(size * sizeof(struct flex_groups));
- new_groups = kvzalloc(size, GFP_KERNEL);
+ new_groups = kvzalloc(roundup_pow_of_two(size *
+ sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
if (!new_groups) {
- ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
- size / (int) sizeof(struct flex_groups));
+ ext4_msg(sb, KERN_ERR,
+ "not enough memory for %d flex group pointers", size);
return -ENOMEM;
}
-
- if (sbi->s_flex_groups) {
- memcpy(new_groups, sbi->s_flex_groups,
- (sbi->s_flex_groups_allocated *
- sizeof(struct flex_groups)));
- kvfree(sbi->s_flex_groups);
+ for (i = sbi->s_flex_groups_allocated; i < size; i++) {
+ new_groups[i] = kvzalloc(roundup_pow_of_two(
+ sizeof(struct flex_groups)),
+ GFP_KERNEL);
+ if (!new_groups[i]) {
+ for (j = sbi->s_flex_groups_allocated; j < i; j++)
+ kvfree(new_groups[j]);
+ kvfree(new_groups);
+ ext4_msg(sb, KERN_ERR,
+ "not enough memory for %d flex groups", size);
+ return -ENOMEM;
+ }
}
- sbi->s_flex_groups = new_groups;
- sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
+ rcu_read_lock();
+ old_groups = rcu_dereference(sbi->s_flex_groups);
+ if (old_groups)
+ memcpy(new_groups, old_groups,
+ (sbi->s_flex_groups_allocated *
+ sizeof(struct flex_groups *)));
+ rcu_read_unlock();
+ rcu_assign_pointer(sbi->s_flex_groups, new_groups);
+ sbi->s_flex_groups_allocated = size;
+ if (old_groups)
+ ext4_kvfree_array_rcu(old_groups);
return 0;
}
@@ -2413,6 +2438,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
+ struct flex_groups *fg;
ext4_group_t flex_group;
int i, err;
@@ -2430,12 +2456,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
flex_group = ext4_flex_group(sbi, i);
- atomic_add(ext4_free_inodes_count(sb, gdp),
- &sbi->s_flex_groups[flex_group].free_inodes);
+ fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
+ atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
atomic64_add(ext4_free_group_clusters(sb, gdp),
- &sbi->s_flex_groups[flex_group].free_clusters);
- atomic_add(ext4_used_dirs_count(sb, gdp),
- &sbi->s_flex_groups[flex_group].used_dirs);
+ &fg->free_clusters);
+ atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs);
}
return 1;
@@ -3009,17 +3034,11 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
-#ifndef CONFIG_QUOTA
- if (ext4_has_feature_quota(sb) && !readonly) {
+#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
+ if (!readonly && (ext4_has_feature_quota(sb) ||
+ ext4_has_feature_project(sb))) {
ext4_msg(sb, KERN_ERR,
- "Filesystem with quota feature cannot be mounted RDWR "
- "without CONFIG_QUOTA");
- return 0;
- }
- if (ext4_has_feature_project(sb) && !readonly) {
- ext4_msg(sb, KERN_ERR,
- "Filesystem with project quota feature cannot be mounted RDWR "
- "without CONFIG_QUOTA");
+ "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
return 0;
}
#endif /* CONFIG_QUOTA */
@@ -3640,9 +3659,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
char *orig_data = kstrdup(data, GFP_KERNEL);
- struct buffer_head *bh;
+ struct buffer_head *bh, **group_desc;
struct ext4_super_block *es = NULL;
struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ struct flex_groups **flex_groups;
ext4_fsblk_t block;
ext4_fsblk_t sb_block = get_sb_block(&data);
ext4_fsblk_t logical_sb_block;
@@ -3814,6 +3834,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
*/
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+ if (blocksize < EXT4_MIN_BLOCK_SIZE ||
+ blocksize > EXT4_MAX_BLOCK_SIZE) {
+ ext4_msg(sb, KERN_ERR,
+ "Unsupported filesystem blocksize %d (%d log_block_size)",
+ blocksize, le32_to_cpu(es->s_log_block_size));
+ goto failed_mount;
+ }
+
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
@@ -3831,6 +3860,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_msg(sb, KERN_ERR,
"unsupported inode size: %d",
sbi->s_inode_size);
+ ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
goto failed_mount;
}
/*
@@ -4033,14 +4063,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
goto failed_mount;
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
- if (blocksize < EXT4_MIN_BLOCK_SIZE ||
- blocksize > EXT4_MAX_BLOCK_SIZE) {
- ext4_msg(sb, KERN_ERR,
- "Unsupported filesystem blocksize %d (%d log_block_size)",
- blocksize, le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
- }
if (le32_to_cpu(es->s_log_block_size) >
(EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
ext4_msg(sb, KERN_ERR,
@@ -4294,9 +4316,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
}
- sbi->s_group_desc = kvmalloc_array(db_count,
- sizeof(struct buffer_head *),
- GFP_KERNEL);
+ rcu_assign_pointer(sbi->s_group_desc,
+ kvmalloc_array(db_count,
+ sizeof(struct buffer_head *),
+ GFP_KERNEL));
if (sbi->s_group_desc == NULL) {
ext4_msg(sb, KERN_ERR, "not enough memory");
ret = -ENOMEM;
@@ -4312,14 +4335,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
for (i = 0; i < db_count; i++) {
+ struct buffer_head *bh;
+
block = descriptor_loc(sb, logical_sb_block, i);
- sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
- if (!sbi->s_group_desc[i]) {
+ bh = sb_bread_unmovable(sb, block);
+ if (!bh) {
ext4_msg(sb, KERN_ERR,
"can't read group descriptor %d", i);
db_count = i;
goto failed_mount2;
}
+ rcu_read_lock();
+ rcu_dereference(sbi->s_group_desc)[i] = bh;
+ rcu_read_unlock();
}
sbi->s_gdb_count = db_count;
if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
@@ -4598,7 +4626,7 @@ no_journal:
err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
GFP_KERNEL);
if (!err)
- err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+ err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -4686,13 +4714,19 @@ failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
ext4_mb_release(sb);
- if (sbi->s_flex_groups)
- kvfree(sbi->s_flex_groups);
+ rcu_read_lock();
+ flex_groups = rcu_dereference(sbi->s_flex_groups);
+ if (flex_groups) {
+ for (i = 0; i < sbi->s_flex_groups_allocated; i++)
+ kvfree(flex_groups[i]);
+ kvfree(flex_groups);
+ }
+ rcu_read_unlock();
percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
- percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
+ percpu_free_rwsem(&sbi->s_writepages_rwsem);
failed_mount5:
ext4_ext_release(sb);
ext4_release_system_zone(sb);
@@ -4721,9 +4755,12 @@ failed_mount3:
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
+ rcu_read_lock();
+ group_desc = rcu_dereference(sbi->s_group_desc);
for (i = 0; i < db_count; i++)
- brelse(sbi->s_group_desc[i]);
- kvfree(sbi->s_group_desc);
+ brelse(group_desc[i]);
+ kvfree(group_desc);
+ rcu_read_unlock();
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
@@ -5585,10 +5622,7 @@ static int ext4_statfs_project(struct super_block *sb,
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = 0;
- if (dquot->dq_dqb.dqb_bsoftlimit &&
- (!limit || dquot->dq_dqb.dqb_bsoftlimit < limit))
- limit = dquot->dq_dqb.dqb_bsoftlimit;
+ limit = dquot->dq_dqb.dqb_bsoftlimit;
if (dquot->dq_dqb.dqb_bhardlimit &&
(!limit || dquot->dq_dqb.dqb_bhardlimit < limit))
limit = dquot->dq_dqb.dqb_bhardlimit;
@@ -5603,10 +5637,7 @@ static int ext4_statfs_project(struct super_block *sb,
(buf->f_blocks - curblock) : 0;
}
- limit = 0;
- if (dquot->dq_dqb.dqb_isoftlimit &&
- (!limit || dquot->dq_dqb.dqb_isoftlimit < limit))
- limit = dquot->dq_dqb.dqb_isoftlimit;
+ limit = dquot->dq_dqb.dqb_isoftlimit;
if (dquot->dq_dqb.dqb_ihardlimit &&
(!limit || dquot->dq_dqb.dqb_ihardlimit < limit))
limit = dquot->dq_dqb.dqb_ihardlimit;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 594b05ae16c9..71946da84388 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -750,6 +750,13 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
return NULL;
init_rwsem(&ei->truncate_lock);
+	/* Zeroing so that iput() works even on a partially initialized inode. */
+ ei->mmu_private = 0;
+ ei->i_start = 0;
+ ei->i_logstart = 0;
+ ei->i_attrs = 0;
+ ei->i_pos = 0;
+
return &ei->vfs_inode;
}
@@ -1374,16 +1381,6 @@ out:
return 0;
}
-static void fat_dummy_inode_init(struct inode *inode)
-{
- /* Initialize this dummy inode to work as no-op. */
- MSDOS_I(inode)->mmu_private = 0;
- MSDOS_I(inode)->i_start = 0;
- MSDOS_I(inode)->i_logstart = 0;
- MSDOS_I(inode)->i_attrs = 0;
- MSDOS_I(inode)->i_pos = 0;
-}
-
static int fat_read_root(struct inode *inode)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
@@ -1844,13 +1841,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
fat_inode = new_inode(sb);
if (!fat_inode)
goto out_fail;
- fat_dummy_inode_init(fat_inode);
sbi->fat_inode = fat_inode;
fsinfo_inode = new_inode(sb);
if (!fsinfo_inode)
goto out_fail;
- fat_dummy_inode_init(fsinfo_inode);
fsinfo_inode->i_ino = MSDOS_FSINFO_INO;
sbi->fsinfo_inode = fsinfo_inode;
insert_inode_hash(fsinfo_inode);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9bc167562ee8..2e4c0fa2074b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -735,8 +735,9 @@ static void send_sigio_to_task(struct task_struct *p,
return;
switch (signum) {
- kernel_siginfo_t si;
- default:
+ default: {
+ kernel_siginfo_t si;
+
/* Queue a rt signal with the appropriate fd as its
value. We use SI_SIGIO as the source, not
SI_KERNEL, since kernel signals always get
@@ -769,6 +770,7 @@ static void send_sigio_to_task(struct task_struct *p,
si.si_fd = fd;
if (!do_send_sig_info(signum, &si, p, type))
break;
+ }
/* fall-through - fall back on the old plain SIGIO signal */
case 0:
do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index cb60a42b9fdf..5cef075c0b37 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/rculist_nulls.h>
+#include <linux/fs_struct.h>
#include "io-wq.h"
@@ -59,6 +60,7 @@ struct io_worker {
const struct cred *cur_creds;
const struct cred *saved_creds;
struct files_struct *restore_files;
+ struct fs_struct *restore_fs;
};
#if BITS_PER_LONG == 64
@@ -151,6 +153,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
task_unlock(current);
}
+ if (current->fs != worker->restore_fs)
+ current->fs = worker->restore_fs;
+
/*
* If we have an active mm, we need to drop the wq lock before unusing
* it. If we do, return true and let the caller retry the idle loop.
@@ -311,6 +316,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
worker->restore_files = current->files;
+ worker->restore_fs = current->fs;
io_wqe_inc_running(wqe, worker);
}
@@ -481,6 +487,8 @@ next:
current->files = work->files;
task_unlock(current);
}
+ if (work->fs && current->fs != work->fs)
+ current->fs = work->fs;
if (work->mm != worker->mm)
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
@@ -494,7 +502,7 @@ next:
if (worker->mm)
work->flags |= IO_WQ_WORK_HAS_MM;
- if (wq->get_work && !(work->flags & IO_WQ_WORK_INTERNAL)) {
+ if (wq->get_work) {
put_work = work;
wq->get_work(work);
}
@@ -527,42 +535,23 @@ next:
} while (1);
}
-static inline void io_worker_spin_for_work(struct io_wqe *wqe)
-{
- int i = 0;
-
- while (++i < 1000) {
- if (io_wqe_run_queue(wqe))
- break;
- if (need_resched())
- break;
- cpu_relax();
- }
-}
-
static int io_wqe_worker(void *data)
{
struct io_worker *worker = data;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
- bool did_work;
io_worker_start(wqe, worker);
- did_work = false;
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
set_current_state(TASK_INTERRUPTIBLE);
loop:
- if (did_work)
- io_worker_spin_for_work(wqe);
spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
__set_current_state(TASK_RUNNING);
io_worker_handle_work(worker);
- did_work = true;
goto loop;
}
- did_work = false;
/* drops the lock on success, retry */
if (__io_worker_idle(wqe, worker)) {
__release(&wqe->lock);
@@ -691,11 +680,16 @@ static int io_wq_manager(void *data)
/* create fixed workers */
refcount_set(&wq->refs, workers_to_create);
for_each_node(node) {
+ if (!node_online(node))
+ continue;
if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
goto err;
workers_to_create--;
}
+ while (workers_to_create--)
+ refcount_dec(&wq->refs);
+
complete(&wq->done);
while (!kthread_should_stop()) {
@@ -703,6 +697,9 @@ static int io_wq_manager(void *data)
struct io_wqe *wqe = wq->wqes[node];
bool fork_worker[2] = { false, false };
+ if (!node_online(node))
+ continue;
+
spin_lock_irq(&wqe->lock);
if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
fork_worker[IO_WQ_ACCT_BOUND] = true;
@@ -750,6 +747,17 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
return true;
}
+static void io_run_cancel(struct io_wq_work *work)
+{
+ do {
+ struct io_wq_work *old_work = work;
+
+ work->flags |= IO_WQ_WORK_CANCEL;
+ work->func(&work);
+ work = (work == old_work) ? NULL : work;
+ } while (work);
+}
+
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
@@ -763,8 +771,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
* It's close enough to not be an issue, fork() has the same delay.
*/
if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
- work->flags |= IO_WQ_WORK_CANCEL;
- work->func(&work);
+ io_run_cancel(work);
return;
}
@@ -821,7 +828,9 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
if (io_worker_get(worker)) {
- ret = func(worker, data);
+ /* no task if node is/was offline */
+ if (worker->task)
+ ret = func(worker, data);
io_worker_release(worker);
if (ret)
break;
@@ -901,8 +910,7 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
- work->flags |= IO_WQ_WORK_CANCEL;
- work->func(&work);
+ io_run_cancel(work);
return IO_WQ_CANCEL_OK;
}
@@ -929,17 +937,19 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
return ret;
}
+struct work_match {
+ bool (*fn)(struct io_wq_work *, void *data);
+ void *data;
+};
+
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
- struct io_wq_work *work = data;
+ struct work_match *match = data;
unsigned long flags;
bool ret = false;
- if (worker->cur_work != work)
- return false;
-
spin_lock_irqsave(&worker->lock, flags);
- if (worker->cur_work == work &&
+ if (match->fn(worker->cur_work, match->data) &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
@@ -950,15 +960,13 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
}
static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
- struct io_wq_work *cwork)
+ struct work_match *match)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
bool found = false;
- cwork->flags |= IO_WQ_WORK_CANCEL;
-
/*
* First check pending list, if we're lucky we can just remove it
* from there. CANCEL_OK means that the work is returned as-new,
@@ -968,7 +976,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
- if (work == cwork) {
+ if (match->fn(work, match->data)) {
wq_node_del(&wqe->work_list, node, prev);
found = true;
break;
@@ -977,8 +985,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
- work->flags |= IO_WQ_WORK_CANCEL;
- work->func(&work);
+ io_run_cancel(work);
return IO_WQ_CANCEL_OK;
}
@@ -989,20 +996,31 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
* completion will run normally in this case.
*/
rcu_read_lock();
- found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, cwork);
+ found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
rcu_read_unlock();
return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
+static bool io_wq_work_match(struct io_wq_work *work, void *data)
+{
+ return work == data;
+}
+
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{
+ struct work_match match = {
+ .fn = io_wq_work_match,
+ .data = cwork
+ };
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
+ cwork->flags |= IO_WQ_WORK_CANCEL;
+
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
- ret = io_wqe_cancel_work(wqe, cwork);
+ ret = io_wqe_cancel_work(wqe, &match);
if (ret != IO_WQ_CANCEL_NOTFOUND)
break;
}
@@ -1010,38 +1028,33 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
return ret;
}
-struct io_wq_flush_data {
- struct io_wq_work work;
- struct completion done;
-};
-
-static void io_wq_flush_func(struct io_wq_work **workptr)
+static bool io_wq_pid_match(struct io_wq_work *work, void *data)
{
- struct io_wq_work *work = *workptr;
- struct io_wq_flush_data *data;
+ pid_t pid = (pid_t) (unsigned long) data;
- data = container_of(work, struct io_wq_flush_data, work);
- complete(&data->done);
+ if (work)
+ return work->task_pid == pid;
+ return false;
}
-/*
- * Doesn't wait for previously queued work to finish. When this completes,
- * it just means that previously queued work was started.
- */
-void io_wq_flush(struct io_wq *wq)
+enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
{
- struct io_wq_flush_data data;
+ struct work_match match = {
+ .fn = io_wq_pid_match,
+ .data = (void *) (unsigned long) pid
+ };
+ enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
- init_completion(&data.done);
- INIT_IO_WORK(&data.work, io_wq_flush_func);
- data.work.flags |= IO_WQ_WORK_INTERNAL;
- io_wqe_enqueue(wqe, &data.work);
- wait_for_completion(&data.done);
+ ret = io_wqe_cancel_work(wqe, &match);
+ if (ret != IO_WQ_CANCEL_NOTFOUND)
+ break;
}
+
+ return ret;
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
@@ -1067,12 +1080,15 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
for_each_node(node) {
struct io_wqe *wqe;
+ int alloc_node = node;
- wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
+ if (!node_online(alloc_node))
+ alloc_node = NUMA_NO_NODE;
+ wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
if (!wqe)
goto err;
wq->wqes[node] = wqe;
- wqe->node = node;
+ wqe->node = alloc_node;
wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
if (wq->user) {
@@ -1080,7 +1096,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
task_rlimit(current, RLIMIT_NPROC);
}
atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
- wqe->node = node;
wqe->wq = wq;
spin_lock_init(&wqe->lock);
INIT_WQ_LIST(&wqe->work_list);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 50b3378febf2..e5e15f2c93ec 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -8,7 +8,6 @@ enum {
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_UNBOUND = 32,
- IO_WQ_WORK_INTERNAL = 64,
IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512,
@@ -74,18 +73,15 @@ struct io_wq_work {
struct files_struct *files;
struct mm_struct *mm;
const struct cred *creds;
+ struct fs_struct *fs;
unsigned flags;
+ pid_t task_pid;
};
-#define INIT_IO_WORK(work, _func) \
- do { \
- (work)->list.next = NULL; \
- (work)->func = _func; \
- (work)->flags = 0; \
- (work)->files = NULL; \
- (work)->mm = NULL; \
- (work)->creds = NULL; \
- } while (0) \
+#define INIT_IO_WORK(work, _func) \
+ do { \
+ *(work) = (struct io_wq_work){ .func = _func }; \
+ } while (0) \
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
@@ -103,10 +99,10 @@ void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
-void io_wq_flush(struct io_wq *wq);
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
+enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid);
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77f22c3da30f..c06082bb039a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -75,6 +75,7 @@
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
+#include <linux/fs_struct.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -182,19 +183,15 @@ struct fixed_file_table {
struct file **files;
};
-enum {
- FFD_F_ATOMIC,
-};
-
struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
struct percpu_ref refs;
struct llist_head put_llist;
- unsigned long state;
struct work_struct ref_work;
struct completion done;
+ struct rcu_head rcu;
};
struct io_ring_ctx {
@@ -204,11 +201,11 @@ struct io_ring_ctx {
struct {
unsigned int flags;
- int compat: 1;
- int account_mem: 1;
- int cq_overflow_flushed: 1;
- int drain_next: 1;
- int eventfd_async: 1;
+ unsigned int compat: 1;
+ unsigned int account_mem: 1;
+ unsigned int cq_overflow_flushed: 1;
+ unsigned int drain_next: 1;
+ unsigned int eventfd_async: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -441,6 +438,7 @@ struct io_async_msghdr {
struct iovec *iov;
struct sockaddr __user *uaddr;
struct msghdr msg;
+ struct sockaddr_storage addr;
};
struct io_async_rw {
@@ -450,17 +448,12 @@ struct io_async_rw {
ssize_t size;
};
-struct io_async_open {
- struct filename *filename;
-};
-
struct io_async_ctx {
union {
struct io_async_rw rw;
struct io_async_msghdr msg;
struct io_async_connect connect;
struct io_timeout_data timeout;
- struct io_async_open open;
};
};
@@ -483,6 +476,8 @@ enum {
REQ_F_MUST_PUNT_BIT,
REQ_F_TIMEOUT_NOSEQ_BIT,
REQ_F_COMP_LOCKED_BIT,
+ REQ_F_NEED_CLEANUP_BIT,
+ REQ_F_OVERFLOW_BIT,
};
enum {
@@ -521,6 +516,10 @@ enum {
REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
/* completion under lock */
REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
+ /* needs cleanup */
+ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
+ /* in overflow list */
+ REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT),
};
/*
@@ -553,7 +552,6 @@ struct io_kiocb {
* llist_node is only used for poll deferred completions
*/
struct llist_node llist_node;
- bool has_user;
bool in_async;
bool needs_fixed_file;
u8 opcode;
@@ -614,6 +612,8 @@ struct io_op_def {
unsigned not_supported : 1;
/* needs file table */
unsigned file_table : 1;
+ /* needs ->fs */
+ unsigned needs_fs : 1;
};
static const struct io_op_def io_op_defs[] = {
@@ -656,12 +656,14 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .needs_fs = 1,
},
[IORING_OP_RECVMSG] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .needs_fs = 1,
},
[IORING_OP_TIMEOUT] = {
.async_ctx = 1,
@@ -692,6 +694,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.fd_non_neg = 1,
.file_table = 1,
+ .needs_fs = 1,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
@@ -705,6 +708,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.fd_non_neg = 1,
+ .needs_fs = 1,
},
[IORING_OP_READ] = {
.needs_mm = 1,
@@ -736,6 +740,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.fd_non_neg = 1,
.file_table = 1,
+ .needs_fs = 1,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
@@ -754,6 +759,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
static void io_ring_file_ref_flush(struct fixed_file_data *data);
+static void io_cleanup_req(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
@@ -909,6 +915,18 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
}
if (!req->work.creds)
req->work.creds = get_current_cred();
+ if (!req->work.fs && def->needs_fs) {
+ spin_lock(&current->fs->lock);
+ if (!current->fs->in_exec) {
+ req->work.fs = current->fs;
+ req->work.fs->users++;
+ } else {
+ req->work.flags |= IO_WQ_WORK_CANCEL;
+ }
+ spin_unlock(&current->fs->lock);
+ }
+ if (!req->work.task_pid)
+ req->work.task_pid = task_pid_vnr(current);
}
static inline void io_req_work_drop_env(struct io_kiocb *req)
@@ -921,6 +939,16 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
put_cred(req->work.creds);
req->work.creds = NULL;
}
+ if (req->work.fs) {
+ struct fs_struct *fs = req->work.fs;
+
+ spin_lock(&req->work.fs->lock);
+ if (--fs->users)
+ fs = NULL;
+ spin_unlock(&req->work.fs->lock);
+ if (fs)
+ free_fs_struct(fs);
+ }
}
static inline bool io_prep_async_work(struct io_kiocb *req,
@@ -972,6 +1000,7 @@ static void io_kill_timeout(struct io_kiocb *req)
if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts);
list_del_init(&req->list);
+ req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, 0);
io_put_req(req);
}
@@ -1074,6 +1103,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
list);
list_move(&req->list, &list);
+ req->flags &= ~REQ_F_OVERFLOW;
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, req->result);
@@ -1126,6 +1156,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
set_bit(0, &ctx->sq_check_overflow);
set_bit(0, &ctx->cq_check_overflow);
}
+ req->flags |= REQ_F_OVERFLOW;
refcount_inc(&req->refs);
req->result = res;
list_add_tail(&req->list, &ctx->cq_overflow_list);
@@ -1226,6 +1257,9 @@ static void __io_req_aux_free(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ io_cleanup_req(req);
+
kfree(req->io);
if (req->file) {
if (req->flags & REQ_F_FIXED_FILE)
@@ -1446,10 +1480,10 @@ static void io_free_req(struct io_kiocb *req)
__attribute__((nonnull))
static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
- io_req_find_next(req, nxtptr);
-
- if (refcount_dec_and_test(&req->refs))
+ if (refcount_dec_and_test(&req->refs)) {
+ io_req_find_next(req, nxtptr);
__io_free_req(req);
+ }
}
static void io_put_req(struct io_kiocb *req)
@@ -1635,11 +1669,17 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
-static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
- long min)
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+ long min)
{
int iters = 0, ret = 0;
+ /*
+ * We disallow the app entering submit/complete with polling, but we
+ * still need to lock the ring to prevent racing with polled issue
+ * that got punted to a workqueue.
+ */
+ mutex_lock(&ctx->uring_lock);
do {
int tmin = 0;
@@ -1675,21 +1715,6 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
ret = 0;
} while (min && !*nr_events && !need_resched());
- return ret;
-}
-
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
- long min)
-{
- int ret;
-
- /*
- * We disallow the app entering submit/complete with polling, but we
- * still need to lock the ring to prevent racing with polled issue
- * that got punted to a workqueue.
- */
- mutex_lock(&ctx->uring_lock);
- ret = __io_iopoll_check(ctx, nr_events, min);
mutex_unlock(&ctx->uring_lock);
return ret;
}
@@ -1793,6 +1818,10 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
list_add(&req->list, &ctx->poll_list);
else
list_add_tail(&req->list, &ctx->poll_list);
+
+ if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+ wq_has_sleeper(&ctx->sqo_wait))
+ wake_up(&ctx->sqo_wait);
}
static void io_file_put(struct io_submit_state *state)
@@ -2043,7 +2072,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
ssize_t ret;
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
- return ret;
+ return ret < 0 ? ret : sqe_len;
}
if (req->io) {
@@ -2056,9 +2085,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return iorw->size;
}
- if (!req->has_user)
- return -EFAULT;
-
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
@@ -2137,6 +2163,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
req->io->rw.iov = req->io->rw.fast_iov;
memcpy(req->io->rw.iov, fast_iov,
sizeof(struct iovec) * iter->nr_segs);
+ } else {
+ req->flags |= REQ_F_NEED_CLEANUP;
}
}
@@ -2148,17 +2176,6 @@ static int io_alloc_async_ctx(struct io_kiocb *req)
return req->io == NULL;
}
-static void io_rw_async(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct iovec *iov = NULL;
-
- if (req->io->rw.iov != req->io->rw.fast_iov)
- iov = req->io->rw.iov;
- io_wq_submit_work(workptr);
- kfree(iov);
-}
-
static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
struct iovec *iovec, struct iovec *fast_iov,
struct iov_iter *iter)
@@ -2171,7 +2188,6 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
io_req_map_rw(req, io_size, iovec, fast_iov, iter);
}
- req->work.func = io_rw_async;
return 0;
}
@@ -2189,7 +2205,8 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(!(req->file->f_mode & FMODE_READ)))
return -EBADF;
- if (!req->io)
+ /* either don't need iovec imported or already have it */
+ if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
return 0;
io = req->io;
@@ -2258,8 +2275,8 @@ copy_iov:
}
}
out_free:
- if (!io_wq_current_is_worker())
- kfree(iovec);
+ kfree(iovec);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
return ret;
}
@@ -2277,7 +2294,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
- if (!req->io)
+ /* either don't need iovec imported or already have it */
+ if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
return 0;
io = req->io;
@@ -2352,6 +2370,12 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
ret2 = call_write_iter(req->file, kiocb, &iter);
else
ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+ /*
+ * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just
+ * retry them without IOCB_NOWAIT.
+ */
+ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
+ ret2 = -EAGAIN;
if (!force_nonblock || ret2 != -EAGAIN) {
kiocb_done(kiocb, ret2, nxt, req->in_async);
} else {
@@ -2364,8 +2388,8 @@ copy_iov:
}
}
out_free:
- if (!io_wq_current_is_worker())
- kfree(iovec);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ kfree(iovec);
return ret;
}
@@ -2485,6 +2509,9 @@ static void io_fallocate_finish(struct io_wq_work **workptr)
struct io_kiocb *nxt = NULL;
int ret;
+ if (io_req_cancelled(req))
+ return;
+
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
if (ret < 0)
@@ -2534,6 +2561,10 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
+ if (sqe->flags & IOSQE_FIXED_FILE)
+ return -EBADF;
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
req->open.dfd = READ_ONCE(sqe->fd);
req->open.how.mode = READ_ONCE(sqe->len);
@@ -2547,6 +2578,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return ret;
}
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -2559,6 +2591,10 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
+ if (sqe->flags & IOSQE_FIXED_FILE)
+ return -EBADF;
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
req->open.dfd = READ_ONCE(sqe->fd);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -2583,6 +2619,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return ret;
}
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -2614,6 +2651,7 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
}
err:
putname(req->open.filename);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
@@ -2754,6 +2792,10 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
+ if (sqe->flags & IOSQE_FIXED_FILE)
+ return -EBADF;
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
req->open.dfd = READ_ONCE(sqe->fd);
req->open.mask = READ_ONCE(sqe->len);
@@ -2771,6 +2813,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return ret;
}
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -2808,6 +2851,7 @@ retry:
ret = cp_statx(&stat, ctx->buffer);
err:
putname(ctx->filename);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
@@ -2827,7 +2871,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sqe->rw_flags || sqe->buf_index)
return -EINVAL;
if (sqe->flags & IOSQE_FIXED_FILE)
- return -EINVAL;
+ return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
if (req->file->f_op == &io_uring_fops ||
@@ -2837,24 +2881,26 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
+/* only called when __close_fd_get_file() is done */
+static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt)
+{
+ int ret;
+
+ ret = filp_close(req->close.put_file, req->work.files);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ fput(req->close.put_file);
+ io_put_req_find_next(req, nxt);
+}
+
static void io_close_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
- /* Invoked with files, we need to do the close */
- if (req->work.files) {
- int ret;
-
- ret = filp_close(req->close.put_file, req->work.files);
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- }
-
- fput(req->close.put_file);
-
- io_put_req_find_next(req, &nxt);
+ /* not cancellable, don't do io_req_cancelled() */
+ __io_close_finish(req, &nxt);
if (nxt)
io_wq_assign_next(workptr, nxt);
}
@@ -2877,22 +2923,8 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
* No ->flush(), safely close from here and just punt the
* fput() to async context.
*/
- ret = filp_close(req->close.put_file, current->files);
-
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
-
- if (io_wq_current_is_worker()) {
- struct io_wq_work *old_work, *work;
-
- old_work = work = &req->work;
- io_close_finish(&work);
- if (work && work != old_work)
- *nxt = container_of(work, struct io_kiocb, work);
- return 0;
- }
-
+ __io_close_finish(req, nxt);
+ return 0;
eagain:
req->work.func = io_close_finish;
/*
@@ -2960,35 +2992,34 @@ static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
return 0;
}
-#if defined(CONFIG_NET)
-static void io_sendrecv_async(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct iovec *iov = NULL;
-
- if (req->io->rw.iov != req->io->rw.fast_iov)
- iov = req->io->msg.iov;
- io_wq_submit_work(workptr);
- kfree(iov);
-}
-#endif
-
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
+ int ret;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
if (!io || req->opcode == IORING_OP_SEND)
return 0;
+ /* iovec is already imported */
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
io->msg.iov = io->msg.fast_iov;
- return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
+ ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
&io->msg.iov);
+ if (!ret)
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return ret;
#else
return -EOPNOTSUPP;
#endif
@@ -3008,12 +3039,11 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_async_ctx io;
- struct sockaddr_storage addr;
unsigned flags;
if (req->io) {
kmsg = &req->io->msg;
- kmsg->msg.msg_name = &addr;
+ kmsg->msg.msg_name = &req->io->msg.addr;
/* if iov is set, it's allocated already */
if (!kmsg->iov)
kmsg->iov = kmsg->fast_iov;
@@ -3022,7 +3052,7 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
struct io_sr_msg *sr = &req->sr_msg;
kmsg = &io.msg;
- kmsg->msg.msg_name = &addr;
+ kmsg->msg.msg_name = &io.msg.addr;
io.msg.iov = io.msg.fast_iov;
ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
@@ -3041,18 +3071,22 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
if (force_nonblock && ret == -EAGAIN) {
if (req->io)
return -EAGAIN;
- if (io_alloc_async_ctx(req))
+ if (io_alloc_async_ctx(req)) {
+ if (kmsg->iov != kmsg->fast_iov)
+ kfree(kmsg->iov);
return -ENOMEM;
+ }
+ req->flags |= REQ_F_NEED_CLEANUP;
memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
- req->work.func = io_sendrecv_async;
return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
- if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
+ if (kmsg && kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
@@ -3120,17 +3154,29 @@ static int io_recvmsg_prep(struct io_kiocb *req,
#if defined(CONFIG_NET)
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
+ int ret;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
if (!io || req->opcode == IORING_OP_RECV)
return 0;
+ /* iovec is already imported */
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
io->msg.iov = io->msg.fast_iov;
- return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
+ ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
&io->msg.uaddr, &io->msg.iov);
+ if (!ret)
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return ret;
#else
return -EOPNOTSUPP;
#endif
@@ -3150,12 +3196,11 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_async_ctx io;
- struct sockaddr_storage addr;
unsigned flags;
if (req->io) {
kmsg = &req->io->msg;
- kmsg->msg.msg_name = &addr;
+ kmsg->msg.msg_name = &req->io->msg.addr;
/* if iov is set, it's allocated already */
if (!kmsg->iov)
kmsg->iov = kmsg->fast_iov;
@@ -3164,7 +3209,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
struct io_sr_msg *sr = &req->sr_msg;
kmsg = &io.msg;
- kmsg->msg.msg_name = &addr;
+ kmsg->msg.msg_name = &io.msg.addr;
io.msg.iov = io.msg.fast_iov;
ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
@@ -3185,18 +3230,22 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
if (force_nonblock && ret == -EAGAIN) {
if (req->io)
return -EAGAIN;
- if (io_alloc_async_ctx(req))
+ if (io_alloc_async_ctx(req)) {
+ if (kmsg->iov != kmsg->fast_iov)
+ kfree(kmsg->iov);
return -ENOMEM;
+ }
memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
- req->work.func = io_sendrecv_async;
+ req->flags |= REQ_F_NEED_CLEANUP;
return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
- if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
+ if (kmsg && kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
@@ -4207,6 +4256,35 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EIOCBQUEUED;
}
+static void io_cleanup_req(struct io_kiocb *req)
+{
+ struct io_async_ctx *io = req->io;
+
+ switch (req->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ case IORING_OP_WRITE:
+ if (io->rw.iov != io->rw.fast_iov)
+ kfree(io->rw.iov);
+ break;
+ case IORING_OP_SENDMSG:
+ case IORING_OP_RECVMSG:
+ if (io->msg.iov != io->msg.fast_iov)
+ kfree(io->msg.iov);
+ break;
+ case IORING_OP_OPENAT:
+ case IORING_OP_OPENAT2:
+ case IORING_OP_STATX:
+ putname(req->open.filename);
+ break;
+ }
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+}
+
static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_kiocb **nxt, bool force_nonblock)
{
@@ -4446,7 +4524,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
}
if (!ret) {
- req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
req->in_async = true;
do {
ret = io_issue_sqe(req, NULL, &nxt, false);
@@ -4479,7 +4556,7 @@ static int io_req_needs_file(struct io_kiocb *req, int fd)
{
if (!io_op_defs[req->opcode].needs_file)
return 0;
- if (fd == -1 && io_op_defs[req->opcode].fd_non_neg)
+ if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg)
return 0;
return 1;
}
@@ -4639,11 +4716,21 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_kiocb *linked_timeout;
struct io_kiocb *nxt = NULL;
+ const struct cred *old_creds = NULL;
int ret;
again:
linked_timeout = io_prep_linked_timeout(req);
+ if (req->work.creds && req->work.creds != current_cred()) {
+ if (old_creds)
+ revert_creds(old_creds);
+ if (old_creds == req->work.creds)
+ old_creds = NULL; /* restored original creds */
+ else
+ old_creds = override_creds(req->work.creds);
+ }
+
ret = io_issue_sqe(req, sqe, &nxt, true);
/*
@@ -4669,7 +4756,7 @@ punt:
err:
/* drop submission reference */
- io_put_req(req);
+ io_put_req_find_next(req, &nxt);
if (linked_timeout) {
if (!ret)
@@ -4693,6 +4780,8 @@ done_req:
goto punt;
goto again;
}
+ if (old_creds)
+ revert_creds(old_creds);
}
static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4737,7 +4826,6 @@ static inline void io_queue_link_head(struct io_kiocb *req)
static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
{
- const struct cred *old_creds = NULL;
struct io_ring_ctx *ctx = req->ctx;
unsigned int sqe_flags;
int ret, id;
@@ -4752,14 +4840,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
id = READ_ONCE(sqe->personality);
if (id) {
- const struct cred *personality_creds;
-
- personality_creds = idr_find(&ctx->personality_idr, id);
- if (unlikely(!personality_creds)) {
+ req->work.creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!req->work.creds)) {
ret = -EINVAL;
goto err_req;
}
- old_creds = override_creds(personality_creds);
+ get_cred(req->work.creds);
}
/* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -4771,8 +4857,6 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
err_req:
io_cqring_add_event(req, ret);
io_double_put_req(req);
- if (old_creds)
- revert_creds(old_creds);
return false;
}
@@ -4833,8 +4917,6 @@ err_req:
}
}
- if (old_creds)
- revert_creds(old_creds);
return true;
}
@@ -4950,6 +5032,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
+ int err;
req = io_get_req(ctx, statep);
if (unlikely(!req)) {
@@ -4966,20 +5049,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
submitted++;
if (unlikely(req->opcode >= IORING_OP_LAST)) {
- io_cqring_add_event(req, -EINVAL);
+ err = -EINVAL;
+fail_req:
+ io_cqring_add_event(req, err);
io_double_put_req(req);
break;
}
if (io_op_defs[req->opcode].needs_mm && !*mm) {
mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
- if (!mm_fault) {
- use_mm(ctx->sqo_mm);
- *mm = ctx->sqo_mm;
+ if (unlikely(mm_fault)) {
+ err = -EFAULT;
+ goto fail_req;
}
+ use_mm(ctx->sqo_mm);
+ *mm = ctx->sqo_mm;
}
- req->has_user = *mm != NULL;
req->in_async = async;
req->needs_fixed_file = async;
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
@@ -5011,9 +5097,8 @@ static int io_sq_thread(void *data)
const struct cred *old_cred;
mm_segment_t old_fs;
DEFINE_WAIT(wait);
- unsigned inflight;
unsigned long timeout;
- int ret;
+ int ret = 0;
complete(&ctx->completions[1]);
@@ -5021,39 +5106,19 @@ static int io_sq_thread(void *data)
set_fs(USER_DS);
old_cred = override_creds(ctx->creds);
- ret = timeout = inflight = 0;
+ timeout = jiffies + ctx->sq_thread_idle;
while (!kthread_should_park()) {
unsigned int to_submit;
- if (inflight) {
+ if (!list_empty(&ctx->poll_list)) {
unsigned nr_events = 0;
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- /*
- * inflight is the count of the maximum possible
- * entries we submitted, but it can be smaller
- * if we dropped some of them. If we don't have
- * poll entries available, then we know that we
- * have nothing left to poll for. Reset the
- * inflight count to zero in that case.
- */
- mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->poll_list))
- __io_iopoll_check(ctx, &nr_events, 0);
- else
- inflight = 0;
- mutex_unlock(&ctx->uring_lock);
- } else {
- /*
- * Normal IO, just pretend everything completed.
- * We don't have to poll completions for that.
- */
- nr_events = inflight;
- }
-
- inflight -= nr_events;
- if (!inflight)
+ mutex_lock(&ctx->uring_lock);
+ if (!list_empty(&ctx->poll_list))
+ io_iopoll_getevents(ctx, &nr_events, 0);
+ else
timeout = jiffies + ctx->sq_thread_idle;
+ mutex_unlock(&ctx->uring_lock);
}
to_submit = io_sqring_entries(ctx);
@@ -5064,34 +5129,47 @@ static int io_sq_thread(void *data)
*/
if (!to_submit || ret == -EBUSY) {
/*
+ * Drop cur_mm before scheduling, we can't hold it for
+ * long periods (or over schedule()). Do this before
+ * adding ourselves to the waitqueue, as the unuse/drop
+ * may sleep.
+ */
+ if (cur_mm) {
+ unuse_mm(cur_mm);
+ mmput(cur_mm);
+ cur_mm = NULL;
+ }
+
+ /*
* We're polling. If we're within the defined idle
* period, then let us spin without work before going
* to sleep. The exception is if we got EBUSY doing
* more IO, we should wait for the application to
* reap events and wake us up.
*/
- if (inflight ||
+ if (!list_empty(&ctx->poll_list) ||
(!time_after(jiffies, timeout) && ret != -EBUSY &&
!percpu_ref_is_dying(&ctx->refs))) {
cond_resched();
continue;
}
+ prepare_to_wait(&ctx->sqo_wait, &wait,
+ TASK_INTERRUPTIBLE);
+
/*
- * Drop cur_mm before scheduling, we can't hold it for
- * long periods (or over schedule()). Do this before
- * adding ourselves to the waitqueue, as the unuse/drop
- * may sleep.
+ * While doing polled IO, before going to sleep, we need
+ * to check whether new reqs were added to poll_list; reqs
+ * may have been punted to the io worker and will be added
+ * to poll_list later, hence check the poll_list again.
*/
- if (cur_mm) {
- unuse_mm(cur_mm);
- mmput(cur_mm);
- cur_mm = NULL;
+ if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+ !list_empty_careful(&ctx->poll_list)) {
+ finish_wait(&ctx->sqo_wait, &wait);
+ continue;
}
- prepare_to_wait(&ctx->sqo_wait, &wait,
- TASK_INTERRUPTIBLE);
-
/* Tell userspace we may need a wakeup call */
ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
/* make sure to read SQ tail after writing flags */
@@ -5119,8 +5197,7 @@ static int io_sq_thread(void *data)
mutex_lock(&ctx->uring_lock);
ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
mutex_unlock(&ctx->uring_lock);
- if (ret > 0)
- inflight += ret;
+ timeout = jiffies + ctx->sq_thread_idle;
}
set_fs(old_fs);
@@ -5254,6 +5331,26 @@ static void io_file_ref_kill(struct percpu_ref *ref)
complete(&data->done);
}
+static void __io_file_ref_exit_and_free(struct rcu_head *rcu)
+{
+ struct fixed_file_data *data = container_of(rcu, struct fixed_file_data,
+ rcu);
+ percpu_ref_exit(&data->refs);
+ kfree(data);
+}
+
+static void io_file_ref_exit_and_free(struct rcu_head *rcu)
+{
+ /*
+ * We need to order our exit+free call against the potentially
+ * existing call_rcu() for switching to atomic. One way to do that
+ * is to have this rcu callback queue the final put and free, as we
+ * could otherwise have a pre-existing atomic switch complete _after_
+ * the free callback we queued.
+ */
+ call_rcu(rcu, __io_file_ref_exit_and_free);
+}
+
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
struct fixed_file_data *data = ctx->file_data;
@@ -5266,14 +5363,13 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
flush_work(&data->ref_work);
wait_for_completion(&data->done);
io_ring_file_ref_flush(data);
- percpu_ref_exit(&data->refs);
__io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++)
kfree(data->table[i].files);
kfree(data->table);
- kfree(data);
+ call_rcu(&data->rcu, io_file_ref_exit_and_free);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
return 0;
@@ -5525,7 +5621,6 @@ static void io_ring_file_ref_switch(struct work_struct *work)
data = container_of(work, struct fixed_file_data, ref_work);
io_ring_file_ref_flush(data);
- percpu_ref_get(&data->refs);
percpu_ref_switch_to_percpu(&data->refs);
}
@@ -5701,8 +5796,13 @@ static void io_atomic_switch(struct percpu_ref *ref)
{
struct fixed_file_data *data;
+ /*
+ * Juggle reference to ensure we hit zero, if needed, so we can
+ * switch back to percpu mode
+ */
data = container_of(ref, struct fixed_file_data, refs);
- clear_bit(FFD_F_ATOMIC, &data->state);
+ percpu_ref_put(&data->refs);
+ percpu_ref_get(&data->refs);
}
static bool io_queue_file_removal(struct fixed_file_data *data,
@@ -5725,11 +5825,7 @@ static bool io_queue_file_removal(struct fixed_file_data *data,
llist_add(&pfile->llist, &data->put_llist);
if (pfile == &pfile_stack) {
- if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
- percpu_ref_put(&data->refs);
- percpu_ref_switch_to_atomic(&data->refs,
- io_atomic_switch);
- }
+ percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
wait_for_completion(&done);
flush_work(&data->ref_work);
return false;
@@ -5803,10 +5899,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
up->offset++;
}
- if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
- percpu_ref_put(&data->refs);
+ if (ref_switch)
percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
- }
return done ? done : err;
}
@@ -6264,6 +6358,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_sqe_buffer_unregister(ctx);
io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
+ idr_destroy(&ctx->personality_idr);
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
@@ -6301,7 +6396,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
ctx->rings->sq_ring_entries)
mask |= EPOLLOUT | EPOLLWRNORM;
- if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
+ if (io_cqring_events(ctx, false))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
@@ -6393,6 +6488,29 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
if (!cancel_req)
break;
+ if (cancel_req->flags & REQ_F_OVERFLOW) {
+ spin_lock_irq(&ctx->completion_lock);
+ list_del(&cancel_req->list);
+ cancel_req->flags &= ~REQ_F_OVERFLOW;
+ if (list_empty(&ctx->cq_overflow_list)) {
+ clear_bit(0, &ctx->sq_check_overflow);
+ clear_bit(0, &ctx->cq_check_overflow);
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+
+ WRITE_ONCE(ctx->rings->cq_overflow,
+ atomic_inc_return(&ctx->cached_cq_overflow));
+
+ /*
+ * Put inflight ref and overflow ref. If that's
+ * all we had, then we're done with this request.
+ */
+ if (refcount_sub_and_test(2, &cancel_req->refs)) {
+ io_put_req(cancel_req);
+ continue;
+ }
+ }
+
io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
io_put_req(cancel_req);
schedule();
@@ -6405,6 +6523,13 @@ static int io_uring_flush(struct file *file, void *data)
struct io_ring_ctx *ctx = file->private_data;
io_uring_cancel_files(ctx, data);
+
+ /*
+ * If the task is going away, cancel work it may have pending
+ */
+ if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
+ io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
+
return 0;
}
@@ -6547,6 +6672,7 @@ out_fput:
return submitted ? submitted : ret;
}
+#ifdef CONFIG_PROC_FS
static int io_uring_show_cred(int id, void *p, void *data)
{
const struct cred *cred = p;
@@ -6620,6 +6746,7 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
percpu_ref_put(&ctx->refs);
}
}
+#endif
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
@@ -6631,7 +6758,9 @@ static const struct file_operations io_uring_fops = {
#endif
.poll = io_uring_poll,
.fasync = io_uring_fasync,
+#ifdef CONFIG_PROC_FS
.show_fdinfo = io_uring_show_fdinfo,
+#endif
};
static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 2494095e0340..27373f5792a4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -976,29 +976,33 @@ restart_loop:
* it. */
/*
- * A buffer which has been freed while still being journaled by
- * a previous transaction.
- */
- if (buffer_freed(bh)) {
+ * A buffer which has been freed while still being journaled
+ * by a previous transaction, refile the buffer to BJ_Forget of
+ * the running transaction. If the just committed transaction
+ * contains "add to orphan" operation, we can completely
+ * invalidate the buffer now. We are rather thorough in that
+ * since the buffer may still be accessible when blocksize <
+ * pagesize and it is attached to the last partial page.
+ */
+ if (buffer_freed(bh) && !jh->b_next_transaction) {
+ struct address_space *mapping;
+
+ clear_buffer_freed(bh);
+ clear_buffer_jbddirty(bh);
+
/*
- * If the running transaction is the one containing
- * "add to orphan" operation (b_next_transaction !=
- * NULL), we have to wait for that transaction to
- * commit before we can really get rid of the buffer.
- * So just clear b_modified to not confuse transaction
- * credit accounting and refile the buffer to
- * BJ_Forget of the running transaction. If the just
- * committed transaction contains "add to orphan"
- * operation, we can completely invalidate the buffer
- * now. We are rather through in that since the
- * buffer may be still accessible when blocksize <
- * pagesize and it is attached to the last partial
- * page.
+ * Block device buffers need to stay mapped all the
+ * time, so it is enough to clear buffer_jbddirty and
+ * buffer_freed bits. For the file mapping buffers (i.e.
+ * journalled data) we need to unmap buffer and clear
+ * more bits. We also need to be careful about the check
+ * because the data page mapping can get cleared under
+ * our hands, in which case we also need not clear more bits
+ * because the page and buffers will be freed and can
+ * never be reused once we are done with them.
*/
- jh->b_modified = 0;
- if (!jh->b_next_transaction) {
- clear_buffer_freed(bh);
- clear_buffer_jbddirty(bh);
+ mapping = READ_ONCE(bh->b_page->mapping);
+ if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
clear_buffer_mapped(bh);
clear_buffer_new(bh);
clear_buffer_req(bh);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e77a5a0b4e46..3dccc23cf010 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -936,8 +936,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
char *frozen_buffer = NULL;
unsigned long start_lock, time_lock;
- if (is_handle_aborted(handle))
- return -EROFS;
journal = transaction->t_journal;
jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -1152,8 +1150,8 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
/* For undo access buffer must have data copied */
if (undo && !jh->b_committed_data)
goto out;
- if (jh->b_transaction != handle->h_transaction &&
- jh->b_next_transaction != handle->h_transaction)
+ if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
+ READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
goto out;
/*
* There are two reasons for the barrier here:
@@ -1189,6 +1187,9 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int rc;
+ if (is_handle_aborted(handle))
+ return -EROFS;
+
if (jbd2_write_access_granted(handle, bh, false))
return 0;
@@ -1326,6 +1327,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
char *committed_data = NULL;
+ if (is_handle_aborted(handle))
+ return -EROFS;
+
if (jbd2_write_access_granted(handle, bh, true))
return 0;
@@ -2329,14 +2333,16 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
return -EBUSY;
}
/*
- * OK, buffer won't be reachable after truncate. We just set
- * j_next_transaction to the running transaction (if there is
- * one) and mark buffer as freed so that commit code knows it
- * should clear dirty bits when it is done with the buffer.
+ * OK, buffer won't be reachable after truncate. We just clear
+ * b_modified to not confuse transaction credit accounting, and
+ * set j_next_transaction to the running transaction (if there
+ * is one) and mark buffer as freed so that commit code knows
+ * it should clear dirty bits when it is done with the buffer.
*/
set_buffer_freed(bh);
if (journal->j_running_transaction && buffer_jbddirty(bh))
jh->b_next_transaction = journal->j_running_transaction;
+ jh->b_modified = 0;
spin_unlock(&journal->j_list_lock);
spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
@@ -2563,8 +2569,8 @@ bool __jbd2_journal_refile_buffer(struct journal_head *jh)
* our jh reference and thus __jbd2_journal_file_buffer() must not
* take a new one.
*/
- jh->b_transaction = jh->b_next_transaction;
- jh->b_next_transaction = NULL;
+ WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
+ WRITE_ONCE(jh->b_next_transaction, NULL);
if (buffer_freed(bh))
jlist = BJ_Forget;
else if (jh->b_modified)
diff --git a/fs/locks.c b/fs/locks.c
index 44b6da032842..426b55d333d5 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -753,20 +753,6 @@ int locks_delete_block(struct file_lock *waiter)
{
int status = -ENOENT;
- /*
- * If fl_blocker is NULL, it won't be set again as this thread
- * "owns" the lock and is the only one that might try to claim
- * the lock. So it is safe to test fl_blocker locklessly.
- * Also if fl_blocker is NULL, this waiter is not listed on
- * fl_blocked_requests for some lock, so no other request can
- * be added to the list of fl_blocked_requests for this
- * request. So if fl_blocker is NULL, it is safe to
- * locklessly check if fl_blocked_requests is empty. If both
- * of these checks succeed, there is no need to take the lock.
- */
- if (waiter->fl_blocker == NULL &&
- list_empty(&waiter->fl_blocked_requests))
- return status;
spin_lock(&blocked_lock_lock);
if (waiter->fl_blocker)
status = 0;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 4a841071d8a7..1865322de142 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -42,13 +42,27 @@ static void nfs_mark_delegation_revoked(struct nfs_delegation *delegation)
if (!test_and_set_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
delegation->stateid.type = NFS4_INVALID_STATEID_TYPE;
atomic_long_dec(&nfs_active_delegations);
+ if (!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+ nfs_clear_verifier_delegated(delegation->inode);
}
}
+static struct nfs_delegation *nfs_get_delegation(struct nfs_delegation *delegation)
+{
+ refcount_inc(&delegation->refcount);
+ return delegation;
+}
+
+static void nfs_put_delegation(struct nfs_delegation *delegation)
+{
+ if (refcount_dec_and_test(&delegation->refcount))
+ __nfs_free_delegation(delegation);
+}
+
static void nfs_free_delegation(struct nfs_delegation *delegation)
{
nfs_mark_delegation_revoked(delegation);
- __nfs_free_delegation(delegation);
+ nfs_put_delegation(delegation);
}
/**
@@ -241,13 +255,18 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
{
+ const struct cred *cred;
int res = 0;
- if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
- res = nfs4_proc_delegreturn(inode,
- delegation->cred,
+ if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ spin_lock(&delegation->lock);
+ cred = get_cred(delegation->cred);
+ spin_unlock(&delegation->lock);
+ res = nfs4_proc_delegreturn(inode, cred,
&delegation->stateid,
issync);
+ put_cred(cred);
+ }
return res;
}
@@ -273,9 +292,13 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
if (delegation == NULL)
goto out;
spin_lock(&delegation->lock);
- if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
- ret = delegation;
+ if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ /* Refcount matched in nfs_end_delegation_return() */
+ ret = nfs_get_delegation(delegation);
+ }
spin_unlock(&delegation->lock);
+ if (ret)
+ nfs_clear_verifier_delegated(&nfsi->vfs_inode);
out:
return ret;
}
@@ -393,6 +416,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
if (delegation == NULL)
return -ENOMEM;
nfs4_stateid_copy(&delegation->stateid, stateid);
+ refcount_set(&delegation->refcount, 1);
delegation->type = type;
delegation->pagemod_limit = pagemod_limit;
delegation->change_attr = inode_peek_iversion_raw(inode);
@@ -492,6 +516,8 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
err = nfs_do_return_delegation(inode, delegation, issync);
out:
+ /* Refcount matched in nfs_start_delegation_return_locked() */
+ nfs_put_delegation(delegation);
return err;
}
@@ -686,9 +712,12 @@ void nfs4_inode_return_delegation_on_close(struct inode *inode)
list_empty(&NFS_I(inode)->open_files) &&
!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
- ret = delegation;
+ /* Refcount matched in nfs_end_delegation_return() */
+ ret = nfs_get_delegation(delegation);
}
spin_unlock(&delegation->lock);
+ if (ret)
+ nfs_clear_verifier_delegated(inode);
}
out:
rcu_read_unlock();
@@ -1088,10 +1117,11 @@ restart:
delegation = nfs_start_delegation_return_locked(NFS_I(inode));
rcu_read_unlock();
if (delegation != NULL) {
- delegation = nfs_detach_delegation(NFS_I(inode),
- delegation, server);
- if (delegation != NULL)
+ if (nfs_detach_delegation(NFS_I(inode), delegation,
+ server) != NULL)
nfs_free_delegation(delegation);
+ /* Match nfs_start_delegation_return_locked */
+ nfs_put_delegation(delegation);
}
iput(inode);
nfs_sb_deactive(server->super);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 31b84604d383..9b00a0b7f832 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -22,6 +22,7 @@ struct nfs_delegation {
unsigned long pagemod_limit;
__u64 change_attr;
unsigned long flags;
+ refcount_t refcount;
spinlock_t lock;
struct rcu_head rcu;
};
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1320288ff9ec..193d6fb363b7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -155,6 +155,7 @@ typedef struct {
loff_t current_index;
decode_dirent_t decode;
+ unsigned long dir_verifier;
unsigned long timestamp;
unsigned long gencount;
unsigned int cache_entry_index;
@@ -353,6 +354,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
again:
timestamp = jiffies;
gencount = nfs_inc_attr_generation_counter();
+ desc->dir_verifier = nfs_save_change_attribute(inode);
error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages,
NFS_SERVER(inode)->dtsize, desc->plus);
if (error < 0) {
@@ -455,13 +457,13 @@ void nfs_force_use_readdirplus(struct inode *dir)
}
static
-void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
+void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
+ unsigned long dir_verifier)
{
struct qstr filename = QSTR_INIT(entry->name, entry->len);
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
struct dentry *dentry;
struct dentry *alias;
- struct inode *dir = d_inode(parent);
struct inode *inode;
int status;
@@ -500,7 +502,7 @@ again:
if (nfs_same_file(dentry, entry)) {
if (!entry->fh->size)
goto out;
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ nfs_set_verifier(dentry, dir_verifier);
status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
if (!status)
nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label);
@@ -526,7 +528,7 @@ again:
dput(dentry);
dentry = alias;
}
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ nfs_set_verifier(dentry, dir_verifier);
out:
dput(dentry);
}
@@ -564,7 +566,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
count++;
if (desc->plus)
- nfs_prime_dcache(file_dentry(desc->file), entry);
+ nfs_prime_dcache(file_dentry(desc->file), entry,
+ desc->dir_verifier);
status = nfs_readdir_add_to_array(entry, page);
if (status != 0)
@@ -983,14 +986,113 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
* full lookup on all child dentries of 'dir' whenever a change occurs
* on the server that might have invalidated our dcache.
*
+ * Note that we reserve bit '0' as a tag to let us know when a dentry
+ * was revalidated while holding a delegation on its inode.
+ *
* The caller should be holding dir->i_lock
*/
void nfs_force_lookup_revalidate(struct inode *dir)
{
- NFS_I(dir)->cache_change_attribute++;
+ NFS_I(dir)->cache_change_attribute += 2;
}
EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
+/**
+ * nfs_verify_change_attribute - Detects NFS remote directory changes
+ * @dir: pointer to parent directory inode
+ * @verf: previously saved change attribute
+ *
+ * Return "false" if the verifiers doesn't match the change attribute.
+ * This would usually indicate that the directory contents have changed on
+ * the server, and that any dentries need revalidating.
+ */
+static bool nfs_verify_change_attribute(struct inode *dir, unsigned long verf)
+{
+ return (verf & ~1UL) == nfs_save_change_attribute(dir);
+}
+
+static void nfs_set_verifier_delegated(unsigned long *verf)
+{
+ *verf |= 1UL;
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void nfs_unset_verifier_delegated(unsigned long *verf)
+{
+ *verf &= ~1UL;
+}
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+
+static bool nfs_test_verifier_delegated(unsigned long verf)
+{
+ return verf & 1;
+}
+
+static bool nfs_verifier_is_delegated(struct dentry *dentry)
+{
+ return nfs_test_verifier_delegated(dentry->d_time);
+}
+
+static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (!nfs_verifier_is_delegated(dentry) &&
+ !nfs_verify_change_attribute(d_inode(dentry->d_parent), verf))
+ goto out;
+ if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ nfs_set_verifier_delegated(&verf);
+out:
+ dentry->d_time = verf;
+}
+
+/**
+ * nfs_set_verifier - save a parent directory verifier in the dentry
+ * @dentry: pointer to dentry
+ * @verf: verifier to save
+ *
+ * Saves the parent directory verifier in @dentry. If the inode has
+ * a delegation, we also tag the dentry as having been revalidated
+ * while holding a delegation so that we know we don't have to
+ * look it up again after a directory change.
+ */
+void nfs_set_verifier(struct dentry *dentry, unsigned long verf)
+{
+ spin_lock(&dentry->d_lock);
+ nfs_set_verifier_locked(dentry, verf);
+ spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_set_verifier);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+/**
+ * nfs_clear_verifier_delegated - clear the dir verifier delegation tag
+ * @inode: pointer to inode
+ *
+ * Iterates through the dentries in the inode alias list and clears
+ * the tag used to indicate that the dentry has been revalidated
+ * while holding a delegation.
+ * This function is intended for use when the delegation is being
+ * returned or revoked.
+ */
+void nfs_clear_verifier_delegated(struct inode *inode)
+{
+ struct dentry *alias;
+
+ if (!inode)
+ return;
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ spin_lock(&alias->d_lock);
+ nfs_unset_verifier_delegated(&alias->d_time);
+ spin_unlock(&alias->d_lock);
+ }
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated);
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+
/*
* A check for whether or not the parent directory has changed.
* In the case it has, we assume that the dentries are untrustworthy
@@ -1159,6 +1261,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
struct nfs_fh *fhandle;
struct nfs_fattr *fattr;
struct nfs4_label *label;
+ unsigned long dir_verifier;
int ret;
ret = -ENOMEM;
@@ -1168,6 +1271,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
if (fhandle == NULL || fattr == NULL || IS_ERR(label))
goto out;
+ dir_verifier = nfs_save_change_attribute(dir);
ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label);
if (ret < 0) {
switch (ret) {
@@ -1188,7 +1292,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
goto out;
nfs_setsecurity(inode, fattr, label);
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ nfs_set_verifier(dentry, dir_verifier);
/* set a readdirplus hint that we had a cache miss */
nfs_force_use_readdirplus(dir);
@@ -1230,7 +1334,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
goto out_bad;
}
- if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
+ if (nfs_verifier_is_delegated(dentry))
return nfs_lookup_revalidate_delegated(dir, dentry, inode);
/* Force a full look up iff the parent directory has changed */
@@ -1415,6 +1519,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
struct nfs4_label *label = NULL;
+ unsigned long dir_verifier;
int error;
dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);
@@ -1440,6 +1545,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
if (IS_ERR(label))
goto out;
+ dir_verifier = nfs_save_change_attribute(dir);
trace_nfs_lookup_enter(dir, dentry, flags);
error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label);
if (error == -ENOENT)
@@ -1463,7 +1569,7 @@ no_entry:
goto out_label;
dentry = res;
}
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ nfs_set_verifier(dentry, dir_verifier);
out_label:
trace_nfs_lookup_exit(dir, dentry, flags, error);
nfs4_label_free(label);
@@ -1668,7 +1774,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
if (inode == NULL)
goto full_reval;
- if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
+ if (nfs_verifier_is_delegated(dentry))
return nfs_lookup_revalidate_delegated(dir, dentry, inode);
/* NFS only supports OPEN on regular files */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1309e6f47f3d..11bf15800ac9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2114,6 +2114,7 @@ static void init_once(void *foo)
init_rwsem(&nfsi->rmdir_sem);
mutex_init(&nfsi->commit_mutex);
nfs4_init_once(nfsi);
+ nfsi->cache_change_attribute = 0;
}
static int __init nfs_init_inodecache(void)
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index be4eb720d5b6..1297919e0fce 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -87,7 +87,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (inode != d_inode(dentry))
goto out_drop;
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
nfs_file_set_open_context(filp, ctx);
nfs_fscache_open_file(inode, filp);
err = 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 95d07a3dc5d1..69b7ab7a5815 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2974,10 +2974,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
struct dentry *dentry;
struct nfs4_state *state;
fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx);
+ struct inode *dir = d_inode(opendata->dir);
+ unsigned long dir_verifier;
unsigned int seq;
int ret;
seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ dir_verifier = nfs_save_change_attribute(dir);
ret = _nfs4_proc_open(opendata, ctx);
if (ret != 0)
@@ -3005,8 +3008,19 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
dput(ctx->dentry);
ctx->dentry = dentry = alias;
}
- nfs_set_verifier(dentry,
- nfs_save_change_attribute(d_inode(opendata->dir)));
+ }
+
+ switch(opendata->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ if (!opendata->rpc_done)
+ break;
+ if (opendata->o_res.delegation_type != 0)
+ dir_verifier = nfs_save_change_attribute(dir);
+ nfs_set_verifier(dentry, dir_verifier);
}
/* Parse layoutget results before we check for access */
@@ -5322,7 +5336,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
hdr->timestamp = jiffies;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
- nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1, 0);
+ nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr);
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 5a34d6c22d4c..2144507447c5 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -722,9 +722,10 @@ pipe_release(struct inode *inode, struct file *file)
if (file->f_mode & FMODE_WRITE)
pipe->writers--;
- if (pipe->readers || pipe->writers) {
- wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLHUP);
- wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM | EPOLLERR | EPOLLHUP);
+ /* Was that the last reader or writer, but not the other side? */
+ if (!pipe->readers != !pipe->writers) {
+ wake_up_interruptible_all(&pipe->rd_wait);
+ wake_up_interruptible_all(&pipe->wr_wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
@@ -1026,8 +1027,8 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
static void wake_up_partner(struct pipe_inode_info *pipe)
{
- wake_up_interruptible(&pipe->rd_wait);
- wake_up_interruptible(&pipe->wr_wait);
+ wake_up_interruptible_all(&pipe->rd_wait);
+ wake_up_interruptible_all(&pipe->wr_wait);
}
static int fifo_open(struct inode *inode, struct file *filp)
@@ -1144,7 +1145,7 @@ err_rd:
err_wr:
if (!--pipe->writers)
- wake_up_interruptible(&pipe->rd_wait);
+ wake_up_interruptible_all(&pipe->rd_wait);
ret = -ERESTARTSYS;
goto err;
@@ -1271,8 +1272,9 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
pipe->max_usage = nr_slots;
pipe->tail = tail;
pipe->head = head;
- wake_up_interruptible_all(&pipe->rd_wait);
- wake_up_interruptible_all(&pipe->wr_wait);
+
+ /* This might have made more room for writers */
+ wake_up_interruptible(&pipe->wr_wait);
return pipe->max_usage * PAGE_SIZE;
out_revert_acct:
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3a688eb5c5ae..58e937be24ce 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -587,7 +587,7 @@ xfs_dax_writepages(
xfs_iflags_clear(ip, XFS_ITRUNCATED);
return dax_writeback_mapping_range(mapping,
- xfs_inode_buftarg(ip)->bt_bdev, wbc);
+ xfs_inode_buftarg(ip)->bt_daxdev, wbc);
}
STATIC sector_t
diff --git a/fs/zonefs/Kconfig b/fs/zonefs/Kconfig
index fb87ad372e29..ef2697b78820 100644
--- a/fs/zonefs/Kconfig
+++ b/fs/zonefs/Kconfig
@@ -2,6 +2,7 @@ config ZONEFS_FS
tristate "zonefs filesystem support"
depends on BLOCK
depends on BLK_DEV_ZONED
+ select FS_IOMAP
help
zonefs is a simple file system which exposes zones of a zoned block
device (e.g. host-managed or host-aware SMR disk drives) as files.
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 8bc6ef82d693..69aee3dfb660 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -601,13 +601,13 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
/*
- * For async direct IOs to sequential zone files, ignore IOCB_NOWAIT
+ * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
* as this can cause write reordering (e.g. the first aio gets EAGAIN
* on the inode lock but the second goes through but is now unaligned).
*/
- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb)
- && (iocb->ki_flags & IOCB_NOWAIT))
- iocb->ki_flags &= ~IOCB_NOWAIT;
+ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb) &&
+ (iocb->ki_flags & IOCB_NOWAIT))
+ return -EOPNOTSUPP;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode))