aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/btrfs/block-group.h1
-rw-r--r--fs/btrfs/ctree.h20
-rw-r--r--fs/btrfs/delayed-inode.c84
-rw-r--r--fs/btrfs/disk-io.c179
-rw-r--r--fs/btrfs/extent-tree.c22
-rw-r--r--fs/btrfs/extent_io.c125
-rw-r--r--fs/btrfs/file.c92
-rw-r--r--fs/btrfs/inode.c32
-rw-r--r--fs/btrfs/locking.c3
-rw-r--r--fs/btrfs/reflink.c16
-rw-r--r--fs/btrfs/send.c40
-rw-r--r--fs/btrfs/tests/btrfs-tests.c24
-rw-r--r--fs/btrfs/transaction.c112
-rw-r--r--fs/btrfs/zoned.c61
-rw-r--r--fs/btrfs/zoned.h5
-rw-r--r--fs/cachefiles/ondemand.c3
-rw-r--r--fs/ceph/addr.c11
-rw-r--r--fs/ceph/caps.c1
-rw-r--r--fs/cifs/cifs_debug.c12
-rw-r--r--fs/cifs/cifsglob.h58
-rw-r--r--fs/cifs/cifsproto.h7
-rw-r--r--fs/cifs/connect.c61
-rw-r--r--fs/cifs/misc.c9
-rw-r--r--fs/cifs/sess.c177
-rw-r--r--fs/cifs/smb2ops.c173
-rw-r--r--fs/cifs/smb2pdu.c22
-rw-r--r--fs/exec.c2
-rw-r--r--fs/exfat/namei.c4
-rw-r--r--fs/f2fs/iostat.c31
-rw-r--r--fs/f2fs/namei.c17
-rw-r--r--fs/f2fs/node.c4
-rw-r--r--fs/fscache/cookie.c26
-rw-r--r--fs/fscache/volume.c4
-rw-r--r--fs/hugetlbfs/inode.c68
-rw-r--r--fs/io_uring.c50
-rw-r--r--fs/ksmbd/smb2pdu.c48
-rw-r--r--fs/ksmbd/transport_rdma.c10
-rw-r--r--fs/ksmbd/transport_tcp.c2
-rw-r--r--fs/ksmbd/vfs.c12
-rw-r--r--fs/lockd/svcsubs.c14
-rw-r--r--fs/netfs/buffered_read.c17
-rw-r--r--fs/nfs/nfs4proc.c19
-rw-r--r--fs/nfs/nfs4state.c1
-rw-r--r--fs/nfsd/nfs4xdr.c9
-rw-r--r--fs/nfsd/nfsd.h3
-rw-r--r--fs/nfsd/vfs.c11
-rw-r--r--fs/nilfs2/nilfs.h3
-rw-r--r--fs/notify/fanotify/fanotify_user.c34
-rw-r--r--fs/overlayfs/super.c25
-rw-r--r--fs/read_write.c77
-rw-r--r--fs/remap_range.c3
-rw-r--r--fs/xfs/libxfs/xfs_attr.c38
-rw-r--r--fs/xfs/libxfs/xfs_attr.h5
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c35
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h3
-rw-r--r--fs/xfs/xfs_attr_item.c27
-rw-r--r--fs/xfs/xfs_bmap_util.c2
-rw-r--r--fs/xfs/xfs_icache.c56
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_inode.c64
-rw-r--r--fs/xfs/xfs_log.c9
-rw-r--r--fs/xfs/xfs_mount.h2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c9
-rw-r--r--fs/xfs/xfs_super.c9
-rw-r--r--fs/xfs/xfs_trace.h1
66 files changed, 1356 insertions, 751 deletions
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 42118a4f3383..d1cfb235c4b9 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -375,7 +375,7 @@ static int afs_begin_cache_operation(struct netfs_io_request *rreq)
}
static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
- struct folio *folio, void **_fsdata)
+ struct folio **foliop, void **_fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 3ac668ace50a..35e0e860cc0b 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -104,6 +104,7 @@ struct btrfs_block_group {
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
unsigned int zone_is_active:1;
+ unsigned int zoned_data_reloc_ongoing:1;
int disk_cache_state;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0e49b1a0c071..9c21e214d29e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,9 +675,8 @@ struct btrfs_fs_info {
rwlock_t global_root_lock;
struct rb_root global_root_tree;
- /* The xarray that holds all the FS roots */
- spinlock_t fs_roots_lock;
- struct xarray fs_roots;
+ spinlock_t fs_roots_radix_lock;
+ struct radix_tree_root fs_roots_radix;
/* block group cache stuff */
rwlock_t block_group_cache_lock;
@@ -995,10 +994,10 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
- /* Extent buffer xarray */
+ /* Extent buffer radix tree */
spinlock_t buffer_lock;
/* Entries are eb->start / sectorsize */
- struct xarray extent_buffers;
+ struct radix_tree_root buffer_radix;
/* next backup root to be overwritten */
int backup_root_index;
@@ -1119,8 +1118,7 @@ enum {
*/
BTRFS_ROOT_SHAREABLE,
BTRFS_ROOT_TRACK_DIRTY,
- /* The root is tracked in fs_info::fs_roots */
- BTRFS_ROOT_REGISTERED,
+ BTRFS_ROOT_IN_RADIX,
BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
BTRFS_ROOT_DEFRAG_RUNNING,
BTRFS_ROOT_FORCE_COW,
@@ -1224,10 +1222,10 @@ struct btrfs_root {
struct rb_root inode_tree;
/*
- * Xarray that keeps track of delayed nodes of every inode, protected
- * by inode_lock
+ * radix tree that keeps track of delayed nodes of every inode,
+ * protected by inode_lock
*/
- struct xarray delayed_nodes;
+ struct radix_tree_root delayed_nodes_tree;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@@ -1330,6 +1328,8 @@ struct btrfs_replace_extent_info {
* existing extent into a file range.
*/
bool is_new_extent;
+ /* Indicate if we should update the inode's mtime and ctime. */
+ bool update_times;
/* Meaningful only if is_new_extent is true. */
int qgroup_reserved;
/*
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 66779ab3ed4a..748bf6b0d860 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
}
spin_lock(&root->inode_lock);
- node = xa_load(&root->delayed_nodes, ino);
+ node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
if (node) {
if (btrfs_inode->delayed_node) {
@@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
/*
* It's possible that we're racing into the middle of removing
- * this node from the xarray. In this case, the refcount
+ * this node from the radix tree. In this case, the refcount
* was zero and it should never go back to one. Just return
- * NULL like it was never in the xarray at all; our release
+ * NULL like it was never in the radix at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
- * If this node is properly in the xarray, we want to bump the
+ * If this node is properly in the radix, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@@ -128,30 +128,36 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
u64 ino = btrfs_ino(btrfs_inode);
int ret;
- do {
- node = btrfs_get_delayed_node(btrfs_inode);
- if (node)
- return node;
+again:
+ node = btrfs_get_delayed_node(btrfs_inode);
+ if (node)
+ return node;
- node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
- if (!node)
- return ERR_PTR(-ENOMEM);
- btrfs_init_delayed_node(node, root, ino);
+ node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+ btrfs_init_delayed_node(node, root, ino);
- /* Cached in the inode and can be accessed */
- refcount_set(&node->refs, 2);
+ /* cached in the btrfs inode and can be accessed */
+ refcount_set(&node->refs, 2);
- spin_lock(&root->inode_lock);
- ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS);
- if (ret) {
- spin_unlock(&root->inode_lock);
- kmem_cache_free(delayed_node_cache, node);
- if (ret != -EBUSY)
- return ERR_PTR(ret);
- }
- } while (ret);
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ kmem_cache_free(delayed_node_cache, node);
+ return ERR_PTR(ret);
+ }
+
+ spin_lock(&root->inode_lock);
+ ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
+ if (ret == -EEXIST) {
+ spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
+ radix_tree_preload_end();
+ goto again;
+ }
btrfs_inode->delayed_node = node;
spin_unlock(&root->inode_lock);
+ radix_tree_preload_end();
return node;
}
@@ -270,7 +276,8 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- xa_erase(&root->delayed_nodes, delayed_node->inode_id);
+ radix_tree_delete(&root->delayed_nodes_tree,
+ delayed_node->inode_id);
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
@@ -1863,35 +1870,34 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
- unsigned long index = 0;
- struct btrfs_delayed_node *delayed_node;
+ u64 inode_id = 0;
struct btrfs_delayed_node *delayed_nodes[8];
+ int i, n;
while (1) {
- int n = 0;
-
spin_lock(&root->inode_lock);
- if (xa_empty(&root->delayed_nodes)) {
+ n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
+ (void **)delayed_nodes, inode_id,
+ ARRAY_SIZE(delayed_nodes));
+ if (!n) {
spin_unlock(&root->inode_lock);
- return;
+ break;
}
- xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) {
+ inode_id = delayed_nodes[n - 1]->inode_id + 1;
+ for (i = 0; i < n; i++) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
- if (refcount_inc_not_zero(&delayed_node->refs)) {
- delayed_nodes[n] = delayed_node;
- n++;
- }
- if (n >= ARRAY_SIZE(delayed_nodes))
- break;
+ if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
+ delayed_nodes[i] = NULL;
}
- index++;
spin_unlock(&root->inode_lock);
- for (int i = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
+ if (!delayed_nodes[i])
+ continue;
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4ba005c41983..de440ebf5648 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -5,6 +5,7 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
+#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
@@ -485,7 +486,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
fs_info->nodesize);
- /* A dirty eb shouldn't disappear from extent_buffers */
+ /* A dirty eb shouldn't disappear from buffer_radix */
if (WARN_ON(!eb))
return -EUCLEAN;
@@ -1158,7 +1159,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
+ INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
btrfs_init_root_block_rsv(root);
@@ -1210,9 +1211,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&root->leak_list);
- spin_lock(&fs_info->fs_roots_lock);
+ spin_lock(&fs_info->fs_roots_radix_lock);
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
}
@@ -1659,11 +1660,12 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
{
struct btrfs_root *root;
- spin_lock(&fs_info->fs_roots_lock);
- root = xa_load(&fs_info->fs_roots, (unsigned long)root_id);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)root_id);
if (root)
root = btrfs_grab_root(root);
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return root;
}
@@ -1705,14 +1707,20 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
{
int ret;
- spin_lock(&fs_info->fs_roots_lock);
- ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid,
- root, GFP_NOFS);
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret)
+ return ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
if (ret == 0) {
btrfs_grab_root(root);
- set_bit(BTRFS_ROOT_REGISTERED, &root->state);
+ set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
}
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
return ret;
}
@@ -2342,9 +2350,9 @@ void btrfs_put_root(struct btrfs_root *root)
btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
#ifdef CONFIG_BTRFS_DEBUG
- spin_lock(&root->fs_info->fs_roots_lock);
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
list_del_init(&root->leak_list);
- spin_unlock(&root->fs_info->fs_roots_lock);
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
#endif
kfree(root);
}
@@ -2352,21 +2360,28 @@ void btrfs_put_root(struct btrfs_root *root)
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root;
- unsigned long index = 0;
+ int ret;
+ struct btrfs_root *gang[8];
+ int i;
while (!list_empty(&fs_info->dead_roots)) {
- root = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
- list_del(&root->root_list);
+ gang[0] = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&gang[0]->root_list);
- if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
- btrfs_drop_and_free_fs_root(fs_info, root);
- btrfs_put_root(root);
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
+ btrfs_drop_and_free_fs_root(fs_info, gang[0]);
+ btrfs_put_root(gang[0]);
}
- xa_for_each(&fs_info->fs_roots, index, root) {
- btrfs_drop_and_free_fs_root(fs_info, root);
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang));
+ if (!ret)
+ break;
+ for (i = 0; i < ret; i++)
+ btrfs_drop_and_free_fs_root(fs_info, gang[i]);
}
}
@@ -3134,8 +3149,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
- xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC);
- xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -3143,7 +3158,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
- spin_lock_init(&fs_info->fs_roots_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
@@ -3374,7 +3389,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
/*
* btrfs_find_orphan_roots() is responsible for finding all the dead
* roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
- * them into the fs_info->fs_roots. This must be done before
+ * them into the fs_info->fs_roots_radix tree. This must be done before
* calling btrfs_orphan_cleanup() on the tree root. If we don't do it
* first, then btrfs_orphan_cleanup() will delete a dead root's orphan
* item before the root's tree is deleted - this means that if we unmount
@@ -4499,11 +4514,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
{
bool drop_ref = false;
- spin_lock(&fs_info->fs_roots_lock);
- xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid);
- if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state))
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ radix_tree_delete(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid);
+ if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
drop_ref = true;
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
if (BTRFS_FS_ERROR(fs_info)) {
ASSERT(root->log_root == NULL);
@@ -4519,48 +4535,50 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *roots[8];
- unsigned long index = 0;
- int i;
+ u64 root_objectid = 0;
+ struct btrfs_root *gang[8];
+ int i = 0;
int err = 0;
- int grabbed;
+ unsigned int ret = 0;
while (1) {
- struct btrfs_root *root;
-
- spin_lock(&fs_info->fs_roots_lock);
- if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) {
- spin_unlock(&fs_info->fs_roots_lock);
- return err;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang));
+ if (!ret) {
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ break;
}
+ root_objectid = gang[ret - 1]->root_key.objectid + 1;
- grabbed = 0;
- xa_for_each_start(&fs_info->fs_roots, index, root, index) {
- /* Avoid grabbing roots in dead_roots */
- if (btrfs_root_refs(&root->root_item) > 0)
- roots[grabbed++] = btrfs_grab_root(root);
- if (grabbed >= ARRAY_SIZE(roots))
- break;
+ for (i = 0; i < ret; i++) {
+ /* Avoid to grab roots in dead_roots */
+ if (btrfs_root_refs(&gang[i]->root_item) == 0) {
+ gang[i] = NULL;
+ continue;
+ }
+ /* grab all the search result for later use */
+ gang[i] = btrfs_grab_root(gang[i]);
}
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
- for (i = 0; i < grabbed; i++) {
- if (!roots[i])
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
continue;
- index = roots[i]->root_key.objectid;
- err = btrfs_orphan_cleanup(roots[i]);
+ root_objectid = gang[i]->root_key.objectid;
+ err = btrfs_orphan_cleanup(gang[i]);
if (err)
- goto out;
- btrfs_put_root(roots[i]);
+ break;
+ btrfs_put_root(gang[i]);
}
- index++;
+ root_objectid++;
}
-out:
- /* Release the roots that remain uncleaned due to error */
- for (; i < grabbed; i++) {
- if (roots[i])
- btrfs_put_root(roots[i]);
+ /* release the uncleaned roots due to error */
+ for (; i < ret; i++) {
+ if (gang[i])
+ btrfs_put_root(gang[i]);
}
return err;
}
@@ -4879,28 +4897,31 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
{
- unsigned long index = 0;
- int grabbed = 0;
- struct btrfs_root *roots[8];
+ struct btrfs_root *gang[8];
+ u64 root_objectid = 0;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang))) != 0) {
+ int i;
- spin_lock(&fs_info->fs_roots_lock);
- while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index,
- ULONG_MAX, 8, XA_PRESENT))) {
- for (int i = 0; i < grabbed; i++)
- roots[i] = btrfs_grab_root(roots[i]);
- spin_unlock(&fs_info->fs_roots_lock);
+ for (i = 0; i < ret; i++)
+ gang[i] = btrfs_grab_root(gang[i]);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
- for (int i = 0; i < grabbed; i++) {
- if (!roots[i])
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
continue;
- index = roots[i]->root_key.objectid;
- btrfs_free_log(NULL, roots[i]);
- btrfs_put_root(roots[i]);
+ root_objectid = gang[i]->root_key.objectid;
+ btrfs_free_log(NULL, gang[i]);
+ btrfs_put_root(gang[i]);
}
- index++;
- spin_lock(&fs_info->fs_roots_lock);
+ root_objectid++;
+ spin_lock(&fs_info->fs_roots_radix_lock);
}
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log_root_tree(NULL, fs_info);
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0867c5cd6e01..a3afc15430ce 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3832,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
block_group->start == fs_info->data_reloc_bg ||
fs_info->data_reloc_bg == 0);
- if (block_group->ro) {
+ if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
ret = 1;
goto out;
}
@@ -3894,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
- if (ret && ffe_ctl->for_data_reloc)
+ if (ret && ffe_ctl->for_data_reloc &&
+ fs_info->data_reloc_bg == block_group->start) {
+ /*
+ * Do not allow further allocations from this block group.
+ * Compared to increasing the ->ro, setting the
+ * ->zoned_data_reloc_ongoing flag still allows nocow
+ * writers to come in. See btrfs_inc_nocow_writers().
+ *
+ * We need to disable an allocation to avoid an allocation of
+ * regular (non-relocation data) extent. With mix of relocation
+ * extents and regular extents, we can dispatch WRITE commands
+ * (for relocation extents) and ZONE APPEND commands (for
+ * regular extents) at the same time to the same zone, which
+ * easily break the write pointer.
+ */
+ block_group->zoned_data_reloc_ongoing = 1;
fs_info->data_reloc_bg = 0;
+ }
spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
@@ -5813,7 +5829,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
btrfs_qgroup_free_meta_all_pertrans(root);
- if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
else
btrfs_put_root(root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8f6b544ae616..f03ab5dbda7a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2966,7 +2966,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
}
/*
- * Find extent buffer for a given bytenr.
+ * Find extent buffer for a givne bytenr.
*
* This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
* in endio context.
@@ -2985,9 +2985,11 @@ static struct extent_buffer *find_extent_buffer_readpage(
return (struct extent_buffer *)page->private;
}
- /* For subpage case, we need to lookup extent buffer xarray */
- eb = xa_load(&fs_info->extent_buffers,
- bytenr >> fs_info->sectorsize_bits);
+ /* For subpage case, we need to lookup buffer radix tree */
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ bytenr >> fs_info->sectorsize_bits);
+ rcu_read_unlock();
ASSERT(eb);
return eb;
}
@@ -4435,8 +4437,8 @@ static struct extent_buffer *find_extent_buffer_nolock(
struct extent_buffer *eb;
rcu_read_lock();
- eb = xa_load(&fs_info->extent_buffers,
- start >> fs_info->sectorsize_bits);
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
return eb;
@@ -5241,13 +5243,14 @@ int extent_writepages(struct address_space *mapping,
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
ret = extent_write_cache_pages(mapping, wbc, &epd);
- btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
ASSERT(ret <= 0);
if (ret < 0) {
+ btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
end_write_bio(&epd, ret);
return ret;
}
flush_write_bio(&epd);
+ btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
return ret;
}
@@ -6128,22 +6131,24 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
if (!eb)
return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
-
- do {
- ret = xa_insert(&fs_info->extent_buffers,
- start >> fs_info->sectorsize_bits,
- eb, GFP_NOFS);
- if (ret == -ENOMEM) {
- exists = ERR_PTR(ret);
+again:
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
+ if (ret == -EEXIST) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
goto free_eb;
- }
- if (ret == -EBUSY) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
- }
- } while (ret);
-
+ else
+ goto again;
+ }
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
@@ -6318,22 +6323,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
- do {
- ret = xa_insert(&fs_info->extent_buffers,
- start >> fs_info->sectorsize_bits,
- eb, GFP_NOFS);
- if (ret == -ENOMEM) {
- exists = ERR_PTR(ret);
+again:
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
+
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
+ if (ret == -EEXIST) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
goto free_eb;
- }
- if (ret == -EBUSY) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
- }
- } while (ret);
-
+ else
+ goto again;
+ }
/* add one reference for the tree */
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
@@ -6378,8 +6386,10 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
- xa_erase(&fs_info->extent_buffers,
- eb->start >> fs_info->sectorsize_bits);
+ spin_lock(&fs_info->buffer_lock);
+ radix_tree_delete(&fs_info->buffer_radix,
+ eb->start >> fs_info->sectorsize_bits);
+ spin_unlock(&fs_info->buffer_lock);
} else {
spin_unlock(&eb->refs_lock);
}
@@ -7324,25 +7334,42 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
+#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
- struct extent_buffer *eb;
- unsigned long index;
+ struct extent_buffer *gang[GANG_LOOKUP_SIZE];
+ struct extent_buffer *found = NULL;
u64 page_start = page_offset(page);
+ u64 cur = page_start;
ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
lockdep_assert_held(&fs_info->buffer_lock);
- xa_for_each_start(&fs_info->extent_buffers, index, eb,
- page_start >> fs_info->sectorsize_bits) {
- if (in_range(eb->start, page_start, PAGE_SIZE))
- return eb;
- else if (eb->start >= page_start + PAGE_SIZE)
- /* Already beyond page end */
- return NULL;
+ while (cur < page_start + PAGE_SIZE) {
+ int ret;
+ int i;
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
+ (void **)gang, cur >> fs_info->sectorsize_bits,
+ min_t(unsigned int, GANG_LOOKUP_SIZE,
+ PAGE_SIZE / fs_info->nodesize));
+ if (ret == 0)
+ goto out;
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ goto out;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ goto out;
+ }
+ }
+ cur = gang[ret - 1]->start + gang[ret - 1]->len;
}
- return NULL;
+out:
+ return found;
}
static int try_release_subpage_extent_buffer(struct page *page)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1fd827b99c1b..9dfde1af8a64 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2323,25 +2323,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
- if (ret != BTRFS_NO_LOG_SYNC) {
+ if (ret == BTRFS_NO_LOG_SYNC) {
+ ret = btrfs_end_transaction(trans);
+ goto out;
+ }
+
+ /* We successfully logged the inode, attempt to sync the log. */
+ if (!ret) {
+ ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
- ret = btrfs_sync_log(trans, root, &ctx);
- if (!ret) {
- ret = btrfs_end_transaction(trans);
- goto out;
- }
- }
- if (!full_sync) {
- ret = btrfs_wait_ordered_range(inode, start, len);
- if (ret) {
- btrfs_end_transaction(trans);
- goto out;
- }
+ ret = btrfs_end_transaction(trans);
+ goto out;
}
- ret = btrfs_commit_transaction(trans);
- } else {
+ }
+
+ /*
+ * At this point we need to commit the transaction because we had
+ * btrfs_need_log_full_commit() or some other error.
+ *
+ * If we didn't do a full sync we have to stop the trans handle, wait on
+ * the ordered extents, start it again and commit the transaction. If
+ * we attempt to wait on the ordered extents here we could deadlock with
+ * something like fallocate() that is holding the extent lock trying to
+ * start a transaction while some other thread is trying to commit the
+ * transaction while we (fsync) are currently holding the transaction
+ * open.
+ */
+ if (!full_sync) {
ret = btrfs_end_transaction(trans);
+ if (ret)
+ goto out;
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret)
+ goto out;
+
+ /*
+ * This is safe to use here because we're only interested in
+ * making sure the transaction that had the ordered extents is
+ * committed. We aren't waiting on anything past this point,
+ * we're purely getting the transaction and committing it.
+ */
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+
+ /*
+ * We committed the transaction and there's no currently
+ * running transaction, this means everything we care
+ * about made it to disk and we are done.
+ */
+ if (ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
}
+
+ ret = btrfs_commit_transaction(trans);
out:
ASSERT(list_empty(&ctx.list));
err = file_check_and_advance_wb_err(file);
@@ -2719,7 +2756,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
min_size, false);
- BUG_ON(ret);
+ if (WARN_ON(ret))
+ goto out_trans;
trans->block_rsv = rsv;
cur_offset = start;
@@ -2803,6 +2841,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
extent_info->file_offset += replace_len;
}
+ /*
+ * We are releasing our handle on the transaction, balance the
+ * dirty pages of the btree inode and flush delayed items, and
+ * then get a new transaction handle, which may now point to a
+ * new transaction in case someone else may have committed the
+ * transaction we used to replace/drop file extent items. So
+ * bump the inode's iversion and update mtime and ctime except
+ * if we are called from a dedupe context. This is because a
+ * power failure/crash may happen after the transaction is
+ * committed and before we finish replacing/dropping all the
+ * file extent items we need.
+ */
+ inode_inc_iversion(&inode->vfs_inode);
+
+ if (!extent_info || extent_info->update_times) {
+ inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
+ inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
+ }
+
ret = btrfs_update_inode(trans, root, inode);
if (ret)
break;
@@ -2819,7 +2876,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
- BUG_ON(ret); /* shouldn't happen */
+ if (WARN_ON(ret))
+ break;
trans->block_rsv = rsv;
cur_offset = drop_args.drop_end;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81737eff92f3..d50448bf8eed 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3195,6 +3195,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
+ btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
} else {
BUG_ON(root == fs_info->tree_root);
ret = insert_ordered_extent_file_extent(trans, ordered_extent);
@@ -3576,7 +3578,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
- /* Bail out if the cleanup is already running. */
if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
return 0;
@@ -3659,17 +3660,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
*
* btrfs_find_orphan_roots() ran before us, which has
* found all deleted roots and loaded them into
- * fs_info->fs_roots. So here we can find if an
+ * fs_info->fs_roots_radix. So here we can find if an
* orphan item corresponds to a deleted root by looking
- * up the root from that xarray.
+ * up the root from that radix tree.
*/
- spin_lock(&fs_info->fs_roots_lock);
- dead_root = xa_load(&fs_info->fs_roots,
- (unsigned long)found_key.objectid);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)found_key.objectid);
if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
is_dead_root = 1;
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
if (is_dead_root) {
/* prevent this orphan from being found again */
@@ -3909,7 +3910,7 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in the delayed_nodes xarray.
+ * in delayed_nodes_tree.
*/
if (BTRFS_I(inode)->last_trans == fs_info->generation)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -7679,7 +7680,19 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
- ret = -ENOTBLK;
+ /*
+ * If we are in a NOWAIT context, return -EAGAIN in order to
+ * fallback to buffered IO. This is not only because we can
+ * block with buffered IO (no support for NOWAIT semantics at
+ * the moment) but also to avoid returning short reads to user
+ * space - this happens if we were able to read some data from
+ * previous non-compressed extents and then when we fallback to
+ * buffered IO, at btrfs_file_read_iter() by calling
+ * filemap_read(), we fail to fault in pages for the read buffer,
+ * in which case filemap_read() returns a short read (the number
+ * of bytes previously read is > 0, so it does not return -EFAULT).
+ */
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
goto unlock_err;
}
@@ -9897,6 +9910,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
extent_info.file_offset = file_offset;
extent_info.extent_buf = (char *)&stack_fi;
extent_info.is_new_extent = true;
+ extent_info.update_times = true;
extent_info.qgroup_reserved = qgroup_released;
extent_info.insertions = 0;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 313d9d685adb..33461b4f9c8b 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne
start_ns = ktime_get_ns();
down_read_nested(&eb->lock, nest);
- eb->lock_owner = current->pid;
trace_btrfs_tree_read_lock(eb, start_ns);
}
@@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
if (down_read_trylock(&eb->lock)) {
- eb->lock_owner = current->pid;
trace_btrfs_try_tree_read_lock(eb);
return 1;
}
@@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_read_unlock(eb);
- eb->lock_owner = 0;
up_read(&eb->lock);
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index c39f8b3a5a4a..a3549d587464 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
int ret;
const u64 len = olen_aligned;
u64 last_dest_end = destoff;
+ u64 prev_extent_end = off;
ret = -ENOMEM;
buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
@@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
key.offset = off;
while (1) {
- u64 next_key_min_offset = key.offset + 1;
struct btrfs_file_extent_item *extent;
u64 extent_gen;
int type;
@@ -431,14 +431,21 @@ process_slot:
* The first search might have left us at an extent item that
* ends before our target range's start, can happen if we have
* holes and NO_HOLES feature enabled.
+ *
+ * Subsequent searches may leave us on a file range we have
+ * processed before - this happens due to a race with ordered
+ * extent completion for a file range that is outside our source
+ * range, but that range was part of a file extent item that
+ * also covered a leading part of our source range.
*/
- if (key.offset + datal <= off) {
+ if (key.offset + datal <= prev_extent_end) {
path->slots[0]++;
goto process_slot;
} else if (key.offset >= off + len) {
break;
}
- next_key_min_offset = key.offset + datal;
+
+ prev_extent_end = key.offset + datal;
size = btrfs_item_size(leaf, slot);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
size);
@@ -489,6 +496,7 @@ process_slot:
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
clone_info.is_new_extent = false;
+ clone_info.update_times = !no_time_update;
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
drop_start, new_key.offset + datal - 1,
&clone_info, &trans);
@@ -550,7 +558,7 @@ process_slot:
break;
btrfs_release_path(path);
- key.offset = next_key_min_offset;
+ key.offset = prev_extent_end;
if (fatal_signal_pending(current)) {
ret = -EINTR;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fa56890ff81f..c7dea639a56f 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -10,6 +10,7 @@
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
+#include <linux/radix-tree.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
@@ -127,7 +128,7 @@ struct send_ctx {
struct list_head new_refs;
struct list_head deleted_refs;
- struct xarray name_cache;
+ struct radix_tree_root name_cache;
struct list_head name_cache_list;
int name_cache_size;
@@ -268,13 +269,14 @@ struct orphan_dir_info {
struct name_cache_entry {
struct list_head list;
/*
- * On 32bit kernels, xarray has only 32bit indices, but we need to
- * handle 64bit inums. We use the lower 32bit of the 64bit inum to store
- * it in the tree. If more than one inum would fall into the same entry,
- * we use inum_aliases to store the additional entries. inum_aliases is
- * also used to store entries with the same inum but different generations.
+ * radix_tree has only 32bit entries but we need to handle 64bit inums.
+ * We use the lower 32bit of the 64bit inum to store it in the tree. If
+ * more then one inum would fall into the same entry, we use radix_list
+ * to store the additional entries. radix_list is also used to store
+ * entries where two entries have the same inum but different
+ * generations.
*/
- struct list_head inum_aliases;
+ struct list_head radix_list;
u64 ino;
u64 gen;
u64 parent_ino;
@@ -2024,9 +2026,9 @@ out:
}
/*
- * Insert a name cache entry. On 32bit kernels the xarray index is 32bit,
+ * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
* so we need to do some special handling in case we have clashes. This function
- * takes care of this with the help of name_cache_entry::inum_aliases.
+ * takes care of this with the help of name_cache_entry::radix_list.
* In case of error, nce is kfreed.
*/
static int name_cache_insert(struct send_ctx *sctx,
@@ -2035,7 +2037,8 @@ static int name_cache_insert(struct send_ctx *sctx,
int ret = 0;
struct list_head *nce_head;
- nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
if (!nce_head) {
nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
if (!nce_head) {
@@ -2044,14 +2047,14 @@ static int name_cache_insert(struct send_ctx *sctx,
}
INIT_LIST_HEAD(nce_head);
- ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL);
+ ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
if (ret < 0) {
kfree(nce_head);
kfree(nce);
return ret;
}
}
- list_add_tail(&nce->inum_aliases, nce_head);
+ list_add_tail(&nce->radix_list, nce_head);
list_add_tail(&nce->list, &sctx->name_cache_list);
sctx->name_cache_size++;
@@ -2063,14 +2066,15 @@ static void name_cache_delete(struct send_ctx *sctx,
{
struct list_head *nce_head;
- nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
if (!nce_head) {
btrfs_err(sctx->send_root->fs_info,
"name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
nce->ino, sctx->name_cache_size);
}
- list_del(&nce->inum_aliases);
+ list_del(&nce->radix_list);
list_del(&nce->list);
sctx->name_cache_size--;
@@ -2078,7 +2082,7 @@ static void name_cache_delete(struct send_ctx *sctx,
* We may not get to the final release of nce_head if the lookup fails
*/
if (nce_head && list_empty(nce_head)) {
- xa_erase(&sctx->name_cache, (unsigned long)nce->ino);
+ radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
kfree(nce_head);
}
}
@@ -2089,11 +2093,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
struct list_head *nce_head;
struct name_cache_entry *cur;
- nce_head = xa_load(&sctx->name_cache, (unsigned long)ino);
+ nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
if (!nce_head)
return NULL;
- list_for_each_entry(cur, nce_head, inum_aliases) {
+ list_for_each_entry(cur, nce_head, radix_list) {
if (cur->ino == ino && cur->gen == gen)
return cur;
}
@@ -7518,7 +7522,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
- xa_init_flags(&sctx->name_cache, GFP_KERNEL);
+ INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
INIT_LIST_HEAD(&sctx->name_cache_list);
sctx->flags = arg->flags;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 1591bfa55bcc..d8e56edd6991 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -150,8 +150,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
- unsigned long index;
- struct extent_buffer *eb;
+ struct radix_tree_iter iter;
+ void **slot;
struct btrfs_device *dev, *tmp;
if (!fs_info)
@@ -163,9 +163,25 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
test_mnt->mnt_sb->s_fs_info = NULL;
- xa_for_each(&fs_info->extent_buffers, index, eb) {
+ spin_lock(&fs_info->buffer_lock);
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+ struct extent_buffer *eb;
+
+ eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+ if (!eb)
+ continue;
+ /* Shouldn't happen but that kind of thinking creates CVE's */
+ if (radix_tree_exception(eb)) {
+ if (radix_tree_deref_retry(eb))
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ slot = radix_tree_iter_resume(slot, &iter);
+ spin_unlock(&fs_info->buffer_lock);
free_extent_buffer_stale(eb);
+ spin_lock(&fs_info->buffer_lock);
}
+ spin_unlock(&fs_info->buffer_lock);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
@@ -186,7 +202,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
if (!root)
return;
/* Will be freed by btrfs_free_fs_roots */
- if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state)))
+ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
return;
btrfs_global_root_delete(root);
btrfs_put_root(root);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 06c0a958d114..875b801ab3d7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -23,7 +23,7 @@
#include "space-info.h"
#include "zoned.h"
-#define BTRFS_ROOT_TRANS_TAG XA_MARK_0
+#define BTRFS_ROOT_TRANS_TAG 0
/*
* Transaction states and transitions
@@ -437,15 +437,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
*/
smp_wmb();
- spin_lock(&fs_info->fs_roots_lock);
+ spin_lock(&fs_info->fs_roots_radix_lock);
if (root->last_trans == trans->transid && !force) {
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return 0;
}
- xa_set_mark(&fs_info->fs_roots,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&fs_info->fs_roots_lock);
+ radix_tree_tag_set(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
root->last_trans = trans->transid;
/* this is pretty tricky. We don't want to
@@ -487,9 +487,11 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
spin_unlock(&cur_trans->dropped_roots_lock);
/* Make sure we don't try to update the root at commit time */
- xa_clear_mark(&fs_info->fs_roots,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
}
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
@@ -1402,8 +1404,9 @@ void btrfs_add_dead_root(struct btrfs_root *root)
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root;
- unsigned long index;
+ struct btrfs_root *gang[8];
+ int i;
+ int ret;
/*
* At this point no one can be using this transaction to modify any tree
@@ -1411,46 +1414,57 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
*/
ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
- spin_lock(&fs_info->fs_roots_lock);
- xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) {
- int ret;
-
- /*
- * At this point we can neither have tasks logging inodes
- * from a root nor trying to commit a log tree.
- */
- ASSERT(atomic_read(&root->log_writers) == 0);
- ASSERT(atomic_read(&root->log_commit[0]) == 0);
- ASSERT(atomic_read(&root->log_commit[1]) == 0);
-
- xa_clear_mark(&fs_info->fs_roots,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&fs_info->fs_roots_lock);
-
- btrfs_free_log(trans, root);
- ret = btrfs_update_reloc_root(trans, root);
- if (ret)
- return ret;
-
- /* See comments in should_cow_block() */
- clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
- smp_mb__after_atomic();
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while (1) {
+ ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_TRANS_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ struct btrfs_root *root = gang[i];
+ int ret2;
+
+ /*
+ * At this point we can neither have tasks logging inodes
+ * from a root nor trying to commit a log tree.
+ */
+ ASSERT(atomic_read(&root->log_writers) == 0);
+ ASSERT(atomic_read(&root->log_commit[0]) == 0);
+ ASSERT(atomic_read(&root->log_commit[1]) == 0);
+
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ btrfs_free_log(trans, root);
+ ret2 = btrfs_update_reloc_root(trans, root);
+ if (ret2)
+ return ret2;
+
+ /* see comments in should_cow_block() */
+ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_mb__after_atomic();
+
+ if (root->commit_root != root->node) {
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
+ btrfs_set_root_node(&root->root_item,
+ root->node);
+ }
- if (root->commit_root != root->node) {
- list_add_tail(&root->dirty_list,
- &trans->transaction->switch_commits);
- btrfs_set_root_node(&root->root_item, root->node);
+ ret2 = btrfs_update_root(trans, fs_info->tree_root,
+ &root->root_key,
+ &root->root_item);
+ if (ret2)
+ return ret2;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ btrfs_qgroup_free_meta_all_pertrans(root);
}
-
- ret = btrfs_update_root(trans, fs_info->tree_root,
- &root->root_key, &root->root_item);
- if (ret)
- return ret;
- spin_lock(&fs_info->fs_roots_lock);
- btrfs_qgroup_free_meta_all_pertrans(root);
}
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return 0;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 11237a913bee..d99026df6f67 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1735,12 +1735,14 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&mapped_length, &bioc);
if (ret || !bioc || mapped_length < PAGE_SIZE) {
- btrfs_put_bioc(bioc);
- return -EIO;
+ ret = -EIO;
+ goto out_put_bioc;
}
- if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- return -EINVAL;
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EINVAL;
+ goto out_put_bioc;
+ }
nofs_flag = memalloc_nofs_save();
nmirrors = (int)bioc->num_stripes;
@@ -1759,7 +1761,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
break;
}
memalloc_nofs_restore(nofs_flag);
-
+out_put_bioc:
+ btrfs_put_bioc(bioc);
return ret;
}
@@ -1885,7 +1888,6 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct map_lookup *map;
- bool need_zone_finish;
int ret = 0;
int i;
@@ -1942,12 +1944,6 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
}
}
- /*
- * The block group is not fully allocated, so not fully written yet. We
- * need to send ZONE_FINISH command to free up an active zone.
- */
- need_zone_finish = !btrfs_zoned_bg_is_full(block_group);
-
block_group->zone_is_active = 0;
block_group->alloc_offset = block_group->zone_capacity;
block_group->free_space_ctl->free_space = 0;
@@ -1963,15 +1959,13 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
if (device->zone_info->max_active_zones == 0)
continue;
- if (need_zone_finish) {
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- device->zone_info->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
- if (ret)
- return ret;
- }
+ if (ret)
+ return ret;
btrfs_dev_clear_active_zone(device, physical);
}
@@ -2139,3 +2133,30 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
factor = div64_u64(used * 100, total);
return factor >= fs_info->bg_reclaim_threshold;
}
+
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length)
+{
+ struct btrfs_block_group *block_group;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ /* It should be called on a previous data relocation block group. */
+ ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
+
+ spin_lock(&block_group->lock);
+ if (!block_group->zoned_data_reloc_ongoing)
+ goto out;
+
+ /* All relocation extents are written. */
+ if (block_group->start + block_group->alloc_offset == logical + length) {
+ /* Now, release this block group for further allocations. */
+ block_group->zoned_data_reloc_ongoing = 0;
+ }
+
+out:
+ spin_unlock(&block_group->lock);
+ btrfs_put_block_group(block_group);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index bb1a189e11f9..6b2eec99162b 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -77,6 +77,8 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -243,6 +245,9 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
return false;
}
+
+static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index a41ae6efc545..1fee702d5529 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -21,7 +21,8 @@ static int cachefiles_ondemand_fd_release(struct inode *inode,
* anon_fd.
*/
xas_for_each(&xas, req, ULONG_MAX) {
- if (req->msg.opcode == CACHEFILES_OP_READ) {
+ if (req->msg.object_id == object_id &&
+ req->msg.opcode == CACHEFILES_OP_READ) {
req->error = -EIO;
complete(&req->done);
xas_store(&xas, NULL);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6dee88815491..d6e5916138e4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -63,7 +63,7 @@
(CONGESTION_ON_THRESH(congestion_kb) >> 2))
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct folio *folio, void **_fsdata);
+ struct folio **foliop, void **_fsdata);
static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
@@ -1288,18 +1288,19 @@ ceph_find_incompatible(struct page *page)
}
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct folio *folio, void **_fsdata)
+ struct folio **foliop, void **_fsdata)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- snapc = ceph_find_incompatible(folio_page(folio, 0));
+ snapc = ceph_find_incompatible(folio_page(*foliop, 0));
if (snapc) {
int r;
- folio_unlock(folio);
- folio_put(folio);
+ folio_unlock(*foliop);
+ folio_put(*foliop);
+ *foliop = NULL;
if (IS_ERR(snapc))
return PTR_ERR(snapc);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 38c930384d41..ac8fd5e7f540 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4377,6 +4377,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
ihold(inode);
dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
+ ceph_wait_on_async_create(inode);
ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 1dd995efd5b8..2cfbac8bb965 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -162,6 +162,8 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr);
else if (iface->sockaddr.ss_family == AF_INET6)
seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr);
+ if (!iface->is_active)
+ seq_puts(m, "\t\t[for-cleanup]\n");
}
static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
@@ -221,6 +223,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
+ struct cifs_server_iface *iface;
int c, i, j;
seq_puts(m,
@@ -456,11 +459,10 @@ skip_rdma:
if (ses->iface_count)
seq_printf(m, "\n\n\tServer interfaces: %zu",
ses->iface_count);
- for (j = 0; j < ses->iface_count; j++) {
- struct cifs_server_iface *iface;
-
- iface = &ses->iface_list[j];
- seq_printf(m, "\n\t%d)", j+1);
+ j = 0;
+ list_for_each_entry(iface, &ses->iface_list,
+ iface_head) {
+ seq_printf(m, "\n\t%d)", ++j);
cifs_dump_iface(m, iface);
if (is_ses_using_iface(ses, iface))
seq_puts(m, "\t\t[CONNECTED]\n");
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e7737166e5b8..a643c84ff1e9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -80,6 +80,9 @@
#define SMB_DNS_RESOLVE_INTERVAL_MIN 120
#define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600
+/* smb multichannel query server interfaces interval in seconds */
+#define SMB_INTERFACE_POLL_INTERVAL 600
+
/* maximum number of PDUs in one compound */
#define MAX_COMPOUND 5
@@ -933,15 +936,67 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
#endif
struct cifs_server_iface {
+ struct list_head iface_head;
+ struct kref refcount;
size_t speed;
unsigned int rdma_capable : 1;
unsigned int rss_capable : 1;
+ unsigned int is_active : 1; /* unset if non existent */
struct sockaddr_storage sockaddr;
};
+/* release iface when last ref is dropped */
+static inline void
+release_iface(struct kref *ref)
+{
+ struct cifs_server_iface *iface = container_of(ref,
+ struct cifs_server_iface,
+ refcount);
+ list_del_init(&iface->iface_head);
+ kfree(iface);
+}
+
+/*
+ * compare two interfaces a and b
+ * return 0 if everything matches.
+ * return 1 if a has higher link speed, or rdma capable, or rss capable
+ * return -1 otherwise.
+ */
+static inline int
+iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b)
+{
+ int cmp_ret = 0;
+
+ WARN_ON(!a || !b);
+ if (a->speed == b->speed) {
+ if (a->rdma_capable == b->rdma_capable) {
+ if (a->rss_capable == b->rss_capable) {
+ cmp_ret = memcmp(&a->sockaddr, &b->sockaddr,
+ sizeof(a->sockaddr));
+ if (!cmp_ret)
+ return 0;
+ else if (cmp_ret > 0)
+ return 1;
+ else
+ return -1;
+ } else if (a->rss_capable > b->rss_capable)
+ return 1;
+ else
+ return -1;
+ } else if (a->rdma_capable > b->rdma_capable)
+ return 1;
+ else
+ return -1;
+ } else if (a->speed > b->speed)
+ return 1;
+ else
+ return -1;
+}
+
struct cifs_chan {
unsigned int in_reconnect : 1; /* if session setup in progress for this channel */
struct TCP_Server_Info *server;
+ struct cifs_server_iface *iface; /* interface in use */
__u8 signkey[SMB3_SIGN_KEY_SIZE];
};
@@ -993,7 +1048,7 @@ struct cifs_ses {
*/
spinlock_t iface_lock;
/* ========= begin: protected by iface_lock ======== */
- struct cifs_server_iface *iface_list;
+ struct list_head iface_list;
size_t iface_count;
unsigned long iface_last_update; /* jiffies */
/* ========= end: protected by iface_lock ======== */
@@ -1203,6 +1258,7 @@ struct cifs_tcon {
#ifdef CONFIG_CIFS_DFS_UPCALL
struct list_head ulist; /* cache update list */
#endif
+ struct delayed_work query_interfaces; /* query interfaces workqueue job */
};
/*
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 3b7366ec03c7..d59aebefa71c 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -636,6 +636,13 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses,
bool
cifs_chan_needs_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server);
+bool
+cifs_chan_is_iface_active(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+int
+cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server);
+int
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1849e3411487..386bb523c69e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -145,6 +145,25 @@ requeue_resolve:
return rc;
}
+static void smb2_query_server_interfaces(struct work_struct *work)
+{
+ int rc;
+ struct cifs_tcon *tcon = container_of(work,
+ struct cifs_tcon,
+ query_interfaces.work);
+
+ /*
+ * query server network interfaces, in case they change
+ */
+ rc = SMB3_request_interfaces(0, tcon);
+ if (rc) {
+ cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
+ __func__, rc);
+ }
+
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+}
static void cifs_resolve_server(struct work_struct *work)
{
@@ -217,7 +236,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
bool mark_smb_session)
{
struct TCP_Server_Info *pserver;
- struct cifs_ses *ses;
+ struct cifs_ses *ses, *nses;
struct cifs_tcon *tcon;
/*
@@ -231,7 +250,20 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
+ /* check if iface is still active */
+ if (!cifs_chan_is_iface_active(ses, server)) {
+ /*
+ * HACK: drop the lock before calling
+ * cifs_chan_update_iface to avoid deadlock
+ */
+ ses->ses_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+ cifs_chan_update_iface(ses, server);
+ spin_lock(&cifs_tcp_ses_lock);
+ ses->ses_count--;
+ }
+
spin_lock(&ses->chan_lock);
if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server))
goto next_session;
@@ -1886,7 +1918,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
list_del_init(&ses->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- spin_lock(&ses->chan_lock);
chan_count = ses->chan_count;
/* close any extra channels */
@@ -1894,13 +1925,14 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
int i;
for (i = 1; i < chan_count; i++) {
- spin_unlock(&ses->chan_lock);
+ if (ses->chans[i].iface) {
+ kref_put(&ses->chans[i].iface->refcount, release_iface);
+ ses->chans[i].iface = NULL;
+ }
cifs_put_tcp_session(ses->chans[i].server, 0);
- spin_lock(&ses->chan_lock);
ses->chans[i].server = NULL;
}
}
- spin_unlock(&ses->chan_lock);
sesInfoFree(ses);
cifs_put_tcp_session(server, 0);
@@ -2270,6 +2302,9 @@ cifs_put_tcon(struct cifs_tcon *tcon)
list_del_init(&tcon->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
+ /* cancel polling of interfaces */
+ cancel_delayed_work_sync(&tcon->query_interfaces);
+
if (tcon->use_witness) {
int rc;
@@ -2507,6 +2542,12 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->local_lease = ctx->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
+ /* schedule query interfaces poll */
+ INIT_DELAYED_WORK(&tcon->query_interfaces,
+ smb2_query_server_interfaces);
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+
spin_lock(&cifs_tcp_ses_lock);
list_add(&tcon->tcon_list, &ses->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -3982,10 +4023,16 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct nls_table *nls_info)
{
int rc = -ENOSYS;
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
+ struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
bool is_binding = false;
-
spin_lock(&cifs_tcp_ses_lock);
+ if (server->dstaddr.ss_family == AF_INET6)
+ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr);
+ else
+ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr);
+
if (ses->ses_status != SES_GOOD &&
ses->ses_status != SES_NEW &&
ses->ses_status != SES_NEED_RECON) {
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index c69e1240d730..0e84e6fcf8ab 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -75,6 +75,7 @@ sesInfoAlloc(void)
INIT_LIST_HEAD(&ret_buf->tcon_list);
mutex_init(&ret_buf->session_mutex);
spin_lock_init(&ret_buf->iface_lock);
+ INIT_LIST_HEAD(&ret_buf->iface_list);
spin_lock_init(&ret_buf->chan_lock);
}
return ret_buf;
@@ -83,6 +84,8 @@ sesInfoAlloc(void)
void
sesInfoFree(struct cifs_ses *buf_to_free)
{
+ struct cifs_server_iface *iface = NULL, *niface = NULL;
+
if (buf_to_free == NULL) {
cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n");
return;
@@ -96,7 +99,11 @@ sesInfoFree(struct cifs_ses *buf_to_free)
kfree(buf_to_free->user_name);
kfree(buf_to_free->domainName);
kfree_sensitive(buf_to_free->auth_key.response);
- kfree(buf_to_free->iface_list);
+ spin_lock(&buf_to_free->iface_lock);
+ list_for_each_entry_safe(iface, niface, &buf_to_free->iface_list,
+ iface_head)
+ kref_put(&iface->refcount, release_iface);
+ spin_unlock(&buf_to_free->iface_lock);
kfree_sensitive(buf_to_free);
}
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d417de354d9d..02c8b2906196 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -58,7 +58,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface)
spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
- if (is_server_using_iface(ses->chans[i].server, iface)) {
+ if (ses->chans[i].iface == iface) {
spin_unlock(&ses->chan_lock);
return true;
}
@@ -146,16 +146,24 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses,
return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index);
}
+bool
+cifs_chan_is_iface_active(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ return ses->chans[chan_index].iface &&
+ ses->chans[chan_index].iface->is_active;
+}
+
/* returns number of channels added */
int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
{
int old_chan_count, new_chan_count;
int left;
- int i = 0;
int rc = 0;
int tries = 0;
- struct cifs_server_iface *ifaces = NULL;
- size_t iface_count;
+ struct cifs_server_iface *iface = NULL, *niface = NULL;
spin_lock(&ses->chan_lock);
@@ -185,32 +193,16 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
spin_unlock(&ses->chan_lock);
/*
- * Make a copy of the iface list at the time and use that
- * instead so as to not hold the iface spinlock for opening
- * channels
- */
- spin_lock(&ses->iface_lock);
- iface_count = ses->iface_count;
- if (iface_count <= 0) {
- spin_unlock(&ses->iface_lock);
- cifs_dbg(VFS, "no iface list available to open channels\n");
- return 0;
- }
- ifaces = kmemdup(ses->iface_list, iface_count*sizeof(*ifaces),
- GFP_ATOMIC);
- if (!ifaces) {
- spin_unlock(&ses->iface_lock);
- return 0;
- }
- spin_unlock(&ses->iface_lock);
-
- /*
* Keep connecting to same, fastest, iface for all channels as
* long as its RSS. Try next fastest one if not RSS or channel
* creation fails.
*/
+ spin_lock(&ses->iface_lock);
+ iface = list_first_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ spin_unlock(&ses->iface_lock);
+
while (left > 0) {
- struct cifs_server_iface *iface;
tries++;
if (tries > 3*ses->chan_max) {
@@ -219,31 +211,128 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
break;
}
- iface = &ifaces[i];
- if (is_ses_using_iface(ses, iface) && !iface->rss_capable) {
- i = (i+1) % iface_count;
- continue;
+ spin_lock(&ses->iface_lock);
+ if (!ses->iface_count) {
+ spin_unlock(&ses->iface_lock);
+ break;
}
- rc = cifs_ses_add_channel(cifs_sb, ses, iface);
- if (rc) {
- cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n",
- i, rc);
- i = (i+1) % iface_count;
- continue;
+ list_for_each_entry_safe_from(iface, niface, &ses->iface_list,
+ iface_head) {
+ /* skip ifaces that are unusable */
+ if (!iface->is_active ||
+ (is_ses_using_iface(ses, iface) &&
+ !iface->rss_capable)) {
+ continue;
+ }
+
+ /* take ref before unlock */
+ kref_get(&iface->refcount);
+
+ spin_unlock(&ses->iface_lock);
+ rc = cifs_ses_add_channel(cifs_sb, ses, iface);
+ spin_lock(&ses->iface_lock);
+
+ if (rc) {
+ cifs_dbg(VFS, "failed to open extra channel on iface:%pIS rc=%d\n",
+ &iface->sockaddr,
+ rc);
+ kref_put(&iface->refcount, release_iface);
+ continue;
+ }
+
+ cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n",
+ &iface->sockaddr);
+ break;
}
+ spin_unlock(&ses->iface_lock);
- cifs_dbg(FYI, "successfully opened new channel on iface#%d\n",
- i);
left--;
new_chan_count++;
}
- kfree(ifaces);
return new_chan_count - old_chan_count;
}
/*
+ * update the iface for the channel if necessary.
+ * will return 0 when iface is updated, 1 if removed, 2 otherwise
+ * Must be called with chan_lock held.
+ */
+int
+cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
+{
+ unsigned int chan_index;
+ struct cifs_server_iface *iface = NULL;
+ struct cifs_server_iface *old_iface = NULL;
+ int rc = 0;
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (!chan_index) {
+ spin_unlock(&ses->chan_lock);
+ return 0;
+ }
+
+ if (ses->chans[chan_index].iface) {
+ old_iface = ses->chans[chan_index].iface;
+ if (old_iface->is_active) {
+ spin_unlock(&ses->chan_lock);
+ return 1;
+ }
+ }
+ spin_unlock(&ses->chan_lock);
+
+ spin_lock(&ses->iface_lock);
+ /* then look for a new one */
+ list_for_each_entry(iface, &ses->iface_list, iface_head) {
+ if (!iface->is_active ||
+ (is_ses_using_iface(ses, iface) &&
+ !iface->rss_capable)) {
+ continue;
+ }
+ kref_get(&iface->refcount);
+ }
+
+ if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ rc = 1;
+ iface = NULL;
+ cifs_dbg(FYI, "unable to find a suitable iface\n");
+ }
+
+ /* now drop the ref to the current iface */
+ if (old_iface && iface) {
+ kref_put(&old_iface->refcount, release_iface);
+ cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n",
+ &old_iface->sockaddr,
+ &iface->sockaddr);
+ } else if (old_iface) {
+ kref_put(&old_iface->refcount, release_iface);
+ cifs_dbg(FYI, "releasing ref to iface: %pIS\n",
+ &old_iface->sockaddr);
+ } else {
+ WARN_ON(!iface);
+ cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr);
+ }
+ spin_unlock(&ses->iface_lock);
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ ses->chans[chan_index].iface = iface;
+
+ /* No iface is found. if secondary chan, drop connection */
+ if (!iface && CIFS_SERVER_IS_CHAN(server))
+ ses->chans[chan_index].server = NULL;
+
+ spin_unlock(&ses->chan_lock);
+
+ if (!iface && CIFS_SERVER_IS_CHAN(server))
+ cifs_put_tcp_session(server, false);
+
+ return rc;
+}
+
+/*
* If server is a channel of ses, return the corresponding enclosing
* cifs_chan otherwise return NULL.
*/
@@ -355,6 +444,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
spin_unlock(&ses->chan_lock);
goto out;
}
+ chan->iface = iface;
ses->chan_count++;
atomic_set(&ses->chan_seq, 0);
@@ -384,6 +474,14 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
out:
if (rc && chan->server) {
+ /*
+ * we should avoid race with these delayed works before we
+ * remove this channel
+ */
+ cancel_delayed_work_sync(&chan->server->echo);
+ cancel_delayed_work_sync(&chan->server->resolve);
+ cancel_delayed_work_sync(&chan->server->reconnect);
+
spin_lock(&ses->chan_lock);
/* we rely on all bits beyond chan_count to be clear */
cifs_chan_clear_need_reconnect(ses, chan->server);
@@ -394,10 +492,9 @@ out:
*/
WARN_ON(ses->chan_count < 1);
spin_unlock(&ses->chan_lock);
- }
- if (rc && chan->server)
cifs_put_tcp_session(chan->server, 0);
+ }
return rc;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 8543cafdfd34..8802995b2d3d 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -512,73 +512,41 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
static int
parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
size_t buf_len,
- struct cifs_server_iface **iface_list,
- size_t *iface_count)
+ struct cifs_ses *ses)
{
struct network_interface_info_ioctl_rsp *p;
struct sockaddr_in *addr4;
struct sockaddr_in6 *addr6;
struct iface_info_ipv4 *p4;
struct iface_info_ipv6 *p6;
- struct cifs_server_iface *info;
+ struct cifs_server_iface *info = NULL, *iface = NULL, *niface = NULL;
+ struct cifs_server_iface tmp_iface;
ssize_t bytes_left;
size_t next = 0;
int nb_iface = 0;
- int rc = 0;
-
- *iface_list = NULL;
- *iface_count = 0;
-
- /*
- * Fist pass: count and sanity check
- */
+ int rc = 0, ret = 0;
bytes_left = buf_len;
p = buf;
- while (bytes_left >= sizeof(*p)) {
- nb_iface++;
- next = le32_to_cpu(p->Next);
- if (!next) {
- bytes_left -= sizeof(*p);
- break;
- }
- p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);
- bytes_left -= next;
- }
-
- if (!nb_iface) {
- cifs_dbg(VFS, "%s: malformed interface info\n", __func__);
- rc = -EINVAL;
- goto out;
- }
-
- /* Azure rounds the buffer size up 8, to a 16 byte boundary */
- if ((bytes_left > 8) || p->Next)
- cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-
+ spin_lock(&ses->iface_lock);
/*
- * Second pass: extract info to internal structure
+ * Go through iface_list and do kref_put to remove
+ * any unused ifaces. ifaces in use will be removed
+ * when the last user calls a kref_put on it
*/
-
- *iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL);
- if (!*iface_list) {
- rc = -ENOMEM;
- goto out;
+ list_for_each_entry_safe(iface, niface, &ses->iface_list,
+ iface_head) {
+ iface->is_active = 0;
+ kref_put(&iface->refcount, release_iface);
}
+ spin_unlock(&ses->iface_lock);
- info = *iface_list;
- bytes_left = buf_len;
- p = buf;
while (bytes_left >= sizeof(*p)) {
- info->speed = le64_to_cpu(p->LinkSpeed);
- info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
- info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
-
- cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count);
- cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
- cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__,
- le32_to_cpu(p->Capability));
+ memset(&tmp_iface, 0, sizeof(tmp_iface));
+ tmp_iface.speed = le64_to_cpu(p->LinkSpeed);
+ tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
+ tmp_iface.rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
switch (p->Family) {
/*
@@ -587,7 +555,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
* conversion explicit in case either one changes.
*/
case INTERNETWORK:
- addr4 = (struct sockaddr_in *)&info->sockaddr;
+ addr4 = (struct sockaddr_in *)&tmp_iface.sockaddr;
p4 = (struct iface_info_ipv4 *)p->Buffer;
addr4->sin_family = AF_INET;
memcpy(&addr4->sin_addr, &p4->IPv4Address, 4);
@@ -599,7 +567,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
&addr4->sin_addr);
break;
case INTERNETWORKV6:
- addr6 = (struct sockaddr_in6 *)&info->sockaddr;
+ addr6 = (struct sockaddr_in6 *)&tmp_iface.sockaddr;
p6 = (struct iface_info_ipv6 *)p->Buffer;
addr6->sin6_family = AF_INET6;
memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16);
@@ -619,46 +587,96 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
goto next_iface;
}
- (*iface_count)++;
- info++;
+ /*
+ * The iface_list is assumed to be sorted by speed.
+ * Check if the new interface exists in that list.
+ * NEVER change iface. it could be in use.
+ * Add a new one instead
+ */
+ spin_lock(&ses->iface_lock);
+ iface = niface = NULL;
+ list_for_each_entry_safe(iface, niface, &ses->iface_list,
+ iface_head) {
+ ret = iface_cmp(iface, &tmp_iface);
+ if (!ret) {
+ /* just get a ref so that it doesn't get picked/freed */
+ iface->is_active = 1;
+ kref_get(&iface->refcount);
+ spin_unlock(&ses->iface_lock);
+ goto next_iface;
+ } else if (ret < 0) {
+ /* all remaining ifaces are slower */
+ kref_get(&iface->refcount);
+ break;
+ }
+ }
+ spin_unlock(&ses->iface_lock);
+
+ /* no match. insert the entry in the list */
+ info = kmalloc(sizeof(struct cifs_server_iface),
+ GFP_KERNEL);
+ if (!info) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ memcpy(info, &tmp_iface, sizeof(tmp_iface));
+
+ /* add this new entry to the list */
+ kref_init(&info->refcount);
+ info->is_active = 1;
+
+ cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, ses->iface_count);
+ cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
+ cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__,
+ le32_to_cpu(p->Capability));
+
+ spin_lock(&ses->iface_lock);
+ if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ list_add_tail(&info->iface_head, &iface->iface_head);
+ kref_put(&iface->refcount, release_iface);
+ } else
+ list_add_tail(&info->iface_head, &ses->iface_list);
+ spin_unlock(&ses->iface_lock);
+
+ ses->iface_count++;
+ ses->iface_last_update = jiffies;
next_iface:
+ nb_iface++;
next = le32_to_cpu(p->Next);
- if (!next)
+ if (!next) {
+ bytes_left -= sizeof(*p);
break;
+ }
p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);
bytes_left -= next;
}
- if (!*iface_count) {
+ if (!nb_iface) {
+ cifs_dbg(VFS, "%s: malformed interface info\n", __func__);
rc = -EINVAL;
goto out;
}
-out:
- if (rc) {
- kfree(*iface_list);
- *iface_count = 0;
- *iface_list = NULL;
- }
- return rc;
-}
+ /* Azure rounds the buffer size up 8, to a 16 byte boundary */
+ if ((bytes_left > 8) || p->Next)
+ cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-static int compare_iface(const void *ia, const void *ib)
-{
- const struct cifs_server_iface *a = (struct cifs_server_iface *)ia;
- const struct cifs_server_iface *b = (struct cifs_server_iface *)ib;
- return a->speed == b->speed ? 0 : (a->speed > b->speed ? -1 : 1);
+ if (!ses->iface_count) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+out:
+ return rc;
}
-static int
+int
SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
{
int rc;
unsigned int ret_data_len = 0;
struct network_interface_info_ioctl_rsp *out_buf = NULL;
- struct cifs_server_iface *iface_list;
- size_t iface_count;
struct cifs_ses *ses = tcon->ses;
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
@@ -674,21 +692,10 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
goto out;
}
- rc = parse_server_interfaces(out_buf, ret_data_len,
- &iface_list, &iface_count);
+ rc = parse_server_interfaces(out_buf, ret_data_len, ses);
if (rc)
goto out;
- /* sort interfaces from fastest to slowest */
- sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL);
-
- spin_lock(&ses->iface_lock);
- kfree(ses->iface_list);
- ses->iface_list = iface_list;
- ses->iface_count = iface_count;
- ses->iface_last_update = jiffies;
- spin_unlock(&ses->iface_lock);
-
out:
kfree(out_buf);
return rc;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index b515140bad8d..c705de32e225 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -543,6 +543,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
struct TCP_Server_Info *server, unsigned int *total_len)
{
char *pneg_ctxt;
+ char *hostname = NULL;
unsigned int ctxt_len, neg_context_count;
if (*total_len > 200) {
@@ -570,16 +571,25 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
- ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
- server->hostname);
- *total_len += ctxt_len;
- pneg_ctxt += ctxt_len;
+ /*
+ * secondary channels don't have the hostname field populated
+ * use the hostname field in the primary channel instead
+ */
+ hostname = CIFS_SERVER_IS_CHAN(server) ?
+ server->primary_server->hostname : server->hostname;
+ if (hostname && (hostname[0] != 0)) {
+ ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
+ hostname);
+ *total_len += ctxt_len;
+ pneg_ctxt += ctxt_len;
+ neg_context_count = 3;
+ } else
+ neg_context_count = 2;
build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
*total_len += sizeof(struct smb2_posix_neg_context);
pneg_ctxt += sizeof(struct smb2_posix_neg_context);
-
- neg_context_count = 4;
+ neg_context_count++;
if (server->compress_algorithm) {
build_compression_ctxt((struct smb2_compression_capabilities_context *)
diff --git a/fs/exec.c b/fs/exec.c
index 0989fb8472a1..778123259e42 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1301,7 +1301,7 @@ int begin_new_exec(struct linux_binprm * bprm)
bprm->mm = NULL;
#ifdef CONFIG_POSIX_TIMERS
- exit_itimers(me->signal);
+ exit_itimers(me);
flush_itimer_signals();
#endif
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 76acc3721951..c6eaf7e9ea74 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -1198,7 +1198,9 @@ static int __exfat_rename(struct inode *old_parent_inode,
return -ENOENT;
}
- exfat_chain_dup(&olddir, &ei->dir);
+ exfat_chain_set(&olddir, EXFAT_I(old_parent_inode)->start_clu,
+ EXFAT_B_TO_CLU_ROUND_UP(i_size_read(old_parent_inode), sbi),
+ EXFAT_I(old_parent_inode)->flags);
dentry = ei->entry;
ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh);
diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c
index be599f31d3c4..d84c5f6cc09d 100644
--- a/fs/f2fs/iostat.c
+++ b/fs/f2fs/iostat.c
@@ -91,8 +91,9 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
unsigned int cnt;
struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE];
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
+ unsigned long flags;
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
for (idx = 0; idx < MAX_IO_TYPE; idx++) {
for (io = 0; io < NR_PAGE_TYPE; io++) {
cnt = io_lat->bio_cnt[idx][io];
@@ -106,7 +107,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
io_lat->bio_cnt[idx][io] = 0;
}
}
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
trace_f2fs_iostat_latency(sbi, iostat_lat);
}
@@ -115,14 +116,15 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
{
unsigned long long iostat_diff[NR_IO_TYPE];
int i;
+ unsigned long flags;
if (time_is_after_jiffies(sbi->iostat_next_period))
return;
/* Need double check under the lock */
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irqsave(&sbi->iostat_lock, flags);
if (time_is_after_jiffies(sbi->iostat_next_period)) {
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
return;
}
sbi->iostat_next_period = jiffies +
@@ -133,7 +135,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
sbi->prev_rw_iostat[i];
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
}
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
trace_f2fs_iostat(sbi, iostat_diff);
@@ -145,25 +147,27 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int i;
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irq(&sbi->iostat_lock);
for (i = 0; i < NR_IO_TYPE; i++) {
sbi->rw_iostat[i] = 0;
sbi->prev_rw_iostat[i] = 0;
}
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irq(&sbi->iostat_lock);
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irq(&sbi->iostat_lat_lock);
memset(io_lat, 0, sizeof(struct iostat_lat_info));
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irq(&sbi->iostat_lat_lock);
}
void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes)
{
+ unsigned long flags;
+
if (!sbi->iostat_enable)
return;
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irqsave(&sbi->iostat_lock, flags);
sbi->rw_iostat[type] += io_bytes;
if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO)
@@ -172,7 +176,7 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi,
if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO)
sbi->rw_iostat[APP_READ_IO] += io_bytes;
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
f2fs_record_iostat(sbi);
}
@@ -185,6 +189,7 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
struct f2fs_sb_info *sbi = iostat_ctx->sbi;
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int idx;
+ unsigned long flags;
if (!sbi->iostat_enable)
return;
@@ -202,12 +207,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
idx = WRITE_ASYNC_IO;
}
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
io_lat->sum_lat[idx][iotype] += ts_diff;
io_lat->bio_cnt[idx][iotype]++;
if (ts_diff > io_lat->peak_lat[idx][iotype])
io_lat->peak_lat[idx][iotype] = ts_diff;
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
}
void iostat_update_and_unbind_ctx(struct bio *bio, int rw)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index c549acb52ac4..bf00d5057abb 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -89,8 +89,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
if (test_opt(sbi, INLINE_XATTR))
set_inode_flag(inode, FI_INLINE_XATTR);
- if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
- set_inode_flag(inode, FI_INLINE_DATA);
if (f2fs_may_inline_dentry(inode))
set_inode_flag(inode, FI_INLINE_DENTRY);
@@ -107,10 +105,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
f2fs_init_extent_tree(inode, NULL);
- stat_inc_inline_xattr(inode);
- stat_inc_inline_inode(inode);
- stat_inc_inline_dir(inode);
-
F2FS_I(inode)->i_flags =
f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
@@ -127,6 +121,14 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
set_compress_context(inode);
}
+ /* Should enable inline_data after compression set */
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
+ set_inode_flag(inode, FI_INLINE_DATA);
+
+ stat_inc_inline_xattr(inode);
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
f2fs_set_inode_flags(inode);
trace_f2fs_new_inode(inode, 0);
@@ -325,6 +327,9 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
if (!is_extension_exist(name, ext[i], false))
continue;
+ /* Do not use inline_data with compression */
+ stat_dec_inline_inode(inode);
+ clear_inode_flag(inode, FI_INLINE_DATA);
set_compress_context(inode);
return;
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 836c79a20afc..cf6f7fc83c08 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1450,7 +1450,9 @@ page_hit:
out_err:
ClearPageUptodate(page);
out_put_err:
- f2fs_handle_page_eio(sbi, page->index, NODE);
+ /* ENOENT comes from read_node_page which is not an error. */
+ if (err != -ENOENT)
+ f2fs_handle_page_eio(sbi, page->index, NODE);
f2fs_put_page(page, 1);
return ERR_PTR(err);
}
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 9d3cf0111709..74920826d8f6 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -372,17 +372,22 @@ nomem:
return NULL;
}
+static inline bool fscache_cookie_is_dropped(struct fscache_cookie *cookie)
+{
+ return READ_ONCE(cookie->state) == FSCACHE_COOKIE_STATE_DROPPED;
+}
+
static void fscache_wait_on_collision(struct fscache_cookie *candidate,
struct fscache_cookie *wait_for)
{
enum fscache_cookie_state *statep = &wait_for->state;
- wait_var_event_timeout(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED,
+ wait_var_event_timeout(statep, fscache_cookie_is_dropped(wait_for),
20 * HZ);
- if (READ_ONCE(*statep) != FSCACHE_COOKIE_STATE_DROPPED) {
+ if (!fscache_cookie_is_dropped(wait_for)) {
pr_notice("Potential collision c=%08x old: c=%08x",
candidate->debug_id, wait_for->debug_id);
- wait_var_event(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED);
+ wait_var_event(statep, fscache_cookie_is_dropped(wait_for));
}
}
@@ -517,7 +522,14 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie)
}
fscache_see_cookie(cookie, fscache_cookie_see_active);
- fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE);
+ spin_lock(&cookie->lock);
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags))
+ __fscache_set_cookie_state(cookie,
+ FSCACHE_COOKIE_STATE_INVALIDATING);
+ else
+ __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE);
+ spin_unlock(&cookie->lock);
+ wake_up_cookie_state(cookie);
trace = fscache_access_lookup_cookie_end;
out:
@@ -752,6 +764,9 @@ again_locked:
spin_lock(&cookie->lock);
}
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags))
+ fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end);
+
switch (state) {
case FSCACHE_COOKIE_STATE_RELINQUISHING:
fscache_see_cookie(cookie, fscache_cookie_see_relinquish);
@@ -1048,6 +1063,9 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
return;
case FSCACHE_COOKIE_STATE_LOOKING_UP:
+ __fscache_begin_cookie_access(cookie, fscache_access_invalidate_cookie);
+ set_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags);
+ fallthrough;
case FSCACHE_COOKIE_STATE_CREATING:
spin_unlock(&cookie->lock);
_leave(" [look %x]", cookie->inval_counter);
diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c
index f2aa7dbad766..a058e0136bfe 100644
--- a/fs/fscache/volume.c
+++ b/fs/fscache/volume.c
@@ -143,7 +143,7 @@ static void fscache_wait_on_volume_collision(struct fscache_volume *candidate,
{
wait_var_event_timeout(&candidate->flags,
!fscache_is_acquire_pending(candidate), 20 * HZ);
- if (!fscache_is_acquire_pending(candidate)) {
+ if (fscache_is_acquire_pending(candidate)) {
pr_notice("Potential volume collision new=%08x old=%08x",
candidate->debug_id, collidee_debug_id);
fscache_stat(&fscache_n_volumes_collision);
@@ -182,7 +182,7 @@ static bool fscache_hash_volume(struct fscache_volume *candidate)
hlist_bl_add_head(&candidate->hash_link, h);
hlist_bl_unlock(h);
- if (test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags))
+ if (fscache_is_acquire_pending(candidate))
fscache_wait_on_volume_collision(candidate, collidee_debug_id);
return true;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 62408047e8d7..02eb72351b15 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -600,41 +600,79 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
remove_inode_hugepages(inode, offset, LLONG_MAX);
}
+static void hugetlbfs_zero_partial_page(struct hstate *h,
+ struct address_space *mapping,
+ loff_t start,
+ loff_t end)
+{
+ pgoff_t idx = start >> huge_page_shift(h);
+ struct folio *folio;
+
+ folio = filemap_lock_folio(mapping, idx);
+ if (!folio)
+ return;
+
+ start = start & ~huge_page_mask(h);
+ end = end & ~huge_page_mask(h);
+ if (!end)
+ end = huge_page_size(h);
+
+ folio_zero_segment(folio, (size_t)start, (size_t)end);
+
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
+ struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ struct address_space *mapping = inode->i_mapping;
struct hstate *h = hstate_inode(inode);
loff_t hpage_size = huge_page_size(h);
loff_t hole_start, hole_end;
/*
- * For hole punch round up the beginning offset of the hole and
- * round down the end.
+ * hole_start and hole_end indicate the full pages within the hole.
*/
hole_start = round_up(offset, hpage_size);
hole_end = round_down(offset + len, hpage_size);
- if (hole_end > hole_start) {
- struct address_space *mapping = inode->i_mapping;
- struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ inode_lock(inode);
- inode_lock(inode);
+ /* protected by i_rwsem */
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+ inode_unlock(inode);
+ return -EPERM;
+ }
- /* protected by i_rwsem */
- if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
- inode_unlock(inode);
- return -EPERM;
- }
+ i_mmap_lock_write(mapping);
+
+ /* If range starts before first full page, zero partial page. */
+ if (offset < hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ offset, min(offset + len, hole_start));
- i_mmap_lock_write(mapping);
+ /* Unmap users of full pages in the hole. */
+ if (hole_end > hole_start) {
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap,
hole_start >> PAGE_SHIFT,
hole_end >> PAGE_SHIFT, 0);
- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, hole_start, hole_end);
- inode_unlock(inode);
}
+ /* If range extends beyond last full page, zero partial page. */
+ if ((offset + len) > hole_end && (offset + len) > hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ hole_end, offset + len);
+
+ i_mmap_unlock_write(mapping);
+
+ /* Remove full pages from the file. */
+ if (hole_end > hole_start)
+ remove_inode_hugepages(inode, hole_start, hole_end);
+
+ inode_unlock(inode);
+
return 0;
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d3ee4fc532fa..a01ea49f3017 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1183,6 +1183,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_setup = 1,
+ .ioprio = 1,
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_RECVMSG] = {
@@ -1191,6 +1192,7 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.buffer_select = 1,
.needs_async_setup = 1,
+ .ioprio = 1,
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_TIMEOUT] = {
@@ -1266,6 +1268,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.audit_skip = 1,
+ .ioprio = 1,
},
[IORING_OP_RECV] = {
.needs_file = 1,
@@ -1273,6 +1276,7 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.buffer_select = 1,
.audit_skip = 1,
+ .ioprio = 1,
},
[IORING_OP_OPENAT2] = {
},
@@ -1975,7 +1979,7 @@ static inline void io_req_track_inflight(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_INFLIGHT)) {
req->flags |= REQ_F_INFLIGHT;
- atomic_inc(&current->io_uring->inflight_tracked);
+ atomic_inc(&req->task->io_uring->inflight_tracked);
}
}
@@ -3437,7 +3441,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
if (unlikely(res != req->cqe.res)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE;
+ req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
return true;
}
req_set_fail(req);
@@ -3487,7 +3491,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
kiocb_end_write(req);
if (unlikely(res != req->cqe.res)) {
if (res == -EAGAIN && io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE;
+ req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
return;
}
req->cqe.res = res;
@@ -4314,18 +4318,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(ret < 0))
return ret;
} else {
+ rw = req->async_data;
+ s = &rw->s;
+
/*
* Safe and required to re-import if we're using provided
* buffers, as we dropped the selected one before retry.
*/
- if (req->flags & REQ_F_BUFFER_SELECT) {
+ if (io_do_buffer_select(req)) {
ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
if (unlikely(ret < 0))
return ret;
}
- rw = req->async_data;
- s = &rw->s;
/*
* We come here from an earlier attempt, restore our state to
* match in case it doesn't. It's cheap enough that we don't
@@ -5061,7 +5066,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req,
{
struct io_uring_cmd *ioucmd = &req->uring_cmd;
- if (sqe->rw_flags)
+ if (sqe->rw_flags || sqe->__pad1)
return -EINVAL;
ioucmd->cmd = sqe->cmd;
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
@@ -6075,14 +6080,12 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
- if (unlikely(sqe->file_index))
- return -EINVAL;
- if (unlikely(sqe->addr2 || sqe->file_index))
+ if (unlikely(sqe->file_index || sqe->addr2))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
- sr->flags = READ_ONCE(sqe->addr2);
+ sr->flags = READ_ONCE(sqe->ioprio);
if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
@@ -6313,14 +6316,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
- if (unlikely(sqe->file_index))
- return -EINVAL;
- if (unlikely(sqe->addr2 || sqe->file_index))
+ if (unlikely(sqe->file_index || sqe->addr2))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
- sr->flags = READ_ONCE(sqe->addr2);
+ sr->flags = READ_ONCE(sqe->ioprio);
if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
@@ -6954,7 +6955,8 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
io_req_complete_failed(req, ret);
}
-static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)
+static void __io_poll_execute(struct io_kiocb *req, int mask,
+ __poll_t __maybe_unused events)
{
req->cqe.res = mask;
/*
@@ -6963,7 +6965,6 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)
* CPU. We want to avoid pulling in req->apoll->events for that
* case.
*/
- req->apoll_events = events;
if (req->opcode == IORING_OP_POLL_ADD)
req->io_task_work.func = io_poll_task_func;
else
@@ -7114,6 +7115,8 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
io_init_poll_iocb(poll, mask, io_poll_wake);
poll->file = req->file;
+ req->apoll_events = poll->events;
+
ipt->pt._key = mask;
ipt->req = req;
ipt->error = 0;
@@ -7144,8 +7147,11 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
if (mask) {
/* can't multishot if failed, just queue the event we've got */
- if (unlikely(ipt->error || !ipt->nr_entries))
+ if (unlikely(ipt->error || !ipt->nr_entries)) {
poll->events |= EPOLLONESHOT;
+ req->apoll_events |= EPOLLONESHOT;
+ ipt->error = 0;
+ }
__io_poll_execute(req, mask, poll->events);
return 0;
}
@@ -7207,6 +7213,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
mask |= EPOLLEXCLUSIVE;
if (req->flags & REQ_F_POLLED) {
apoll = req->apoll;
+ kfree(apoll->double_poll);
} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
!list_empty(&ctx->apoll_cache)) {
apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
@@ -7392,7 +7399,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EINVAL;
io_req_set_refcount(req);
- req->apoll_events = poll->events = io_poll_parse_events(sqe, flags);
+ poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
@@ -7405,6 +7412,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
ipt.pt._qproc = io_poll_queue_proc;
ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
+ if (!ret && ipt.error)
+ req_set_fail(req);
ret = ret ?: ipt.error;
if (ret)
__io_req_complete(req, issue_flags, ret, 0);
@@ -7964,6 +7973,9 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
struct file *file;
int ret, fd;
+ if (!req->ctx->file_data)
+ return -ENXIO;
+
for (done = 0; done < req->rsrc_update.nr_args; done++) {
if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
ret = -EFAULT;
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index e6f4ccc12f49..353f047e783c 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -6490,6 +6490,7 @@ int smb2_write(struct ksmbd_work *work)
goto out;
}
+ ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags));
if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
writethrough = true;
@@ -6505,10 +6506,6 @@ int smb2_write(struct ksmbd_work *work)
data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
le16_to_cpu(req->DataOffset));
- ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags));
- if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
- writethrough = true;
-
ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n",
fp->filp->f_path.dentry, offset, length);
err = ksmbd_vfs_write(work, fp, data_buf, length, &offset,
@@ -7703,7 +7700,7 @@ int smb2_ioctl(struct ksmbd_work *work)
{
struct file_zero_data_information *zero_data;
struct ksmbd_file *fp;
- loff_t off, len;
+ loff_t off, len, bfz;
if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
ksmbd_debug(SMB,
@@ -7720,19 +7717,26 @@ int smb2_ioctl(struct ksmbd_work *work)
zero_data =
(struct file_zero_data_information *)&req->Buffer[0];
- fp = ksmbd_lookup_fd_fast(work, id);
- if (!fp) {
- ret = -ENOENT;
+ off = le64_to_cpu(zero_data->FileOffset);
+ bfz = le64_to_cpu(zero_data->BeyondFinalZero);
+ if (off > bfz) {
+ ret = -EINVAL;
goto out;
}
- off = le64_to_cpu(zero_data->FileOffset);
- len = le64_to_cpu(zero_data->BeyondFinalZero) - off;
+ len = bfz - off;
+ if (len) {
+ fp = ksmbd_lookup_fd_fast(work, id);
+ if (!fp) {
+ ret = -ENOENT;
+ goto out;
+ }
- ret = ksmbd_vfs_zero_data(work, fp, off, len);
- ksmbd_fd_put(work, fp);
- if (ret < 0)
- goto out;
+ ret = ksmbd_vfs_zero_data(work, fp, off, len);
+ ksmbd_fd_put(work, fp);
+ if (ret < 0)
+ goto out;
+ }
break;
}
case FSCTL_QUERY_ALLOCATED_RANGES:
@@ -7806,14 +7810,24 @@ int smb2_ioctl(struct ksmbd_work *work)
src_off = le64_to_cpu(dup_ext->SourceFileOffset);
dst_off = le64_to_cpu(dup_ext->TargetFileOffset);
length = le64_to_cpu(dup_ext->ByteCount);
- cloned = vfs_clone_file_range(fp_in->filp, src_off, fp_out->filp,
- dst_off, length, 0);
+ /*
+ * XXX: It is not clear if FSCTL_DUPLICATE_EXTENTS_TO_FILE
+ * should fall back to vfs_copy_file_range(). This could be
+ * beneficial when re-exporting nfs/smb mount, but note that
+ * this can result in partial copy that returns an error status.
+ * If/when FSCTL_DUPLICATE_EXTENTS_TO_FILE_EX is implemented,
+ * fall back to vfs_copy_file_range(), should be avoided when
+ * the flag DUPLICATE_EXTENTS_DATA_EX_SOURCE_ATOMIC is set.
+ */
+ cloned = vfs_clone_file_range(fp_in->filp, src_off,
+ fp_out->filp, dst_off, length, 0);
if (cloned == -EXDEV || cloned == -EOPNOTSUPP) {
ret = -EOPNOTSUPP;
goto dup_ext_out;
} else if (cloned != length) {
cloned = vfs_copy_file_range(fp_in->filp, src_off,
- fp_out->filp, dst_off, length, 0);
+ fp_out->filp, dst_off,
+ length, 0);
if (cloned != length) {
if (cloned < 0)
ret = cloned;
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index d035e060c2f0..35b55ee94fe5 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -5,16 +5,6 @@
*
* Author(s): Long Li <longli@microsoft.com>,
* Hyunchul Lee <hyc.lee@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#define SUBMOD_NAME "smb_direct"
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
index 8fef9de787d3..143bba4e4db8 100644
--- a/fs/ksmbd/transport_tcp.c
+++ b/fs/ksmbd/transport_tcp.c
@@ -230,7 +230,7 @@ static int ksmbd_kthread_fn(void *p)
break;
}
ret = kernel_accept(iface->ksmbd_socket, &client_sk,
- O_NONBLOCK);
+ SOCK_NONBLOCK);
mutex_unlock(&iface->sock_release_lock);
if (ret) {
if (ret == -EAGAIN)
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index dcdd07c6efff..05efcdf7a4a7 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -1015,7 +1015,9 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
off, len);
- return vfs_fallocate(fp->filp, FALLOC_FL_ZERO_RANGE, off, len);
+ return vfs_fallocate(fp->filp,
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
+ off, len);
}
int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
@@ -1046,7 +1048,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
*out_count = 0;
end = start + length;
while (start < end && *out_count < in_count) {
- extent_start = f->f_op->llseek(f, start, SEEK_DATA);
+ extent_start = vfs_llseek(f, start, SEEK_DATA);
if (extent_start < 0) {
if (extent_start != -ENXIO)
ret = (int)extent_start;
@@ -1056,7 +1058,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
if (extent_start >= end)
break;
- extent_end = f->f_op->llseek(f, extent_start, SEEK_HOLE);
+ extent_end = vfs_llseek(f, extent_start, SEEK_HOLE);
if (extent_end < 0) {
if (extent_end != -ENXIO)
ret = (int)extent_end;
@@ -1777,6 +1779,10 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
ret = vfs_copy_file_range(src_fp->filp, src_off,
dst_fp->filp, dst_off, len, 0);
+ if (ret == -EOPNOTSUPP || ret == -EXDEV)
+ ret = generic_copy_file_range(src_fp->filp, src_off,
+ dst_fp->filp, dst_off,
+ len, 0);
if (ret < 0)
return ret;
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 0a22a2faf552..e1c4617de771 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file)
}
}
-static int nlm_unlock_files(struct nlm_file *file)
+static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner)
{
struct file_lock lock;
@@ -184,6 +184,7 @@ static int nlm_unlock_files(struct nlm_file *file)
lock.fl_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
+ lock.fl_owner = owner;
if (file->f_file[O_RDONLY] &&
vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL))
goto out_err;
@@ -225,7 +226,7 @@ again:
if (match(lockhost, host)) {
spin_unlock(&flctx->flc_lock);
- if (nlm_unlock_files(file))
+ if (nlm_unlock_files(file, fl->fl_owner))
return 1;
goto again;
}
@@ -282,11 +283,10 @@ nlm_file_inuse(struct nlm_file *file)
static void nlm_close_files(struct nlm_file *file)
{
- struct file *f;
-
- for (f = file->f_file[0]; f <= file->f_file[1]; f++)
- if (f)
- nlmsvc_ops->fclose(f);
+ if (file->f_file[O_RDONLY])
+ nlmsvc_ops->fclose(file->f_file[O_RDONLY]);
+ if (file->f_file[O_WRONLY])
+ nlmsvc_ops->fclose(file->f_file[O_WRONLY]);
}
/*
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 42f892c5712e..0ce535852151 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -319,8 +319,9 @@ zero_out:
* conflicting writes once the folio is grabbed and locked. It is passed a
* pointer to the fsdata cookie that gets returned to the VM to be passed to
* write_end. It is permitted to sleep. It should return 0 if the request
- * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
- * be regot; or return an error.
+ * should go ahead or it may return an error. It may also unlock and put the
+ * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
+ * will cause the folio to be re-got and the process to be retried.
*
* The calling netfs must initialise a netfs context contiguous to the vfs
* inode before calling this.
@@ -348,13 +349,13 @@ retry:
if (ctx->ops->check_write_begin) {
/* Allow the netfs (eg. ceph) to flush conflicts. */
- ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata);
+ ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
if (ret < 0) {
trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
- if (ret == -EAGAIN)
- goto retry;
goto error;
}
+ if (!folio)
+ goto retry;
}
if (folio_test_uptodate(folio))
@@ -416,8 +417,10 @@ have_folio_no_wait:
error_put:
netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
- folio_unlock(folio);
- folio_put(folio);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c0fdcf8c0032..bb0e84a46d61 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4012,22 +4012,29 @@ static int _nfs4_discover_trunking(struct nfs_server *server,
}
page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
- if (page == NULL || locations == NULL)
- goto out;
+ if (!locations)
+ goto out_free;
+ locations->fattr = nfs_alloc_fattr();
+ if (!locations->fattr)
+ goto out_free_2;
status = nfs4_proc_get_locations(server, fhandle, locations, page,
cred);
if (status)
- goto out;
+ goto out_free_3;
for (i = 0; i < locations->nlocations; i++)
test_fs_location_for_trunking(&locations->locations[i], clp,
server);
-out:
- if (page)
- __free_page(page);
+out_free_3:
+ kfree(locations->fattr);
+out_free_2:
kfree(locations);
+out_free:
+ __free_page(page);
return status;
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2540b35ec187..9bab3e9c702a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2753,5 +2753,6 @@ again:
goto again;
nfs_put_client(clp);
+ module_put_and_kthread_exit(0);
return 0;
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 61b2aae81abb..2acea7792bb2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -470,6 +470,15 @@ nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen,
return nfserr_bad_xdr;
}
}
+ if (bmval[1] & FATTR4_WORD1_TIME_CREATE) {
+ struct timespec64 ts;
+
+ /* No Linux filesystem supports setting this attribute. */
+ bmval[1] &= ~FATTR4_WORD1_TIME_CREATE;
+ status = nfsd4_decode_nfstime4(argp, &ts);
+ if (status)
+ return status;
+ }
if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
u32 set_it;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 847b482155ae..9a8b09afc173 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -465,7 +465,8 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
#define NFSD_WRITEABLE_ATTRS_WORD1 \
(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
- | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_CREATE \
+ | FATTR4_WORD1_TIME_MODIFY_SET)
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#define MAYBE_FATTR4_WORD2_SECURITY_LABEL \
FATTR4_WORD2_SECURITY_LABEL
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 840e3af63a6f..d79db56475d4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -577,6 +577,7 @@ out_err:
ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
u64 dst_pos, u64 count)
{
+ ssize_t ret;
/*
* Limit copy to 4MB to prevent indefinitely blocking an nfsd
@@ -587,7 +588,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
* limit like this and pipeline multiple COPY requests.
*/
count = min_t(u64, count, 1 << 22);
- return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+ ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+
+ if (ret == -EOPNOTSUPP || ret == -EXDEV)
+ ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
+ count, 0);
+ return ret;
}
__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -1173,6 +1179,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
nfsd_copy_write_verifier(verf, nn);
err2 = filemap_check_wb_err(nf->nf_file->f_mapping,
since);
+ err = nfserrno(err2);
break;
case -EINVAL:
err = nfserr_notsupp;
@@ -1180,8 +1187,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
default:
nfsd_reset_write_verifier(nn);
trace_nfsd_writeverf_reset(nn, rqstp, err2);
+ err = nfserrno(err2);
}
- err = nfserrno(err2);
} else
nfsd_copy_write_verifier(verf, nn);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 1344f7d475d3..aecda4fc95f5 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -198,6 +198,9 @@ static inline int nilfs_acl_chmod(struct inode *inode)
static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
{
+ if (S_ISLNK(inode->i_mode))
+ return 0;
+
inode->i_mode &= ~current_umask();
return 0;
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index c2255b440df9..b08ce0d821a7 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1513,8 +1513,15 @@ static int fanotify_test_fid(struct dentry *dentry)
return 0;
}
-static int fanotify_events_supported(struct path *path, __u64 mask)
+static int fanotify_events_supported(struct fsnotify_group *group,
+ struct path *path, __u64 mask,
+ unsigned int flags)
{
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+ /* Strict validation of events in non-dir inode mask with v5.17+ APIs */
+ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
+ (mask & FAN_RENAME);
+
/*
* Some filesystems such as 'proc' acquire unusual locks when opening
* files. For them fanotify permission events have high chances of
@@ -1526,6 +1533,16 @@ static int fanotify_events_supported(struct path *path, __u64 mask)
if (mask & FANOTIFY_PERM_EVENTS &&
path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
return -EINVAL;
+
+ /*
+ * We shouldn't have allowed setting dirent events and the directory
+ * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode,
+ * but because we always allowed it, error only when using new APIs.
+ */
+ if (strict_dir_events && mark_type == FAN_MARK_INODE &&
+ !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
+ return -ENOTDIR;
+
return 0;
}
@@ -1672,7 +1689,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
goto fput_and_out;
if (flags & FAN_MARK_ADD) {
- ret = fanotify_events_supported(&path, mask);
+ ret = fanotify_events_supported(group, &path, mask, flags);
if (ret)
goto path_put_and_out;
}
@@ -1695,19 +1712,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
else
mnt = path.mnt;
- /*
- * FAN_RENAME is not allowed on non-dir (for now).
- * We shouldn't have allowed setting any dirent events in mask of
- * non-dir, but because we always allowed it, error only if group
- * was initialized with the new flag FAN_REPORT_TARGET_FID.
- */
- ret = -ENOTDIR;
- if (inode && !S_ISDIR(inode->i_mode) &&
- ((mask & FAN_RENAME) ||
- ((mask & FANOTIFY_DIRENT_EVENTS) &&
- FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID))))
- goto path_put_and_out;
-
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
if (mnt || !S_ISDIR(inode->i_mode)) {
mask &= ~FAN_EVENT_ON_CHILD;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e0a2e0468ee7..1ce5c9698393 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1003,6 +1003,9 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *buffer, size_t size)
{
+ if (!IS_POSIXACL(inode))
+ return -EOPNOTSUPP;
+
return ovl_xattr_get(dentry, inode, handler->name, buffer, size);
}
@@ -1018,6 +1021,9 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
struct posix_acl *acl = NULL;
int err;
+ if (!IS_POSIXACL(inode))
+ return -EOPNOTSUPP;
+
/* Check that everything is OK before copy-up */
if (value) {
acl = posix_acl_from_xattr(&init_user_ns, value, size);
@@ -1960,6 +1966,20 @@ static struct dentry *ovl_get_root(struct super_block *sb,
return root;
}
+static bool ovl_has_idmapped_layers(struct ovl_fs *ofs)
+{
+
+ unsigned int i;
+ const struct vfsmount *mnt;
+
+ for (i = 0; i < ofs->numlayer; i++) {
+ mnt = ofs->layers[i].mnt;
+ if (mnt && is_idmapped_mnt(mnt))
+ return true;
+ }
+ return false;
+}
+
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
struct path upperpath = { };
@@ -2129,7 +2149,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
ovl_trusted_xattr_handlers;
sb->s_fs_info = ofs;
- sb->s_flags |= SB_POSIXACL;
+ if (ovl_has_idmapped_layers(ofs))
+ pr_warn("POSIX ACLs are not yet supported with idmapped layers, mounting without ACL support.\n");
+ else
+ sb->s_flags |= SB_POSIXACL;
sb->s_iflags |= SB_I_SKIP_SYNC;
err = -ENOMEM;
diff --git a/fs/read_write.c b/fs/read_write.c
index b1b1cdfee9d3..e0777eefd846 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1397,28 +1397,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
}
EXPORT_SYMBOL(generic_copy_file_range);
-static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags)
-{
- /*
- * Although we now allow filesystems to handle cross sb copy, passing
- * a file of the wrong filesystem type to filesystem driver can result
- * in an attempt to dereference the wrong type of ->private_data, so
- * avoid doing that until we really have a good reason. NFS defines
- * several different file_system_type structures, but they all end up
- * using the same ->copy_file_range() function pointer.
- */
- if (file_out->f_op->copy_file_range &&
- file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
- return file_out->f_op->copy_file_range(file_in, pos_in,
- file_out, pos_out,
- len, flags);
-
- return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
-}
-
/*
* Performs necessary checks before doing a file copy
*
@@ -1440,6 +1418,24 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
if (ret)
return ret;
+ /*
+ * We allow some filesystems to handle cross sb copy, but passing
+ * a file of the wrong filesystem type to filesystem driver can result
+ * in an attempt to dereference the wrong type of ->private_data, so
+ * avoid doing that until we really have a good reason.
+ *
+ * nfs and cifs define several different file_system_type structures
+ * and several different sets of file_operations, but they all end up
+ * using the same ->copy_file_range() function pointer.
+ */
+ if (file_out->f_op->copy_file_range) {
+ if (file_in->f_op->copy_file_range !=
+ file_out->f_op->copy_file_range)
+ return -EXDEV;
+ } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
+ return -EXDEV;
+ }
+
/* Don't touch certain kinds of inodes */
if (IS_IMMUTABLE(inode_out))
return -EPERM;
@@ -1505,26 +1501,41 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
file_start_write(file_out);
/*
- * Try cloning first, this is supported by more file systems, and
- * more efficient if both clone and copy are supported (e.g. NFS).
+ * Cloning is supported by more file systems, so we implement copy on
+ * same sb using clone, but for filesystems where both clone and copy
+ * are supported (e.g. nfs,cifs), we only call the copy method.
*/
+ if (file_out->f_op->copy_file_range) {
+ ret = file_out->f_op->copy_file_range(file_in, pos_in,
+ file_out, pos_out,
+ len, flags);
+ goto done;
+ }
+
if (file_in->f_op->remap_file_range &&
file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
- loff_t cloned;
-
- cloned = file_in->f_op->remap_file_range(file_in, pos_in,
+ ret = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out,
min_t(loff_t, MAX_RW_COUNT, len),
REMAP_FILE_CAN_SHORTEN);
- if (cloned > 0) {
- ret = cloned;
+ if (ret > 0)
goto done;
- }
}
- ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
- WARN_ON_ONCE(ret == -EOPNOTSUPP);
+ /*
+ * We can get here for same sb copy of filesystems that do not implement
+ * ->copy_file_range() in case filesystem does not support clone or in
+ * case filesystem supports clone but rejected the clone request (e.g.
+ * because it was not block aligned).
+ *
+ * In both cases, fall back to kernel copy so we are able to maintain a
+ * consistent story about which filesystems support copy_file_range()
+ * and which filesystems do not, that will allow userspace tools to
+ * make consistent desicions w.r.t using copy_file_range().
+ */
+ ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
+ flags);
+
done:
if (ret > 0) {
fsnotify_access(file_in);
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e112b5424cdb..881a306ee247 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -71,7 +71,8 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
* Otherwise, make sure the count is also block-aligned, having
* already confirmed the starting offsets' block alignment.
*/
- if (pos_in + count == size_in) {
+ if (pos_in + count == size_in &&
+ (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) {
bcount = ALIGN(size_in, bs) - pos_in;
} else {
if (!IS_ALIGNED(count, bs))
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 1824f61621a2..224649a76cbb 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -50,7 +50,7 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
-STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp);
+STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args);
/*
* Internal routines when attribute list is more than one block.
@@ -393,16 +393,10 @@ xfs_attr_sf_addname(
* It won't fit in the shortform, transform to a leaf block. GROT:
* another possible req'mt for a double-split btree op.
*/
- error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp);
+ error = xfs_attr_shortform_to_leaf(args);
if (error)
return error;
- /*
- * Prevent the leaf buffer from being unlocked so that a concurrent AIL
- * push cannot grab the half-baked leaf buffer and run into problems
- * with the write verifier.
- */
- xfs_trans_bhold(args->trans, attr->xattri_leaf_bp);
attr->xattri_dela_state = XFS_DAS_LEAF_ADD;
out:
trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp);
@@ -447,11 +441,9 @@ xfs_attr_leaf_addname(
/*
* Use the leaf buffer we may already hold locked as a result of
- * a sf-to-leaf conversion. The held buffer is no longer valid
- * after this call, regardless of the result.
+ * a sf-to-leaf conversion.
*/
- error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp);
- attr->xattri_leaf_bp = NULL;
+ error = xfs_attr_leaf_try_add(args);
if (error == -ENOSPC) {
error = xfs_attr3_leaf_to_node(args);
@@ -497,8 +489,6 @@ xfs_attr_node_addname(
struct xfs_da_args *args = attr->xattri_da_args;
int error;
- ASSERT(!attr->xattri_leaf_bp);
-
error = xfs_attr_node_addname_find_attr(attr);
if (error)
return error;
@@ -1215,24 +1205,14 @@ xfs_attr_restore_rmt_blk(
*/
STATIC int
xfs_attr_leaf_try_add(
- struct xfs_da_args *args,
- struct xfs_buf *bp)
+ struct xfs_da_args *args)
{
+ struct xfs_buf *bp;
int error;
- /*
- * If the caller provided a buffer to us, it is locked and held in
- * the transaction because it just did a shortform to leaf conversion.
- * Hence we don't need to read it again. Otherwise read in the leaf
- * buffer.
- */
- if (bp) {
- xfs_trans_bhold_release(args->trans, bp);
- } else {
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
- if (error)
- return error;
- }
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
/*
* Look up the xattr name to set the insertion point for the new xattr.
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index b4a2fc77017e..dfb47fa63c6d 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -515,11 +515,6 @@ struct xfs_attr_intent {
*/
struct xfs_attri_log_nameval *xattri_nameval;
- /*
- * Used by xfs_attr_set to hold a leaf buffer across a transaction roll
- */
- struct xfs_buf *xattri_leaf_bp;
-
/* Used to keep track of current state of delayed operation */
enum xfs_delattr_state xattri_dela_state;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 37e7c33f6283..8f47396f8dd2 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -289,6 +289,23 @@ xfs_attr3_leaf_verify_entry(
return NULL;
}
+/*
+ * Validate an attribute leaf block.
+ *
+ * Empty leaf blocks can occur under the following circumstances:
+ *
+ * 1. setxattr adds a new extended attribute to a file;
+ * 2. The file has zero existing attributes;
+ * 3. The attribute is too large to fit in the attribute fork;
+ * 4. The attribute is small enough to fit in a leaf block;
+ * 5. A log flush occurs after committing the transaction that creates
+ * the (empty) leaf block; and
+ * 6. The filesystem goes down after the log flush but before the new
+ * attribute can be committed to the leaf block.
+ *
+ * Hence we need to ensure that we don't fail the validation purely
+ * because the leaf is empty.
+ */
static xfs_failaddr_t
xfs_attr3_leaf_verify(
struct xfs_buf *bp)
@@ -311,15 +328,6 @@ xfs_attr3_leaf_verify(
return fa;
/*
- * Empty leaf blocks should never occur; they imply the existence of a
- * software bug that needs fixing. xfs_repair also flags them as a
- * corruption that needs fixing, so we should never let these go to
- * disk.
- */
- if (ichdr.count == 0)
- return __this_address;
-
- /*
* firstused is the block offset of the first name info structure.
* Make sure it doesn't go off the block or crash into the header.
*/
@@ -922,14 +930,10 @@ xfs_attr_shortform_getvalue(
return -ENOATTR;
}
-/*
- * Convert from using the shortform to the leaf. On success, return the
- * buffer so that we can keep it locked until we're totally done with it.
- */
+/* Convert from using the shortform to the leaf format. */
int
xfs_attr_shortform_to_leaf(
- struct xfs_da_args *args,
- struct xfs_buf **leaf_bp)
+ struct xfs_da_args *args)
{
struct xfs_inode *dp;
struct xfs_attr_shortform *sf;
@@ -991,7 +995,6 @@ xfs_attr_shortform_to_leaf(
sfe = xfs_attr_sf_nextentry(sfe);
}
error = 0;
- *leaf_bp = bp;
out:
kmem_free(tmpbuffer);
return error;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index efa757f1e912..368f4d9fa1d5 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -49,8 +49,7 @@ void xfs_attr_shortform_create(struct xfs_da_args *args);
void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
-int xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
- struct xfs_buf **leaf_bp);
+int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_sf_removename(struct xfs_da_args *args);
int xfs_attr_sf_findname(struct xfs_da_args *args,
struct xfs_attr_sf_entry **sfep,
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 135d44133477..5077a7ad5646 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -576,7 +576,7 @@ xfs_attri_item_recover(
struct xfs_trans_res tres;
struct xfs_attri_log_format *attrp;
struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
- int error, ret = 0;
+ int error;
int total;
int local;
struct xfs_attrd_log_item *done_item = NULL;
@@ -655,29 +655,32 @@ xfs_attri_item_recover(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- ret = xfs_xattri_finish_update(attr, done_item);
- if (ret == -EAGAIN) {
- /* There's more work to do, so add it to this transaction */
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error == -EAGAIN) {
+ /*
+ * There's more work to do, so add the intent item to this
+ * transaction so that we can continue it later.
+ */
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
- } else
- error = ret;
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ if (error)
+ goto out_unlock;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+ return 0;
+ }
if (error) {
xfs_trans_cancel(tp);
goto out_unlock;
}
error = xfs_defer_ops_capture_and_commit(tp, capture_list);
-
out_unlock:
- if (attr->xattri_leaf_bp)
- xfs_buf_relse(attr->xattri_leaf_bp);
-
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
out:
- if (ret != -EAGAIN)
- xfs_attr_free_item(attr);
+ xfs_attr_free_item(attr);
return error;
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 52be58372c63..85e1a26c92e8 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -686,6 +686,8 @@ xfs_can_free_eofblocks(
* forever.
*/
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
+ end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (last_fsb <= end_fsb)
return false;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 5269354b1b69..2609825d53ee 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -440,7 +440,7 @@ xfs_inodegc_queue_all(
for_each_online_cpu(cpu) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list))
- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
}
}
@@ -1841,8 +1841,8 @@ void
xfs_inodegc_worker(
struct work_struct *work)
{
- struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc,
- work);
+ struct xfs_inodegc *gc = container_of(to_delayed_work(work),
+ struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n;
@@ -1862,19 +1862,29 @@ xfs_inodegc_worker(
}
/*
- * Force all currently queued inode inactivation work to run immediately and
- * wait for the work to finish.
+ * Expedite all pending inodegc work to run immediately. This does not wait for
+ * completion of the work.
*/
void
-xfs_inodegc_flush(
+xfs_inodegc_push(
struct xfs_mount *mp)
{
if (!xfs_is_inodegc_enabled(mp))
return;
+ trace_xfs_inodegc_push(mp, __return_address);
+ xfs_inodegc_queue_all(mp);
+}
+/*
+ * Force all currently queued inode inactivation work to run immediately and
+ * wait for the work to finish.
+ */
+void
+xfs_inodegc_flush(
+ struct xfs_mount *mp)
+{
+ xfs_inodegc_push(mp);
trace_xfs_inodegc_flush(mp, __return_address);
-
- xfs_inodegc_queue_all(mp);
flush_workqueue(mp->m_inodegc_wq);
}
@@ -2014,6 +2024,7 @@ xfs_inodegc_queue(
struct xfs_inodegc *gc;
int items;
unsigned int shrinker_hits;
+ unsigned long queue_delay = 1;
trace_xfs_inode_set_need_inactive(ip);
spin_lock(&ip->i_flags_lock);
@@ -2025,19 +2036,26 @@ xfs_inodegc_queue(
items = READ_ONCE(gc->items);
WRITE_ONCE(gc->items, items + 1);
shrinker_hits = READ_ONCE(gc->shrinker_hits);
- put_cpu_ptr(gc);
- if (!xfs_is_inodegc_enabled(mp))
+ /*
+ * We queue the work while holding the current CPU so that the work
+ * is scheduled to run on this CPU.
+ */
+ if (!xfs_is_inodegc_enabled(mp)) {
+ put_cpu_ptr(gc);
return;
-
- if (xfs_inodegc_want_queue_work(ip, items)) {
- trace_xfs_inodegc_queue(mp, __return_address);
- queue_work(mp->m_inodegc_wq, &gc->work);
}
+ if (xfs_inodegc_want_queue_work(ip, items))
+ queue_delay = 0;
+
+ trace_xfs_inodegc_queue(mp, __return_address);
+ mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
+ put_cpu_ptr(gc);
+
if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
trace_xfs_inodegc_throttle(mp, __return_address);
- flush_work(&gc->work);
+ flush_delayed_work(&gc->work);
}
}
@@ -2054,7 +2072,7 @@ xfs_inodegc_cpu_dead(
unsigned int count = 0;
dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
- cancel_work_sync(&dead_gc->work);
+ cancel_delayed_work_sync(&dead_gc->work);
if (llist_empty(&dead_gc->list))
return;
@@ -2073,12 +2091,12 @@ xfs_inodegc_cpu_dead(
llist_add_batch(first, last, &gc->list);
count += READ_ONCE(gc->items);
WRITE_ONCE(gc->items, count);
- put_cpu_ptr(gc);
if (xfs_is_inodegc_enabled(mp)) {
trace_xfs_inodegc_queue(mp, __return_address);
- queue_work(mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
}
+ put_cpu_ptr(gc);
}
/*
@@ -2173,7 +2191,7 @@ xfs_inodegc_shrinker_scan(
unsigned int h = READ_ONCE(gc->shrinker_hits);
WRITE_ONCE(gc->shrinker_hits, h + 1);
- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
no_items = false;
}
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 2e4cfddf8b8e..6cd180721659 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp);
void xfs_blockgc_start(struct xfs_mount *mp);
void xfs_inodegc_worker(struct work_struct *work);
+void xfs_inodegc_push(struct xfs_mount *mp);
void xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 52d6f2c7d58b..3e1c62ffa4f7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -132,6 +132,26 @@ xfs_ilock_attr_map_shared(
}
/*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
+ * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
+ * to set in lock_flags.
+ */
+static inline void
+xfs_lock_flags_assert(
+ uint lock_flags)
+{
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+ (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ ASSERT(lock_flags != 0);
+}
+
+/*
* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
* multi-reader locks: invalidate_lock and the i_lock. This routine allows
* various combinations of the locks to be obtained.
@@ -168,18 +188,7 @@ xfs_ilock(
{
trace_xfs_ilock(ip, lock_flags, _RET_IP_);
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
down_write_nested(&VFS_I(ip)->i_rwsem,
@@ -222,18 +231,7 @@ xfs_ilock_nowait(
{
trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
@@ -291,19 +289,7 @@ xfs_iunlock(
xfs_inode_t *ip,
uint lock_flags)
{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
- ASSERT(lock_flags != 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL)
up_write(&VFS_I(ip)->i_rwsem);
@@ -379,8 +365,8 @@ xfs_isilocked(
}
if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
- return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
- (lock_flags & XFS_IOLOCK_SHARED));
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
+ (lock_flags & XFS_MMAPLOCK_SHARED));
}
if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 1e972f884a81..ae904b21e9cc 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2092,8 +2092,6 @@ xlog_dealloc_log(
xlog_in_core_t *iclog, *next_iclog;
int i;
- xlog_cil_destroy(log);
-
/*
* Cycle all the iclogbuf locks to make sure all log IO completion
* is done before we tear down these buffers.
@@ -2105,6 +2103,13 @@ xlog_dealloc_log(
iclog = iclog->ic_next;
}
+ /*
+ * Destroy the CIL after waiting for iclog IO completion because an
+ * iclog EIO error will try to shut down the log, which accesses the
+ * CIL to wake up the waiters.
+ */
+ xlog_cil_destroy(log);
+
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
next_iclog = iclog->ic_next;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index ba5d42abf66e..d2eaebd85abf 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -61,7 +61,7 @@ struct xfs_error_cfg {
*/
struct xfs_inodegc {
struct llist_head list;
- struct work_struct work;
+ struct delayed_work work;
/* approximate count of inodes in the list */
unsigned int items;
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 74ac9ca9e119..392cb39cc10c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -454,9 +454,12 @@ xfs_qm_scall_getquota(
struct xfs_dquot *dqp;
int error;
- /* Flush inodegc work at the start of a quota reporting scan. */
+ /*
+ * Expedite pending inodegc work at the start of a quota reporting
+ * scan but don't block waiting for it to complete.
+ */
if (id == 0)
- xfs_inodegc_flush(mp);
+ xfs_inodegc_push(mp);
/*
* Try to get the dquot. We don't want it allocated on disk, so don't
@@ -498,7 +501,7 @@ xfs_qm_scall_getquota_next(
/* Flush inodegc work at the start of a quota reporting scan. */
if (*id == 0)
- xfs_inodegc_flush(mp);
+ xfs_inodegc_push(mp);
error = xfs_qm_dqget_next(mp, *id, type, &dqp);
if (error)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ed18160e6181..aa977c7ea370 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -797,8 +797,11 @@ xfs_fs_statfs(
xfs_extlen_t lsize;
int64_t ffree;
- /* Wait for whatever inactivations are in progress. */
- xfs_inodegc_flush(mp);
+ /*
+ * Expedite background inodegc but don't wait. We do not want to block
+ * here waiting hours for a billion extent file to be truncated.
+ */
+ xfs_inodegc_push(mp);
statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
@@ -1074,7 +1077,7 @@ xfs_inodegc_init_percpu(
gc = per_cpu_ptr(mp->m_inodegc, cpu);
init_llist_head(&gc->list);
gc->items = 0;
- INIT_WORK(&gc->work, xfs_inodegc_worker);
+ INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
}
return 0;
}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index d32026585c1b..0fa1b7a2918c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \
TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
TP_ARGS(mp, caller_ip))
DEFINE_FS_EVENT(xfs_inodegc_flush);
+DEFINE_FS_EVENT(xfs_inodegc_push);
DEFINE_FS_EVENT(xfs_inodegc_start);
DEFINE_FS_EVENT(xfs_inodegc_stop);
DEFINE_FS_EVENT(xfs_inodegc_queue);