From f0bfa76a11e93d0fe2c896fcb566568c5e8b5d3f Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 28 Oct 2021 16:03:41 +0100
Subject: btrfs: fix ENOSPC failure when attempting direct IO write into NOCOW
 range

When doing a direct IO write against a file range that either has
preallocated extents in that range or has regular extents and the file
has the NOCOW attribute set, the write fails with -ENOSPC when all of
the following conditions are met:

1) There are no data block groups with enough free space matching
   the size of the write;

2) There's not enough unallocated space for allocating a new data block
   group;

3) The extents in the target file range are not shared, neither through
   snapshots nor through reflinks.

This is wrong because a NOCOW write can be done in such a case, and in fact
it's possible to do it using a buffered IO write, since when failing to
allocate data space, the buffered IO path checks if a NOCOW write is
possible.

The failure in direct IO write path comes from the fact that early on,
at btrfs_dio_iomap_begin(), we try to allocate data space for the write
and if that fails we return the error and stop - we never check if we
can do NOCOW. But later, at btrfs_get_blocks_direct_write(), we check
if we can do a NOCOW write into the range, or a subset of the range, and
then release the previously reserved data space.

Fix this by doing the data reservation only if needed, when we must COW,
at btrfs_get_blocks_direct_write() instead of doing it at
btrfs_dio_iomap_begin(). This also simplifies the logic a bit and removes
the inefficiency of doing unnecessary data reservations.

The following example test script reproduces the problem:

  $ cat dio-nocow-enospc.sh
  #!/bin/bash

  DEV=/dev/sdj
  MNT=/mnt/sdj

  # Use a small fixed size (1G) filesystem so that it's quick to fill
  # it up.
  # Make sure the mixed block groups feature is not enabled because we
  # later want to not have more space available for allocating data
  # extents but still have enough metadata space free for the file writes.
  mkfs.btrfs -f -b $((1024 * 1024 * 1024)) -O ^mixed-bg $DEV
  mount $DEV $MNT

  # Create our test file with the NOCOW attribute set.
  touch $MNT/foobar
  chattr +C $MNT/foobar

  # Now fill in all unallocated space with data for our test file.
  # This will allocate a data block group that will be full and leave
  # no (or a very small amount of) unallocated space in the device, so
  # that it will not be possible to allocate a new block group later.
  echo
  echo "Creating test file with initial data..."
  xfs_io -c "pwrite -S 0xab -b 1M 0 900M" $MNT/foobar

  # Now try a direct IO write against file range [0, 10M[.
  # This should succeed since this is a NOCOW file and an extent for the
  # range was previously allocated.
  echo
  echo "Trying direct IO write over allocated space..."
  xfs_io -d -c "pwrite -S 0xcd -b 10M 0 10M" $MNT/foobar

  umount $MNT

When running the test:

  $ ./dio-nocow-enospc.sh
  (...)

  Creating test file with initial data...
  wrote 943718400/943718400 bytes at offset 0
  900 MiB, 900 ops; 0:00:01.43 (625.526 MiB/sec and 625.5265 ops/sec)

  Trying direct IO write over allocated space...
  pwrite: No space left on device

A test case for fstests will follow, testing both this direct IO write
scenario as well as the buffered IO write scenario to make it less likely
to get future regressions on the buffered IO case.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 142 ++++++++++++++++++++++++++++++-------------------------
 1 file changed, 78 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b8c911a4a320..549853622b64 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -61,8 +61,6 @@ struct btrfs_iget_args {
 };
 
 struct btrfs_dio_data {
-	u64 reserve;
-	loff_t length;
 	ssize_t submitted;
 	struct extent_changeset *data_reserved;
 };
@@ -7773,6 +7771,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_map *em = *map;
+	int type;
+	u64 block_start, orig_start, orig_block_len, ram_bytes;
+	bool can_nocow = false;
+	bool space_reserved = false;
 	int ret = 0;
 
 	/*
@@ -7787,9 +7789,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
 	     em->block_start != EXTENT_MAP_HOLE)) {
-		int type;
-		u64 block_start, orig_start, orig_block_len, ram_bytes;
-
 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 			type = BTRFS_ORDERED_PREALLOC;
 		else
@@ -7799,53 +7798,92 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
 
 		if (can_nocow_extent(inode, start, &len, &orig_start,
 				     &orig_block_len, &ram_bytes, false) == 1 &&
-		    btrfs_inc_nocow_writers(fs_info, block_start)) {
-			struct extent_map *em2;
+		    btrfs_inc_nocow_writers(fs_info, block_start))
+			can_nocow = true;
+	}
 
-			em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
-						      orig_start, block_start,
-						      len, orig_block_len,
-						      ram_bytes, type);
+	if (can_nocow) {
+		struct extent_map *em2;
+
+		/* We can NOCOW, so only need to reserve metadata space. */
+		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
+		if (ret < 0) {
+			/* Our caller expects us to free the input extent map. */
+			free_extent_map(em);
+			*map = NULL;
 			btrfs_dec_nocow_writers(fs_info, block_start);
-			if (type == BTRFS_ORDERED_PREALLOC) {
-				free_extent_map(em);
-				*map = em = em2;
-			}
+			goto out;
+		}
+		space_reserved = true;
 
-			if (em2 && IS_ERR(em2)) {
-				ret = PTR_ERR(em2);
-				goto out;
-			}
-			/*
-			 * For inode marked NODATACOW or extent marked PREALLOC,
-			 * use the existing or preallocated extent, so does not
-			 * need to adjust btrfs_space_info's bytes_may_use.
-			 */
-			btrfs_free_reserved_data_space_noquota(fs_info, len);
-			goto skip_cow;
+		em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
+					      orig_start, block_start,
+					      len, orig_block_len,
+					      ram_bytes, type);
+		btrfs_dec_nocow_writers(fs_info, block_start);
+		if (type == BTRFS_ORDERED_PREALLOC) {
+			free_extent_map(em);
+			*map = em = em2;
 		}
-	}
 
-	/* this will cow the extent */
-	free_extent_map(em);
-	*map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
-	if (IS_ERR(em)) {
-		ret = PTR_ERR(em);
-		goto out;
+		if (IS_ERR(em2)) {
+			ret = PTR_ERR(em2);
+			goto out;
+		}
+	} else {
+		const u64 prev_len = len;
+
+		/* Our caller expects us to free the input extent map. */
+		free_extent_map(em);
+		*map = NULL;
+
+		/* We have to COW, so need to reserve metadata and data space. */
+		ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+						   &dio_data->data_reserved,
+						   start, len);
+		if (ret < 0)
+			goto out;
+		space_reserved = true;
+
+		em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+		*map = em;
+		len = min(len, em->len - (start - em->start));
+		if (len < prev_len)
+			btrfs_delalloc_release_space(BTRFS_I(inode),
+						     dio_data->data_reserved,
+						     start + len, prev_len - len,
+						     true);
 	}
 
-	len = min(len, em->len - (start - em->start));
+	/*
+	 * We have created our ordered extent, so we can now release our reservation
+	 * for an outstanding extent.
+	 */
+	btrfs_delalloc_release_extents(BTRFS_I(inode), len);
 
-skip_cow:
 	/*
 	 * Need to update the i_size under the extent lock so buffered
 	 * readers will get the updated i_size when we unlock.
 	 */
 	if (start + len > i_size_read(inode))
 		i_size_write(inode, start + len);
-
-	dio_data->reserve -= len;
 out:
+	if (ret && space_reserved) {
+		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+		if (can_nocow) {
+			btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
+		} else {
+			btrfs_delalloc_release_space(BTRFS_I(inode),
+						     dio_data->data_reserved,
+						     start, len, true);
+			extent_changeset_free(dio_data->data_reserved);
+			dio_data->data_reserved = NULL;
+		}
+	}
 	return ret;
 }
 
@@ -7887,18 +7925,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 	if (!dio_data)
 		return -ENOMEM;
 
-	dio_data->length = length;
-	if (write) {
-		dio_data->reserve = round_up(length, fs_info->sectorsize);
-		ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
-				&dio_data->data_reserved,
-				start, dio_data->reserve);
-		if (ret) {
-			extent_changeset_free(dio_data->data_reserved);
-			kfree(dio_data);
-			return ret;
-		}
-	}
 	iomap->private = dio_data;
 
 
@@ -7991,14 +8017,8 @@ unlock_err:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			     &cached_state);
 err:
-	if (dio_data) {
-		btrfs_delalloc_release_space(BTRFS_I(inode),
-				dio_data->data_reserved, start,
-				dio_data->reserve, true);
-		btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
-		extent_changeset_free(dio_data->data_reserved);
-		kfree(dio_data);
-	}
+	kfree(dio_data);
+
 	return ret;
 }
 
@@ -8028,14 +8048,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 		ret = -ENOTBLK;
 	}
 
-	if (write) {
-		if (dio_data->reserve)
-			btrfs_delalloc_release_space(BTRFS_I(inode),
-					dio_data->data_reserved, pos,
-					dio_data->reserve, true);
-		btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+	if (write)
 		extent_changeset_free(dio_data->data_reserved);
-	}
 out:
 	kfree(dio_data);
 	iomap->private = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 232796df8c1437c41d308d161007f0715bac0a54 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 27 Oct 2021 18:30:25 +0100
Subject: btrfs: fix deadlock between quota enable and other quota operations

When enabling quotas, we attempt to commit a transaction while holding the
mutex fs_info->qgroup_ioctl_lock. This can result in a deadlock with other
quota operations such as:

- qgroup creation and deletion, ioctl BTRFS_IOC_QGROUP_CREATE;

- adding and removing qgroup relations, ioctl BTRFS_IOC_QGROUP_ASSIGN.

This is because these operations join a transaction and after that they
attempt to lock the mutex fs_info->qgroup_ioctl_lock. Acquiring that mutex
after joining or starting a transaction is a pattern followed everywhere
in qgroups, so the quota enablement operation is the one at fault here,
and should not commit a transaction while holding that mutex.

Fix this by making the transaction commit while not holding the mutex.
We are safe from two concurrent tasks trying to enable quotas because
we are serialized by the rw semaphore fs_info->subvol_sem at
btrfs_ioctl_quota_ctl(), which is the only call site for enabling
quotas.

When this deadlock happens, it produces a trace like the following:

  INFO: task syz-executor:25604 blocked for more than 143 seconds.
  Not tainted 5.15.0-rc6 #4
  "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  task:syz-executor state:D stack:24800 pid:25604 ppid: 24873 flags:0x00004004
  Call Trace:
  context_switch kernel/sched/core.c:4940 [inline]
  __schedule+0xcd9/0x2530 kernel/sched/core.c:6287
  schedule+0xd3/0x270 kernel/sched/core.c:6366
  btrfs_commit_transaction+0x994/0x2e90 fs/btrfs/transaction.c:2201
  btrfs_quota_enable+0x95c/0x1790 fs/btrfs/qgroup.c:1120
  btrfs_ioctl_quota_ctl fs/btrfs/ioctl.c:4229 [inline]
  btrfs_ioctl+0x637e/0x7b70 fs/btrfs/ioctl.c:5010
  vfs_ioctl fs/ioctl.c:51 [inline]
  __do_sys_ioctl fs/ioctl.c:874 [inline]
  __se_sys_ioctl fs/ioctl.c:860 [inline]
  __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
  entry_SYSCALL_64_after_hwframe+0x44/0xae
  RIP: 0033:0x7f86920b2c4d
  RSP: 002b:00007f868f61ac58 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
  RAX: ffffffffffffffda RBX: 00007f86921d90a0 RCX: 00007f86920b2c4d
  RDX: 0000000020005e40 RSI: 00000000c0109428 RDI: 0000000000000008
  RBP: 00007f869212bd80 R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000246 R12: 00007f86921d90a0
  R13: 00007fff6d233e4f R14: 00007fff6d233ff0 R15: 00007f868f61adc0
  INFO: task syz-executor:25628 blocked for more than 143 seconds.
  Not tainted 5.15.0-rc6 #4
  "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  task:syz-executor state:D stack:29080 pid:25628 ppid: 24873 flags:0x00004004
  Call Trace:
  context_switch kernel/sched/core.c:4940 [inline]
  __schedule+0xcd9/0x2530 kernel/sched/core.c:6287
  schedule+0xd3/0x270 kernel/sched/core.c:6366
  schedule_preempt_disabled+0xf/0x20 kernel/sched/core.c:6425
  __mutex_lock_common kernel/locking/mutex.c:669 [inline]
  __mutex_lock+0xc96/0x1680 kernel/locking/mutex.c:729
  btrfs_remove_qgroup+0xb7/0x7d0 fs/btrfs/qgroup.c:1548
  btrfs_ioctl_qgroup_create fs/btrfs/ioctl.c:4333 [inline]
  btrfs_ioctl+0x683c/0x7b70 fs/btrfs/ioctl.c:5014
  vfs_ioctl fs/ioctl.c:51 [inline]
  __do_sys_ioctl fs/ioctl.c:874 [inline]
  __se_sys_ioctl fs/ioctl.c:860 [inline]
  __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
  entry_SYSCALL_64_after_hwframe+0x44/0xae

Reported-by: Hao Sun <sunhao.th@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CACkBjsZQF19bQ1C6=yetF3BvL10OSORpFUcWXTP6HErshDB4dQ@mail.gmail.com/
Fixes: 340f1aa27f36 ("btrfs: qgroups: Move transaction management inside btrfs_quota_enable/disable")
CC: stable@vger.kernel.org # 4.19
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/qgroup.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 6c037f1252b7..071f7334f818 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -940,6 +940,14 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 	int ret = 0;
 	int slot;
 
+	/*
+	 * We need to have subvol_sem write locked, to prevent races between
+	 * concurrent tasks trying to enable quotas, because we will unlock
+	 * and relock qgroup_ioctl_lock before setting fs_info->quota_root
+	 * and before setting BTRFS_FS_QUOTA_ENABLED.
+	 */
+	lockdep_assert_held_write(&fs_info->subvol_sem);
+
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (fs_info->quota_root)
 		goto out;
@@ -1117,8 +1125,19 @@ out_add_root:
 		goto out_free_path;
 	}
 
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	/*
+	 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
+	 * a deadlock with tasks concurrently doing other qgroup operations, such
+	 * adding/removing qgroups or adding/deleting qgroup relations for example,
+	 * because all qgroup operations first start or join a transaction and then
+	 * lock the qgroup_ioctl_lock mutex.
+	 * We are safe from a concurrent task trying to enable quotas, by calling
+	 * this function, since we are serialized by fs_info->subvol_sem.
+	 */
 	ret = btrfs_commit_transaction(trans);
 	trans = NULL;
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (ret)
 		goto out_free_path;
 
-- 
cgit v1.2.3-59-g8ed1b


From 17130a65f0cd71f9c26bec8f0f097fc61013b6f8 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 14 Oct 2021 10:03:11 +0300
Subject: btrfs: remove spurious unlock/lock of unused_bgs_lock

Since both the unused block groups list and the reclaim bgs list are
protected by unused_bgs_lock, free them in the same critical section
without doing an extra unlock/lock pair.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 444e9c89ff3e..6ab864655090 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -3911,9 +3911,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		list_del_init(&block_group->bg_list);
 		btrfs_put_block_group(block_group);
 	}
-	spin_unlock(&info->unused_bgs_lock);
 
-	spin_lock(&info->unused_bgs_lock);
 	while (!list_empty(&info->reclaim_bgs)) {
 		block_group = list_first_entry(&info->reclaim_bgs,
 					       struct btrfs_block_group,
-- 
cgit v1.2.3-59-g8ed1b


From 339d035424849c89fe29913d07b08b153596bfb8 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 25 Oct 2021 17:31:53 +0100
Subject: btrfs: only copy dir index keys when logging a directory

Currently, when logging a directory, we copy both dir items and dir index
items from the fs/subvolume tree to the log tree. Both items have exactly
the same data (same struct btrfs_dir_item), the difference lies in the key
values, where a dir index key contains the index number of a directory
entry while the dir item key does not, as it's used for doing fast lookups
of an entry by name, while the former is used for sorting entries when
listing a directory.

We can exploit that and log only the dir index items, since they contain
all the information needed to correctly add, replace and delete directory
entries when replaying a log tree. Logging only the dir index items is
also backward and forward compatible: an unpatched kernel (without this
change) can correctly replay a log tree generated by a patched kernel
(with this patch), and a patched kernel can correctly replay a log tree
generated by an unpatched kernel.

The backward compatibility is ensured because:

1) For inserting a new dentry: a dentry is only inserted when we find a
   new dir index key - we can only insert if we know the dir index offset,
   which is encoded in the dir index key's offset;

2) For deleting dentries: during log replay, before adding or replacing
   dentries, we first replay dentry deletions. Whenever we find a dir item
   key or a dir index key in the subvolume/fs tree that is not logged in
   a range for which the log tree is authoritative, we do the unlink of
   the dentry, which removes both the existing dir item key and the dir
   index key. Therefore logging just dir index keys is enough to ensure
   dentry deletions are correctly replayed;

3) For dentry replacements: they work when we log only dir index keys
   and this is mostly due to a combination of 1) and 2). If we replace a
   dentry with name "foobar" to point from inode A to inode B, then we
   know the dir index key for the new dentry is different from the old
   one, as it has an index number (key offset) larger than the old one.
   This results in replaying a deletion, through replay_dir_deletes(),
   that causes the old dentry to be removed, both the dir item key and
   the dir index key, as mentioned at 2). Then when processing the new
   dir index key, we add the new dentry, adding both a new dir item key
   and a new index key pointing to inode B, as stated in 1).

The forward compatibility, the ability for a patched kernel to replay a
log created by an older, unpatched kernel, comes from the changes required
for making sure we are able to replay a log that only contains dir index
keys - we simply ignore every dir item key we find.

So modify directory logging to log only dir index items, and modify the
log replay process to ignore dir item keys, from log trees created by an
unpatched kernel, and process only dir index keys. This reduces the
amount of logged metadata by about half, and therefore the time spent
logging or fsyncing large directories (less CPU time and less IO).

The following test script was used to measure this change:

   #!/bin/bash

   DEV=/dev/nvme0n1
   MNT=/mnt/nvme0n1

   NUM_NEW_FILES=1000000
   NUM_FILE_DELETES=10000

   mkfs.btrfs -f $DEV
   mount -o ssd $DEV $MNT

   mkdir $MNT/testdir

   for ((i = 1; i <= $NUM_NEW_FILES; i++)); do
           echo -n > $MNT/testdir/file_$i
   done

   start=$(date +%s%N)
   xfs_io -c "fsync" $MNT/testdir
   end=$(date +%s%N)

   dur=$(( (end - start) / 1000000 ))
   echo "dir fsync took $dur ms after adding $NUM_NEW_FILES files"

   # sync to force transaction commit and wipeout the log.
   sync

   del_inc=$(( $NUM_NEW_FILES / $NUM_FILE_DELETES ))
   for ((i = 1; i <= $NUM_NEW_FILES; i += $del_inc)); do
           rm -f $MNT/testdir/file_$i
   done

   start=$(date +%s%N)
   xfs_io -c "fsync" $MNT/testdir
   end=$(date +%s%N)

   dur=$(( (end - start) / 1000000 ))
   echo "dir fsync took $dur ms after deleting $NUM_FILE_DELETES files"
   echo

   umount $MNT

The tests were run on a physical machine, with a non-debug kernel (Debian's
default kernel config), for different values of $NUM_NEW_FILES and
$NUM_FILE_DELETES, and the results were the following:

** Before patch, NUM_NEW_FILES = 1 000 000, NUM_DELETE_FILES = 10 000 **

dir fsync took 8412 ms after adding 1000000 files
dir fsync took 500 ms after deleting 10000 files

** After patch, NUM_NEW_FILES = 1 000 000, NUM_DELETE_FILES = 10 000 **

dir fsync took 4252 ms after adding 1000000 files   (-49.5%)
dir fsync took 269 ms after deleting 10000 files    (-46.2%)

** Before patch, NUM_NEW_FILES = 100 000, NUM_DELETE_FILES = 1 000 **

dir fsync took 745 ms after adding 100000 files
dir fsync took 59 ms after deleting 1000 files

** After patch, NUM_NEW_FILES = 100 000, NUM_DELETE_FILES = 1 000 **

dir fsync took 404 ms after adding 100000 files   (-45.8%)
dir fsync took 31 ms after deleting 1000 files    (-47.5%)

** Before patch, NUM_NEW_FILES = 10 000, NUM_DELETE_FILES = 1 000 **

dir fsync took 67 ms after adding 10000 files
dir fsync took 9 ms after deleting 1000 files

** After patch, NUM_NEW_FILES = 10 000, NUM_DELETE_FILES = 1 000 **

dir fsync took 36 ms after adding 10000 files   (-46.3%)
dir fsync took 5 ms after deleting 1000 files   (-44.4%)

** Before patch, NUM_NEW_FILES = 1 000, NUM_DELETE_FILES = 100 **

dir fsync took 9 ms after adding 1000 files
dir fsync took 4 ms after deleting 100 files

** After patch, NUM_NEW_FILES = 1 000, NUM_DELETE_FILES = 100 **

dir fsync took 7 ms after adding 1000 files     (-22.2%)
dir fsync took 3 ms after deleting 100 files    (-25.0%)

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/btrfs_inode.h |  18 +--
 fs/btrfs/tree-log.c    | 395 ++++++++++++++++++++++---------------------------
 2 files changed, 182 insertions(+), 231 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ab2a4a52e0bb..b3e46aabc3d8 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -138,19 +138,11 @@ struct btrfs_inode {
 	/* a local copy of root's last_log_commit */
 	int last_log_commit;
 
-	union {
-		/*
-		 * Total number of bytes pending delalloc, used by stat to
-		 * calculate the real block usage of the file. This is used
-		 * only for files.
-		 */
-		u64 delalloc_bytes;
-		/*
-		 * The offset of the last dir item key that was logged.
-		 * This is used only for directories.
-		 */
-		u64 last_dir_item_offset;
-	};
+	/*
+	 * Total number of bytes pending delalloc, used by stat to calculate the
+	 * real block usage of the file. This is used only for files.
+	 */
+	u64 delalloc_bytes;
 
 	union {
 		/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6993dcdba6f1..06defcd559a0 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1950,6 +1950,34 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
+					struct btrfs_inode *dir,
+					struct btrfs_path *path,
+					struct btrfs_dir_item *dst_di,
+					const struct btrfs_key *log_key,
+					u8 log_type,
+					bool exists)
+{
+	struct btrfs_key found_key;
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+	/* The existing dentry points to the same inode, don't delete it. */
+	if (found_key.objectid == log_key->objectid &&
+	    found_key.type == log_key->type &&
+	    found_key.offset == log_key->offset &&
+	    btrfs_dir_type(path->nodes[0], dst_di) == log_type)
+		return 1;
+
+	/*
+	 * Don't drop the conflicting directory entry if the inode for the new
+	 * entry doesn't exist.
+	 */
+	if (!exists)
+		return 0;
+
+	return drop_one_dir_item(trans, path, dir, dst_di);
+}
+
 /*
  * take a single entry in a log directory item and replay it into
  * the subvolume.
@@ -1975,14 +2003,17 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 {
 	char *name;
 	int name_len;
-	struct btrfs_dir_item *dst_di;
-	struct btrfs_key found_key;
+	struct btrfs_dir_item *dir_dst_di;
+	struct btrfs_dir_item *index_dst_di;
+	bool dir_dst_matches = false;
+	bool index_dst_matches = false;
 	struct btrfs_key log_key;
+	struct btrfs_key search_key;
 	struct inode *dir;
 	u8 log_type;
 	bool exists;
 	int ret;
-	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+	bool update_size = true;
 	bool name_added = false;
 
 	dir = read_one_inode(root, key->objectid);
@@ -2008,76 +2039,53 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	exists = (ret == 0);
 	ret = 0;
 
-	if (key->type == BTRFS_DIR_ITEM_KEY) {
-		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
-				       name, name_len, 1);
-	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
-		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
-						     key->objectid,
-						     key->offset, name,
-						     name_len, 1);
-	} else {
-		/* Corruption */
-		ret = -EINVAL;
+	dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+					   name, name_len, 1);
+	if (IS_ERR(dir_dst_di)) {
+		ret = PTR_ERR(dir_dst_di);
 		goto out;
+	} else if (dir_dst_di) {
+		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
+						   dir_dst_di, &log_key, log_type,
+						   exists);
+		if (ret < 0)
+			goto out;
+		dir_dst_matches = (ret == 1);
 	}
 
-	if (IS_ERR(dst_di)) {
-		ret = PTR_ERR(dst_di);
+	btrfs_release_path(path);
+
+	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+						   key->objectid, key->offset,
+						   name, name_len, 1);
+	if (IS_ERR(index_dst_di)) {
+		ret = PTR_ERR(index_dst_di);
 		goto out;
-	} else if (!dst_di) {
-		/* we need a sequence number to insert, so we only
-		 * do inserts for the BTRFS_DIR_INDEX_KEY types
-		 */
-		if (key->type != BTRFS_DIR_INDEX_KEY)
+	} else if (index_dst_di) {
+		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
+						   index_dst_di, &log_key,
+						   log_type, exists);
+		if (ret < 0)
 			goto out;
-		goto insert;
+		index_dst_matches = (ret == 1);
 	}
 
-	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
-	/* the existing item matches the logged item */
-	if (found_key.objectid == log_key.objectid &&
-	    found_key.type == log_key.type &&
-	    found_key.offset == log_key.offset &&
-	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+	btrfs_release_path(path);
+
+	if (dir_dst_matches && index_dst_matches) {
+		ret = 0;
 		update_size = false;
 		goto out;
 	}
 
-	/*
-	 * don't drop the conflicting directory entry if the inode
-	 * for the new entry doesn't exist
-	 */
-	if (!exists)
-		goto out;
-
-	ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di);
-	if (ret)
-		goto out;
-
-	if (key->type == BTRFS_DIR_INDEX_KEY)
-		goto insert;
-out:
-	btrfs_release_path(path);
-	if (!ret && update_size) {
-		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
-		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
-	}
-	kfree(name);
-	iput(dir);
-	if (!ret && name_added)
-		ret = 1;
-	return ret;
-
-insert:
 	/*
 	 * Check if the inode reference exists in the log for the given name,
 	 * inode and parent inode
 	 */
-	found_key.objectid = log_key.objectid;
-	found_key.type = BTRFS_INODE_REF_KEY;
-	found_key.offset = key->objectid;
-	ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
+	search_key.objectid = log_key.objectid;
+	search_key.type = BTRFS_INODE_REF_KEY;
+	search_key.offset = key->objectid;
+	ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
 	if (ret < 0) {
 	        goto out;
 	} else if (ret) {
@@ -2087,10 +2095,10 @@ insert:
 	        goto out;
 	}
 
-	found_key.objectid = log_key.objectid;
-	found_key.type = BTRFS_INODE_EXTREF_KEY;
-	found_key.offset = key->objectid;
-	ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
+	search_key.objectid = log_key.objectid;
+	search_key.type = BTRFS_INODE_EXTREF_KEY;
+	search_key.offset = key->objectid;
+	ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
 			     name_len);
 	if (ret < 0) {
 		goto out;
@@ -2109,87 +2117,76 @@ insert:
 		name_added = true;
 	update_size = false;
 	ret = 0;
-	goto out;
+
+out:
+	if (!ret && update_size) {
+		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
+		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+	}
+	kfree(name);
+	iput(dir);
+	if (!ret && name_added)
+		ret = 1;
+	return ret;
 }
 
-/*
- * find all the names in a directory item and reconcile them into
- * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
- * one name in a directory item, but the same code gets used for
- * both directory index types
- */
+/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_path *path,
 					struct extent_buffer *eb, int slot,
 					struct btrfs_key *key)
 {
-	int ret = 0;
-	u32 item_size = btrfs_item_size_nr(eb, slot);
+	int ret;
 	struct btrfs_dir_item *di;
-	int name_len;
-	unsigned long ptr;
-	unsigned long ptr_end;
-	struct btrfs_path *fixup_path = NULL;
 
-	ptr = btrfs_item_ptr_offset(eb, slot);
-	ptr_end = ptr + item_size;
-	while (ptr < ptr_end) {
-		di = (struct btrfs_dir_item *)ptr;
-		name_len = btrfs_dir_name_len(eb, di);
-		ret = replay_one_name(trans, root, path, eb, di, key);
-		if (ret < 0)
-			break;
-		ptr = (unsigned long)(di + 1);
-		ptr += name_len;
+	/* We only log dir index keys, which only contain a single dir item. */
+	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
 
-		/*
-		 * If this entry refers to a non-directory (directories can not
-		 * have a link count > 1) and it was added in the transaction
-		 * that was not committed, make sure we fixup the link count of
-		 * the inode it the entry points to. Otherwise something like
-		 * the following would result in a directory pointing to an
-		 * inode with a wrong link that does not account for this dir
-		 * entry:
-		 *
-		 * mkdir testdir
-		 * touch testdir/foo
-		 * touch testdir/bar
-		 * sync
-		 *
-		 * ln testdir/bar testdir/bar_link
-		 * ln testdir/foo testdir/foo_link
-		 * xfs_io -c "fsync" testdir/bar
-		 *
-		 * <power failure>
-		 *
-		 * mount fs, log replay happens
-		 *
-		 * File foo would remain with a link count of 1 when it has two
-		 * entries pointing to it in the directory testdir. This would
-		 * make it impossible to ever delete the parent directory has
-		 * it would result in stale dentries that can never be deleted.
-		 */
-		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
-			struct btrfs_key di_key;
+	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+	ret = replay_one_name(trans, root, path, eb, di, key);
+	if (ret < 0)
+		return ret;
 
-			if (!fixup_path) {
-				fixup_path = btrfs_alloc_path();
-				if (!fixup_path) {
-					ret = -ENOMEM;
-					break;
-				}
-			}
+	/*
+	 * If this entry refers to a non-directory (directories can not have a
+	 * link count > 1) and it was added in the transaction that was not
+	 * committed, make sure we fixup the link count of the inode the entry
+	 * points to. Otherwise something like the following would result in a
+	 * directory pointing to an inode with a wrong link that does not account
+	 * for this dir entry:
+	 *
+	 * mkdir testdir
+	 * touch testdir/foo
+	 * touch testdir/bar
+	 * sync
+	 *
+	 * ln testdir/bar testdir/bar_link
+	 * ln testdir/foo testdir/foo_link
+	 * xfs_io -c "fsync" testdir/bar
+	 *
+	 * <power failure>
+	 *
+	 * mount fs, log replay happens
+	 *
+	 * File foo would remain with a link count of 1 when it has two entries
+	 * pointing to it in the directory testdir. This would make it impossible
+	 * to ever delete the parent directory has it would result in stale
+	 * dentries that can never be deleted.
+	 */
+	if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+		struct btrfs_path *fixup_path;
+		struct btrfs_key di_key;
 
-			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
-			ret = link_to_fixup_dir(trans, root, fixup_path,
-						di_key.objectid);
-			if (ret)
-				break;
-		}
-		ret = 0;
+		fixup_path = btrfs_alloc_path();
+		if (!fixup_path)
+			return -ENOMEM;
+
+		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+		ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
+		btrfs_free_path(fixup_path);
 	}
-	btrfs_free_path(fixup_path);
+
 	return ret;
 }
 
@@ -2743,12 +2740,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 						eb, i, &key);
 			if (ret)
 				break;
-		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
-			ret = replay_one_dir_item(wc->trans, root, path,
-						  eb, i, &key);
-			if (ret)
-				break;
 		}
+		/*
+		 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
+		 * BTRFS_DIR_INDEX_KEY items which we use to derive the
+		 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
+		 * older kernel with such keys, ignore them.
+		 */
 	}
 	btrfs_free_path(path);
 	return ret;
@@ -3551,20 +3549,10 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 		goto out_unlock;
 	}
 
-	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
-				   name, name_len, -1);
-	if (IS_ERR(di)) {
-		err = PTR_ERR(di);
-		goto fail;
-	}
-	if (di) {
-		ret = btrfs_delete_one_dir_name(trans, log, path, di);
-		if (ret) {
-			err = ret;
-			goto fail;
-		}
-	}
-	btrfs_release_path(path);
+	/*
+	 * We only log dir index items of a directory, so we don't need to look
+	 * for dir item keys.
+	 */
 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
 					 index, name, name_len, -1);
 	if (IS_ERR(di)) {
@@ -3628,7 +3616,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *log,
 				       struct btrfs_path *path,
-				       int key_type, u64 dirid,
+				       u64 dirid,
 				       u64 first_offset, u64 last_offset)
 {
 	int ret;
@@ -3637,10 +3625,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
 
 	key.objectid = dirid;
 	key.offset = first_offset;
-	if (key_type == BTRFS_DIR_ITEM_KEY)
-		key.type = BTRFS_DIR_LOG_ITEM_KEY;
-	else
-		key.type = BTRFS_DIR_LOG_INDEX_KEY;
+	key.type = BTRFS_DIR_LOG_INDEX_KEY;
 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
 	if (ret)
 		return ret;
@@ -3732,7 +3717,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
 				  struct btrfs_inode *inode,
 				  struct btrfs_path *path,
 				  struct btrfs_path *dst_path,
-				  int key_type,
 				  struct btrfs_log_ctx *ctx)
 {
 	struct btrfs_root *log = inode->root->log_root;
@@ -3740,24 +3724,18 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
 	const int nritems = btrfs_header_nritems(src);
 	const u64 ino = btrfs_ino(inode);
 	const bool inode_logged_before = inode_logged(trans, inode);
-	u64 last_logged_key_offset;
 	bool last_found = false;
 	int batch_start = 0;
 	int batch_size = 0;
 	int i;
 
-	if (key_type == BTRFS_DIR_ITEM_KEY)
-		last_logged_key_offset = inode->last_dir_item_offset;
-	else
-		last_logged_key_offset = inode->last_dir_index_offset;
-
 	for (i = path->slots[0]; i < nritems; i++) {
 		struct btrfs_key key;
 		int ret;
 
 		btrfs_item_key_to_cpu(src, &key, i);
 
-		if (key.objectid != ino || key.type != key_type) {
+		if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
 			last_found = true;
 			break;
 		}
@@ -3806,7 +3784,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
 		 * we logged is in the log tree, saving time and avoiding adding
 		 * contention on the log tree.
 		 */
-		if (key.offset > last_logged_key_offset)
+		if (key.offset > inode->last_dir_index_offset)
 			goto add_to_batch;
 		/*
 		 * Check if the key was already logged before. If not we can add
@@ -3865,7 +3843,7 @@ add_to_batch:
 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			  struct btrfs_inode *inode,
 			  struct btrfs_path *path,
-			  struct btrfs_path *dst_path, int key_type,
+			  struct btrfs_path *dst_path,
 			  struct btrfs_log_ctx *ctx,
 			  u64 min_offset, u64 *last_offset_ret)
 {
@@ -3879,7 +3857,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	u64 ino = btrfs_ino(inode);
 
 	min_key.objectid = ino;
-	min_key.type = key_type;
+	min_key.type = BTRFS_DIR_INDEX_KEY;
 	min_key.offset = min_offset;
 
 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
@@ -3888,9 +3866,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	 * we didn't find anything from this transaction, see if there
 	 * is anything at all
 	 */
-	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
+	if (ret != 0 || min_key.objectid != ino ||
+	    min_key.type != BTRFS_DIR_INDEX_KEY) {
 		min_key.objectid = ino;
-		min_key.type = key_type;
+		min_key.type = BTRFS_DIR_INDEX_KEY;
 		min_key.offset = (u64)-1;
 		btrfs_release_path(path);
 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
@@ -3898,7 +3877,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			btrfs_release_path(path);
 			return ret;
 		}
-		ret = btrfs_previous_item(root, path, ino, key_type);
+		ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
 
 		/* if ret == 0 there are items for this type,
 		 * create a range to tell us the last key of this type.
@@ -3909,18 +3888,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			struct btrfs_key tmp;
 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
 					      path->slots[0]);
-			if (key_type == tmp.type)
+			if (tmp.type == BTRFS_DIR_INDEX_KEY)
 				first_offset = max(min_offset, tmp.offset) + 1;
 		}
 		goto done;
 	}
 
 	/* go backward to find any previous key */
-	ret = btrfs_previous_item(root, path, ino, key_type);
+	ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
 	if (ret == 0) {
 		struct btrfs_key tmp;
 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
-		if (key_type == tmp.type) {
+		if (tmp.type == BTRFS_DIR_INDEX_KEY) {
 			first_offset = tmp.offset;
 			ret = overwrite_item(trans, log, dst_path,
 					     path->nodes[0], path->slots[0],
@@ -3951,8 +3930,7 @@ search:
 	 * from our directory
 	 */
 	while (1) {
-		ret = process_dir_items_leaf(trans, inode, path, dst_path,
-					     key_type, ctx);
+		ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx);
 		if (ret != 0) {
 			if (ret < 0)
 				err = ret;
@@ -3973,7 +3951,7 @@ search:
 			goto done;
 		}
 		btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
-		if (min_key.objectid != ino || min_key.type != key_type) {
+		if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
 			last_offset = (u64)-1;
 			goto done;
 		}
@@ -4004,8 +3982,8 @@ done:
 		 * insert the log range keys to indicate where the log
 		 * is valid
 		 */
-		ret = insert_dir_log_key(trans, log, path, key_type,
-					 ino, first_offset, last_offset);
+		ret = insert_dir_log_key(trans, log, path, ino, first_offset,
+					 last_offset);
 		if (ret)
 			err = ret;
 	}
@@ -4033,35 +4011,28 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
 	u64 min_key;
 	u64 max_key;
 	int ret;
-	int key_type = BTRFS_DIR_ITEM_KEY;
 
 	/*
 	 * If this is the first time we are being logged in the current
 	 * transaction, or we were logged before but the inode was evicted and
-	 * reloaded later, in which case its logged_trans is 0, reset the values
-	 * of the last logged key offsets. Note that we don't use the helper
+	 * reloaded later, in which case its logged_trans is 0, reset the value
+	 * of the last logged key offset. Note that we don't use the helper
 	 * function inode_logged() here - that is because the function returns
 	 * true after an inode eviction, assuming the worst case as it can not
 	 * know for sure if the inode was logged before. So we can not skip key
 	 * searches in the case the inode was evicted, because it may not have
 	 * been logged in this transaction and may have been logged in a past
-	 * transaction, so we need to reset the last dir item and index offsets
-	 * to (u64)-1.
+	 * transaction, so we need to reset the last dir index offset to (u64)-1.
 	 */
-	if (inode->logged_trans != trans->transid) {
-		inode->last_dir_item_offset = (u64)-1;
+	if (inode->logged_trans != trans->transid)
 		inode->last_dir_index_offset = (u64)-1;
-	}
-again:
+
 	min_key = 0;
 	max_key = 0;
-	if (key_type == BTRFS_DIR_ITEM_KEY)
-		ctx->last_dir_item_offset = inode->last_dir_item_offset;
-	else
-		ctx->last_dir_item_offset = inode->last_dir_index_offset;
+	ctx->last_dir_item_offset = inode->last_dir_index_offset;
 
 	while (1) {
-		ret = log_dir_items(trans, inode, path, dst_path, key_type,
+		ret = log_dir_items(trans, inode, path, dst_path,
 				ctx, min_key, &max_key);
 		if (ret)
 			return ret;
@@ -4070,13 +4041,8 @@ again:
 		min_key = max_key + 1;
 	}
 
-	if (key_type == BTRFS_DIR_ITEM_KEY) {
-		inode->last_dir_item_offset = ctx->last_dir_item_offset;
-		key_type = BTRFS_DIR_INDEX_KEY;
-		goto again;
-	} else {
-		inode->last_dir_index_offset = ctx->last_dir_item_offset;
-	}
+	inode->last_dir_index_offset = ctx->last_dir_item_offset;
+
 	return 0;
 }
 
@@ -5899,18 +5865,12 @@ struct btrfs_dir_list {
  *    link_to_fixup_dir());
  *
  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
- *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
- *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ *    while logging the inode's items new index items (key type
+ *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
  *    has a size that doesn't match the sum of the lengths of all the logged
- *    names. This does not result in a problem because if a dir_item key is
- *    logged but its matching dir_index key is not logged, at log replay time we
- *    don't use it to replay the respective name (see replay_one_name()). On the
- *    other hand if only the dir_index key ends up being logged, the respective
- *    name is added to the fs/subvol tree with both the dir_item and dir_index
- *    keys created (see replay_one_name()).
- *    The directory's inode item with a wrong i_size is not a problem as well,
- *    since we don't use it at log replay time to set the i_size in the inode
- *    item of the fs/subvol tree (see overwrite_item()).
+ *    names - this is ok, not a problem, because at log replay time we set the
+ *    directory's i_size to the correct value (see replay_one_name() and
+ *    do_overwrite_item()).
  */
 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
@@ -5956,7 +5916,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
 			goto next_dir_inode;
 
 		min_key.objectid = dir_elem->ino;
-		min_key.type = BTRFS_DIR_ITEM_KEY;
+		min_key.type = BTRFS_DIR_INDEX_KEY;
 		min_key.offset = 0;
 again:
 		btrfs_release_path(path);
@@ -5981,7 +5941,7 @@ process_leaf:
 
 			btrfs_item_key_to_cpu(leaf, &min_key, i);
 			if (min_key.objectid != dir_elem->ino ||
-			    min_key.type != BTRFS_DIR_ITEM_KEY)
+			    min_key.type != BTRFS_DIR_INDEX_KEY)
 				goto next_dir_inode;
 
 			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
@@ -6795,15 +6755,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
 	 * was previously logged, make sure the next log attempt on the directory
 	 * is not skipped and logs the inode again. This is because the log may
 	 * not currently be authoritative for a range including the old
-	 * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make
-	 * sure after a log replay we do not end up with both the new and old
-	 * dentries around (in case the inode is a directory we would have a
-	 * directory with two hard links and 2 inode references for different
-	 * parents). The next log attempt of old_dir will happen at
-	 * btrfs_log_all_parents(), called through btrfs_log_inode_parent()
-	 * below, because we have previously set inode->last_unlink_trans to the
-	 * current transaction ID, either here or at btrfs_record_unlink_dir() in
-	 * case inode is a directory.
+	 * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we
+	 * do not end up with both the new and old dentries around (in case the
+	 * inode is a directory we would have a directory with two hard links and
+	 * 2 inode references for different parents). The next log attempt of
+	 * old_dir will happen at btrfs_log_all_parents(), called through
+	 * btrfs_log_inode_parent() below, because we have previously set
+	 * inode->last_unlink_trans to the current transaction ID, either here or
+	 * at btrfs_record_unlink_dir() in case the inode is a directory.
 	 */
 	if (old_dir)
 		old_dir->logged_trans = 0;
-- 
cgit v1.2.3-59-g8ed1b


From ccae4a19c9140a34a0c5f0658812496dd8bbdeaf Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 25 Oct 2021 17:31:54 +0100
Subject: btrfs: remove no longer needed logic for replaying directory deletes

Now that we log only dir index keys when logging a directory, we no longer
need to deal with dir item keys in the log replay code for replaying
directory deletes. This is also true for the case when we replay a log
tree created by a kernel that still logs dir items.

So remove the remaining code of the replay of directory deletes algorithm
that deals with dir item keys.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c             | 158 ++++++++++++++++++----------------------
 include/uapi/linux/btrfs_tree.h |   4 +-
 2 files changed, 72 insertions(+), 90 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 06defcd559a0..3da8452f682c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2203,7 +2203,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
  */
 static noinline int find_dir_range(struct btrfs_root *root,
 				   struct btrfs_path *path,
-				   u64 dirid, int key_type,
+				   u64 dirid,
 				   u64 *start_ret, u64 *end_ret)
 {
 	struct btrfs_key key;
@@ -2216,7 +2216,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
 		return 1;
 
 	key.objectid = dirid;
-	key.type = key_type;
+	key.type = BTRFS_DIR_LOG_INDEX_KEY;
 	key.offset = *start_ret;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2230,7 +2230,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
 	if (ret != 0)
 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 
-	if (key.type != key_type || key.objectid != dirid) {
+	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
 		ret = 1;
 		goto next;
 	}
@@ -2257,7 +2257,7 @@ next:
 
 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 
-	if (key.type != key_type || key.objectid != dirid) {
+	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
 		ret = 1;
 		goto out;
 	}
@@ -2288,95 +2288,82 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
 	int ret;
 	struct extent_buffer *eb;
 	int slot;
-	u32 item_size;
 	struct btrfs_dir_item *di;
-	struct btrfs_dir_item *log_di;
 	int name_len;
-	unsigned long ptr;
-	unsigned long ptr_end;
 	char *name;
-	struct inode *inode;
+	struct inode *inode = NULL;
 	struct btrfs_key location;
 
-again:
+	/*
+	 * Currently we only log dir index keys. Even if we replay a log created
+	 * by an older kernel that logged both dir index and dir item keys, all
+	 * we need to do is process the dir index keys, we (and our caller) can
+	 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
+	 */
+	ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
+
 	eb = path->nodes[0];
 	slot = path->slots[0];
-	item_size = btrfs_item_size_nr(eb, slot);
-	ptr = btrfs_item_ptr_offset(eb, slot);
-	ptr_end = ptr + item_size;
-	while (ptr < ptr_end) {
-		di = (struct btrfs_dir_item *)ptr;
-		name_len = btrfs_dir_name_len(eb, di);
-		name = kmalloc(name_len, GFP_NOFS);
-		if (!name) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		read_extent_buffer(eb, name, (unsigned long)(di + 1),
-				  name_len);
-		log_di = NULL;
-		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
-			log_di = btrfs_lookup_dir_item(trans, log, log_path,
-						       dir_key->objectid,
-						       name, name_len, 0);
-		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
-			log_di = btrfs_lookup_dir_index_item(trans, log,
-						     log_path,
-						     dir_key->objectid,
-						     dir_key->offset,
-						     name, name_len, 0);
-		}
-		if (!log_di) {
-			btrfs_dir_item_key_to_cpu(eb, di, &location);
-			btrfs_release_path(path);
-			btrfs_release_path(log_path);
-			inode = read_one_inode(root, location.objectid);
-			if (!inode) {
-				kfree(name);
-				return -EIO;
-			}
+	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+	name_len = btrfs_dir_name_len(eb, di);
+	name = kmalloc(name_len, GFP_NOFS);
+	if (!name) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
-			ret = link_to_fixup_dir(trans, root,
-						path, location.objectid);
-			if (ret) {
-				kfree(name);
-				iput(inode);
-				goto out;
-			}
+	read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len);
 
-			inc_nlink(inode);
-			ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
-					BTRFS_I(inode), name, name_len);
-			if (!ret)
-				ret = btrfs_run_delayed_items(trans);
-			kfree(name);
-			iput(inode);
-			if (ret)
-				goto out;
+	if (log) {
+		struct btrfs_dir_item *log_di;
 
-			/* there might still be more names under this key
-			 * check and repeat if required
-			 */
-			ret = btrfs_search_slot(NULL, root, dir_key, path,
-						0, 0);
-			if (ret == 0)
-				goto again;
+		log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
+						     dir_key->objectid,
+						     dir_key->offset,
+						     name, name_len, 0);
+		if (IS_ERR(log_di)) {
+			ret = PTR_ERR(log_di);
+			goto out;
+		} else if (log_di) {
+			/* The dentry exists in the log, we have nothing to do. */
 			ret = 0;
 			goto out;
-		} else if (IS_ERR(log_di)) {
-			kfree(name);
-			return PTR_ERR(log_di);
 		}
-		btrfs_release_path(log_path);
-		kfree(name);
+	}
 
-		ptr = (unsigned long)(di + 1);
-		ptr += name_len;
+	btrfs_dir_item_key_to_cpu(eb, di, &location);
+	btrfs_release_path(path);
+	btrfs_release_path(log_path);
+	inode = read_one_inode(root, location.objectid);
+	if (!inode) {
+		ret = -EIO;
+		goto out;
 	}
-	ret = 0;
+
+	ret = link_to_fixup_dir(trans, root, path, location.objectid);
+	if (ret)
+		goto out;
+
+	inc_nlink(inode);
+	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name,
+				 name_len);
+	if (ret)
+		goto out;
+
+	ret = btrfs_run_delayed_items(trans);
+	if (ret)
+		goto out;
+
+	/*
+	 * Unlike dir item keys, dir index keys can only have one name (entry) in
+	 * them, as there are no key collisions since each key has a unique offset
+	 * (an index number), so we're done.
+	 */
 out:
 	btrfs_release_path(path);
 	btrfs_release_path(log_path);
+	kfree(name);
+	iput(inode);
 	return ret;
 }
 
@@ -2496,7 +2483,6 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 {
 	u64 range_start;
 	u64 range_end;
-	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
 	int ret = 0;
 	struct btrfs_key dir_key;
 	struct btrfs_key found_key;
@@ -2504,7 +2490,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 	struct inode *dir;
 
 	dir_key.objectid = dirid;
-	dir_key.type = BTRFS_DIR_ITEM_KEY;
+	dir_key.type = BTRFS_DIR_INDEX_KEY;
 	log_path = btrfs_alloc_path();
 	if (!log_path)
 		return -ENOMEM;
@@ -2518,14 +2504,14 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 		btrfs_free_path(log_path);
 		return 0;
 	}
-again:
+
 	range_start = 0;
 	range_end = 0;
 	while (1) {
 		if (del_all)
 			range_end = (u64)-1;
 		else {
-			ret = find_dir_range(log, path, dirid, key_type,
+			ret = find_dir_range(log, path, dirid,
 					     &range_start, &range_end);
 			if (ret < 0)
 				goto out;
@@ -2552,8 +2538,10 @@ again:
 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 					      path->slots[0]);
 			if (found_key.objectid != dirid ||
-			    found_key.type != dir_key.type)
-				goto next_type;
+			    found_key.type != dir_key.type) {
+				ret = 0;
+				goto out;
+			}
 
 			if (found_key.offset > range_end)
 				break;
@@ -2572,15 +2560,7 @@ again:
 			break;
 		range_start = range_end + 1;
 	}
-
-next_type:
 	ret = 0;
-	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
-		key_type = BTRFS_DIR_LOG_INDEX_KEY;
-		dir_key.type = BTRFS_DIR_INDEX_KEY;
-		btrfs_release_path(path);
-		goto again;
-	}
 out:
 	btrfs_release_path(path);
 	btrfs_free_path(log_path);
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index e1c4c732aaba..5416f1f1a77a 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -146,7 +146,9 @@
 
 /*
  * dir items are the name -> inode pointers in a directory.  There is one
- * for every name in a directory.
+ * for every name in a directory.  BTRFS_DIR_LOG_ITEM_KEY is no longer used
+ * but it's still defined here for documentation purposes and to help avoid
+ * having its numerical value reused in the future.
  */
 #define BTRFS_DIR_LOG_ITEM_KEY  60
 #define BTRFS_DIR_LOG_INDEX_KEY 72
-- 
cgit v1.2.3-59-g8ed1b


From 227f3cd0d5a157c30480e2cd95f4271be1ace7bc Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 21 Oct 2021 14:58:31 -0400
Subject: btrfs: use btrfs_item_size_nr/btrfs_item_offset_nr everywhere

We have this pattern in a lot of places

	item = btrfs_item_nr(slot);
	btrfs_item_size(leaf, item);

when we could simply use

	btrfs_item_size_nr(leaf, slot);

Fix all callers of btrfs_item_size() and btrfs_item_offset() to use the
_nr variation of the helpers.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c                   |  4 +---
 fs/btrfs/ctree.c                     | 21 ++++++---------------
 fs/btrfs/dir-item.c                  |  6 ++----
 fs/btrfs/inode-item.c                |  4 +---
 fs/btrfs/print-tree.c                |  4 +---
 fs/btrfs/send.c                      |  8 ++------
 fs/btrfs/tests/extent-buffer-tests.c | 17 +++++------------
 fs/btrfs/xattr.c                     |  4 +---
 8 files changed, 19 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f735b8798ba1..8066b524916c 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2058,7 +2058,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
 	u64 parent = 0;
 	int found = 0;
 	struct extent_buffer *eb;
-	struct btrfs_item *item;
 	struct btrfs_inode_ref *iref;
 	struct btrfs_key found_key;
 
@@ -2084,10 +2083,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
 		}
 		btrfs_release_path(path);
 
-		item = btrfs_item_nr(slot);
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
 
-		for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
+		for (cur = 0; cur < btrfs_item_size_nr(eb, slot); cur += len) {
 			name_len = btrfs_inode_ref_name_len(eb, iref);
 			/* path must be released before calling iterate()! */
 			btrfs_debug(fs_root->fs_info,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f704339c6b86..45b1d30cb0b6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2616,19 +2616,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
  */
 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
 {
-	struct btrfs_item *start_item;
-	struct btrfs_item *end_item;
 	int data_len;
 	int nritems = btrfs_header_nritems(l);
 	int end = min(nritems, start + nr) - 1;
 
 	if (!nr)
 		return 0;
-	start_item = btrfs_item_nr(start);
-	end_item = btrfs_item_nr(end);
-	data_len = btrfs_item_offset(l, start_item) +
-		   btrfs_item_size(l, start_item);
-	data_len = data_len - btrfs_item_offset(l, end_item);
+	data_len = btrfs_item_offset_nr(l, start) + btrfs_item_size_nr(l, start);
+	data_len = data_len - btrfs_item_offset_nr(l, end);
 	data_len += sizeof(struct btrfs_item) * nr;
 	WARN_ON(data_len < 0);
 	return data_len;
@@ -2692,8 +2687,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
 	slot = path->slots[1];
 	i = left_nritems - 1;
 	while (i >= nr) {
-		item = btrfs_item_nr(i);
-
 		if (!empty && push_items > 0) {
 			if (path->slots[0] > i)
 				break;
@@ -2708,7 +2701,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
 		if (path->slots[0] == i)
 			push_space += data_size;
 
-		this_item_size = btrfs_item_size(left, item);
+		this_item_size = btrfs_item_size_nr(left, i);
 		if (this_item_size + sizeof(*item) + push_space > free_space)
 			break;
 
@@ -2919,8 +2912,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
 		nr = min(right_nritems - 1, max_slot);
 
 	for (i = 0; i < nr; i++) {
-		item = btrfs_item_nr(i);
-
 		if (!empty && push_items > 0) {
 			if (path->slots[0] < i)
 				break;
@@ -2935,7 +2926,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
 		if (path->slots[0] == i)
 			push_space += data_size;
 
-		this_item_size = btrfs_item_size(right, item);
+		this_item_size = btrfs_item_size_nr(right, i);
 		if (this_item_size + sizeof(*item) + push_space > free_space)
 			break;
 
@@ -3502,8 +3493,8 @@ static noinline int split_item(struct btrfs_path *path,
 	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
 
 	item = btrfs_item_nr(path->slots[0]);
-	orig_offset = btrfs_item_offset(leaf, item);
-	item_size = btrfs_item_size(leaf, item);
+	orig_offset = btrfs_item_offset_nr(leaf, path->slots[0]);
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 
 	buf = kmalloc(item_size, GFP_NOFS);
 	if (!buf)
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 7721ce0c0604..7f46c42a26fa 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 	char *ptr;
-	struct btrfs_item *item;
 	struct extent_buffer *leaf;
 
 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
@@ -41,10 +40,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 		return ERR_PTR(ret);
 	WARN_ON(ret > 0);
 	leaf = path->nodes[0];
-	item = btrfs_item_nr(path->slots[0]);
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
-	BUG_ON(data_size > btrfs_item_size(leaf, item));
-	ptr += btrfs_item_size(leaf, item) - data_size;
+	ASSERT(data_size <= btrfs_item_size_nr(leaf, path->slots[0]));
+	ptr += btrfs_item_size_nr(leaf, path->slots[0]) - data_size;
 	return (struct btrfs_dir_item *)ptr;
 }
 
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 37f36ffdaf6b..65111c484d15 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -256,7 +256,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 
 	key.objectid = inode_objectid;
 	key.type = BTRFS_INODE_EXTREF_KEY;
@@ -282,9 +281,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
 		goto out;
 
 	leaf = path->nodes[0];
-	item = btrfs_item_nr(path->slots[0]);
 	ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
-	ptr += btrfs_item_size(leaf, item) - ins_len;
+	ptr += btrfs_item_size_nr(leaf, path->slots[0]) - ins_len;
 	extref = (struct btrfs_inode_extref *)ptr;
 
 	btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index aae1027bd76a..52370af39afe 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -200,7 +200,6 @@ void btrfs_print_leaf(struct extent_buffer *l)
 	struct btrfs_fs_info *fs_info;
 	int i;
 	u32 type, nr;
-	struct btrfs_item *item;
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
 	struct btrfs_inode_item *ii;
@@ -224,12 +223,11 @@ void btrfs_print_leaf(struct extent_buffer *l)
 		   btrfs_leaf_free_space(l), btrfs_header_owner(l));
 	print_eb_refs_lock(l);
 	for (i = 0 ; i < nr ; i++) {
-		item = btrfs_item_nr(i);
 		btrfs_item_key_to_cpu(l, &key, i);
 		type = key.type;
 		pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n",
 			i, key.objectid, type, key.offset,
-			btrfs_item_offset(l, item), btrfs_item_size(l, item));
+			btrfs_item_offset_nr(l, i), btrfs_item_size_nr(l, i));
 		switch (type) {
 		case BTRFS_INODE_ITEM_KEY:
 			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 040324d71118..b5d8e498006b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -898,7 +898,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 			     iterate_inode_ref_t iterate, void *ctx)
 {
 	struct extent_buffer *eb = path->nodes[0];
-	struct btrfs_item *item;
 	struct btrfs_inode_ref *iref;
 	struct btrfs_inode_extref *extref;
 	struct btrfs_path *tmp_path;
@@ -930,8 +929,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 	if (found_key->type == BTRFS_INODE_REF_KEY) {
 		ptr = (unsigned long)btrfs_item_ptr(eb, slot,
 						    struct btrfs_inode_ref);
-		item = btrfs_item_nr(slot);
-		total = btrfs_item_size(eb, item);
+		total = btrfs_item_size_nr(eb, slot);
 		elem_size = sizeof(*iref);
 	} else {
 		ptr = btrfs_item_ptr_offset(eb, slot);
@@ -1018,7 +1016,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 {
 	int ret = 0;
 	struct extent_buffer *eb;
-	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
 	struct btrfs_key di_key;
 	char *buf = NULL;
@@ -1047,11 +1044,10 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 
 	eb = path->nodes[0];
 	slot = path->slots[0];
-	item = btrfs_item_nr(slot);
 	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
 	cur = 0;
 	len = 0;
-	total = btrfs_item_size(eb, item);
+	total = btrfs_item_size_nr(eb, slot);
 
 	num = 0;
 	while (cur < total) {
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 2a95f7224e18..bbef99175564 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -15,7 +15,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	struct btrfs_path *path = NULL;
 	struct btrfs_root *root = NULL;
 	struct extent_buffer *eb;
-	struct btrfs_item *item;
 	char *value = "mary had a little lamb";
 	char *split1 = "mary had a little";
 	char *split2 = " lamb";
@@ -61,7 +60,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	key.offset = 0;
 
 	btrfs_setup_item_for_insert(root, path, &key, value_len);
-	item = btrfs_item_nr(0);
 	write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
 			    value_len);
 
@@ -90,8 +88,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	item = btrfs_item_nr(0);
-	if (btrfs_item_size(eb, item) != strlen(split1)) {
+	if (btrfs_item_size_nr(eb, 0) != strlen(split1)) {
 		test_err("invalid len in the first split");
 		ret = -EINVAL;
 		goto out;
@@ -115,8 +112,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	item = btrfs_item_nr(1);
-	if (btrfs_item_size(eb, item) != strlen(split2)) {
+	if (btrfs_item_size_nr(eb, 1) != strlen(split2)) {
 		test_err("invalid len in the second split");
 		ret = -EINVAL;
 		goto out;
@@ -147,8 +143,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	item = btrfs_item_nr(0);
-	if (btrfs_item_size(eb, item) != strlen(split3)) {
+	if (btrfs_item_size_nr(eb, 0) != strlen(split3)) {
 		test_err("invalid len in the first split");
 		ret = -EINVAL;
 		goto out;
@@ -171,8 +166,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	item = btrfs_item_nr(1);
-	if (btrfs_item_size(eb, item) != strlen(split4)) {
+	if (btrfs_item_size_nr(eb, 1) != strlen(split4)) {
 		test_err("invalid len in the second split");
 		ret = -EINVAL;
 		goto out;
@@ -195,8 +189,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	item = btrfs_item_nr(2);
-	if (btrfs_item_size(eb, item) != strlen(split2)) {
+	if (btrfs_item_size_nr(eb, 2) != strlen(split2)) {
 		test_err("invalid len in the second split");
 		ret = -EINVAL;
 		goto out;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 2837b4c8424d..0f04bb7f3ce4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -170,7 +170,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
 		const u16 old_data_len = btrfs_dir_data_len(leaf, di);
 		const u32 item_size = btrfs_item_size_nr(leaf, slot);
 		const u32 data_size = sizeof(*di) + name_len + size;
-		struct btrfs_item *item;
 		unsigned long data_ptr;
 		char *ptr;
 
@@ -196,9 +195,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
 			btrfs_extend_item(path, data_size);
 		}
 
-		item = btrfs_item_nr(slot);
 		ptr = btrfs_item_ptr(leaf, slot, char);
-		ptr += btrfs_item_size(leaf, item) - data_size;
+		ptr += btrfs_item_size_nr(leaf, slot) - data_size;
 		di = (struct btrfs_dir_item *)ptr;
 		btrfs_set_dir_data_len(leaf, di, size);
 		data_ptr = ((unsigned long)(di + 1)) + name_len;
-- 
cgit v1.2.3-59-g8ed1b


From c91666b1f61915f2b2f2358f922c9d7d44480d0a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 21 Oct 2021 14:58:32 -0400
Subject: btrfs: add btrfs_set_item_*_nr() helpers

We have the pattern of

	item = btrfs_item_nr(slot);
	btrfs_set_item_*(leaf, item);

in a bunch of places in our code.  Fix this by adding
btrfs_set_item_*_nr() helpers which will do the appropriate work, and
replace those calls with

	btrfs_set_item_*_nr(leaf, slot);

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 24 +++++++++---------------
 fs/btrfs/ctree.h | 12 ++++++++++++
 2 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 45b1d30cb0b6..9e857b74ea3f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3480,9 +3480,7 @@ static noinline int split_item(struct btrfs_path *path,
 			       unsigned long split_offset)
 {
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
-	struct btrfs_item *new_item;
-	int slot;
+	int orig_slot, slot;
 	char *buf;
 	u32 nritems;
 	u32 item_size;
@@ -3492,7 +3490,7 @@ static noinline int split_item(struct btrfs_path *path,
 	leaf = path->nodes[0];
 	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
 
-	item = btrfs_item_nr(path->slots[0]);
+	orig_slot = path->slots[0];
 	orig_offset = btrfs_item_offset_nr(leaf, path->slots[0]);
 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 
@@ -3515,14 +3513,12 @@ static noinline int split_item(struct btrfs_path *path,
 	btrfs_cpu_key_to_disk(&disk_key, new_key);
 	btrfs_set_item_key(leaf, &disk_key, slot);
 
-	new_item = btrfs_item_nr(slot);
-
-	btrfs_set_item_offset(leaf, new_item, orig_offset);
-	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
+	btrfs_set_item_offset_nr(leaf, slot, orig_offset);
+	btrfs_set_item_size_nr(leaf, slot, item_size - split_offset);
 
-	btrfs_set_item_offset(leaf, item,
-			      orig_offset + item_size - split_offset);
-	btrfs_set_item_size(leaf, item, split_offset);
+	btrfs_set_item_offset_nr(leaf, orig_slot,
+				 orig_offset + item_size - split_offset);
+	btrfs_set_item_size_nr(leaf, orig_slot, split_offset);
 
 	btrfs_set_header_nritems(leaf, nritems + 1);
 
@@ -3662,8 +3658,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
 			fixup_low_keys(path, &disk_key, 1);
 	}
 
-	item = btrfs_item_nr(slot);
-	btrfs_set_item_size(leaf, item, new_size);
+	btrfs_set_item_size_nr(leaf, slot, new_size);
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3727,8 +3722,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
 
 	data_end = old_data;
 	old_size = btrfs_item_size_nr(leaf, slot);
-	item = btrfs_item_nr(slot);
-	btrfs_set_item_size(leaf, item, old_size + data_size);
+	btrfs_set_item_size_nr(leaf, slot, old_size + data_size);
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (btrfs_leaf_free_space(leaf) < 0) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5fe5eccb3c87..cb0871f4cd35 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1997,6 +1997,18 @@ static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr)
 	return btrfs_item_size(eb, btrfs_item_nr(nr));
 }
 
+static inline void btrfs_set_item_size_nr(struct extent_buffer *eb, int nr,
+					  u32 val)
+{
+	btrfs_set_item_size(eb, btrfs_item_nr(nr), val);
+}
+
+static inline void btrfs_set_item_offset_nr(struct extent_buffer *eb, int nr,
+					    u32 val)
+{
+	btrfs_set_item_offset(eb, btrfs_item_nr(nr), val);
+}
+
 static inline void btrfs_item_key(const struct extent_buffer *eb,
 			   struct btrfs_disk_key *disk_key, int nr)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 437bd07e6c52512f6faf9d3355f34deaa6537ff5 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 21 Oct 2021 14:58:33 -0400
Subject: btrfs: make btrfs_file_extent_inline_item_len take a slot

Instead of getting the btrfs_item for this, simply pass in the slot of
the item and then use the btrfs_item_size_nr() helper inside of
btrfs_file_extent_inline_item_len().

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h | 4 ++--
 fs/btrfs/inode.c | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index cb0871f4cd35..e0cb78fa5810 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2534,9 +2534,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
  */
 static inline u32 btrfs_file_extent_inline_item_len(
 						const struct extent_buffer *eb,
-						struct btrfs_item *e)
+						int nr)
 {
-	return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+	return btrfs_item_size_nr(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
 }
 
 /* btrfs_qgroup_status_item */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 549853622b64..e87c6cccf045 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6996,8 +6996,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 	WARN_ON(pg_offset != 0);
 	compress_type = btrfs_file_extent_compression(leaf, item);
 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
-	inline_size = btrfs_file_extent_inline_item_len(leaf,
-					btrfs_item_nr(path->slots[0]));
+	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
 	tmp = kmalloc(inline_size, GFP_NOFS);
 	if (!tmp)
 		return -ENOMEM;
-- 
cgit v1.2.3-59-g8ed1b


From 747942073608167af3cdbaace2556b13c0cff9f0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 21 Oct 2021 14:58:34 -0400
Subject: btrfs: introduce item_nr token variant helpers

The last remaining places where we have the pattern of

	item = btrfs_item_nr(slot)
	<do something with the item>

are the token helpers.  Handle this by introducing token helpers that
will do the btrfs_item_nr() work inside of the helper itself, and then
convert all users of the btrfs_item token helpers to the new _nr()
variants.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 65 ++++++++++++++++++++++----------------------------------
 fs/btrfs/ctree.h | 32 ++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9e857b74ea3f..dfcc33443f7a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2670,7 +2670,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
 	u32 i;
 	int push_space = 0;
 	int push_items = 0;
-	struct btrfs_item *item;
 	u32 nr;
 	u32 right_nritems;
 	u32 data_end;
@@ -2702,11 +2701,12 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
 			push_space += data_size;
 
 		this_item_size = btrfs_item_size_nr(left, i);
-		if (this_item_size + sizeof(*item) + push_space > free_space)
+		if (this_item_size + sizeof(struct btrfs_item) +
+		    push_space > free_space)
 			break;
 
 		push_items++;
-		push_space += this_item_size + sizeof(*item);
+		push_space += this_item_size + sizeof(struct btrfs_item);
 		if (i == 0)
 			break;
 		i--;
@@ -2751,9 +2751,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
 	for (i = 0; i < right_nritems; i++) {
-		item = btrfs_item_nr(i);
-		push_space -= btrfs_token_item_size(&token, item);
-		btrfs_set_token_item_offset(&token, item, push_space);
+		push_space -= btrfs_token_item_size_nr(&token, i);
+		btrfs_set_token_item_offset_nr(&token, i, push_space);
 	}
 
 	left_nritems -= push_items;
@@ -2898,7 +2897,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
 	int i;
 	int push_space = 0;
 	int push_items = 0;
-	struct btrfs_item *item;
 	u32 old_left_nritems;
 	u32 nr;
 	int ret = 0;
@@ -2927,11 +2925,12 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
 			push_space += data_size;
 
 		this_item_size = btrfs_item_size_nr(right, i);
-		if (this_item_size + sizeof(*item) + push_space > free_space)
+		if (this_item_size + sizeof(struct btrfs_item) + push_space >
+		    free_space)
 			break;
 
 		push_items++;
-		push_space += this_item_size + sizeof(*item);
+		push_space += this_item_size + sizeof(struct btrfs_item);
 	}
 
 	if (push_items == 0) {
@@ -2962,10 +2961,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
 		u32 ioff;
 
-		item = btrfs_item_nr(i);
-
-		ioff = btrfs_token_item_offset(&token, item);
-		btrfs_set_token_item_offset(&token, item,
+		ioff = btrfs_token_item_offset_nr(&token, i);
+		btrfs_set_token_item_offset_nr(&token, i,
 		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
 	}
 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -2994,10 +2991,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
 	for (i = 0; i < right_nritems; i++) {
-		item = btrfs_item_nr(i);
-
-		push_space = push_space - btrfs_token_item_size(&token, item);
-		btrfs_set_token_item_offset(&token, item, push_space);
+		push_space = push_space - btrfs_token_item_size_nr(&token, i);
+		btrfs_set_token_item_offset_nr(&token, i, push_space);
 	}
 
 	btrfs_mark_buffer_dirty(left);
@@ -3140,11 +3135,10 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
 
 	btrfs_init_map_token(&token, right);
 	for (i = 0; i < nritems; i++) {
-		struct btrfs_item *item = btrfs_item_nr(i);
 		u32 ioff;
 
-		ioff = btrfs_token_item_offset(&token, item);
-		btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
+		ioff = btrfs_token_item_offset_nr(&token, i);
+		btrfs_set_token_item_offset_nr(&token, i, ioff + rt_data_off);
 	}
 
 	btrfs_set_header_nritems(l, mid);
@@ -3579,7 +3573,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
 {
 	int slot;
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	u32 nritems;
 	unsigned int data_end;
 	unsigned int old_data_start;
@@ -3612,10 +3605,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
 	btrfs_init_map_token(&token, leaf);
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
-		item = btrfs_item_nr(i);
 
-		ioff = btrfs_token_item_offset(&token, item);
-		btrfs_set_token_item_offset(&token, item, ioff + size_diff);
+		ioff = btrfs_token_item_offset_nr(&token, i);
+		btrfs_set_token_item_offset_nr(&token, i, ioff + size_diff);
 	}
 
 	/* shift the data */
@@ -3674,7 +3666,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
 {
 	int slot;
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	u32 nritems;
 	unsigned int data_end;
 	unsigned int old_data;
@@ -3709,10 +3700,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
 	btrfs_init_map_token(&token, leaf);
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
-		item = btrfs_item_nr(i);
 
-		ioff = btrfs_token_item_offset(&token, item);
-		btrfs_set_token_item_offset(&token, item, ioff - data_size);
+		ioff = btrfs_token_item_offset_nr(&token, i);
+		btrfs_set_token_item_offset_nr(&token, i, ioff - data_size);
 	}
 
 	/* shift the data */
@@ -3744,7 +3734,6 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
 				   const struct btrfs_item_batch *batch)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_item *item;
 	int i;
 	u32 nritems;
 	unsigned int data_end;
@@ -3797,10 +3786,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
 		for (i = slot; i < nritems; i++) {
 			u32 ioff;
 
-			item = btrfs_item_nr(i);
-			ioff = btrfs_token_item_offset(&token, item);
-			btrfs_set_token_item_offset(&token, item,
-						    ioff - batch->total_data_size);
+			ioff = btrfs_token_item_offset_nr(&token, i);
+			btrfs_set_token_item_offset_nr(&token, i,
+						       ioff - batch->total_data_size);
 		}
 		/* shift the items */
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
@@ -3819,10 +3807,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
 	for (i = 0; i < batch->nr; i++) {
 		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
 		btrfs_set_item_key(leaf, &disk_key, slot + i);
-		item = btrfs_item_nr(slot + i);
 		data_end -= batch->data_sizes[i];
-		btrfs_set_token_item_offset(&token, item, data_end);
-		btrfs_set_token_item_size(&token, item, batch->data_sizes[i]);
+		btrfs_set_token_item_offset_nr(&token, slot + i, data_end);
+		btrfs_set_token_item_size_nr(&token, slot + i, batch->data_sizes[i]);
 	}
 
 	btrfs_set_header_nritems(leaf, nritems + batch->nr);
@@ -4030,7 +4017,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	u32 last_off;
 	u32 dsize = 0;
 	int ret = 0;
@@ -4059,9 +4045,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		for (i = slot + nr; i < nritems; i++) {
 			u32 ioff;
 
-			item = btrfs_item_nr(i);
-			ioff = btrfs_token_item_offset(&token, item);
-			btrfs_set_token_item_offset(&token, item, ioff + dsize);
+			ioff = btrfs_token_item_offset_nr(&token, i);
+			btrfs_set_token_item_offset_nr(&token, i, ioff + dsize);
 		}
 
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e0cb78fa5810..53fc20482c8d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2009,6 +2009,38 @@ static inline void btrfs_set_item_offset_nr(struct extent_buffer *eb, int nr,
 	btrfs_set_item_offset(eb, btrfs_item_nr(nr), val);
 }
 
+static inline u32 btrfs_token_item_offset_nr(struct btrfs_map_token *token,
+					     int slot)
+{
+	struct btrfs_item *item = btrfs_item_nr(slot);
+
+	return btrfs_token_item_offset(token, item);
+}
+
+static inline u32 btrfs_token_item_size_nr(struct btrfs_map_token *token,
+					   int slot)
+{
+	struct btrfs_item *item = btrfs_item_nr(slot);
+
+	return btrfs_token_item_size(token, item);
+}
+
+static inline void btrfs_set_token_item_offset_nr(struct btrfs_map_token *token,
+						  int slot, u32 val)
+{
+	struct btrfs_item *item = btrfs_item_nr(slot);
+
+	btrfs_set_token_item_offset(token, item, val);
+}
+
+static inline void btrfs_set_token_item_size_nr(struct btrfs_map_token *token,
+						int slot, u32 val)
+{
+	struct btrfs_item *item = btrfs_item_nr(slot);
+
+	btrfs_set_token_item_size(token, item, val);
+}
+
 static inline void btrfs_item_key(const struct extent_buffer *eb,
 			   struct btrfs_disk_key *disk_key, int nr)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 3212fa14e772913b69e85e080678472f8f1aecde Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 21 Oct 2021 14:58:35 -0400
Subject: btrfs: drop the _nr from the item helpers

Now that all call sites are using the slot number to modify item values,
rename the SETGET helpers to raw_item_*(), and then rework the _nr()
helpers to be the btrfs_item_*() and btrfs_set_item_*() helpers, and then
rename all of the callers to the new helpers.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c                   | 14 +++---
 fs/btrfs/ctree.c                     | 86 ++++++++++++++++-----------------
 fs/btrfs/ctree.h                     | 93 +++++++++++++-----------------------
 fs/btrfs/dev-replace.c               |  4 +-
 fs/btrfs/dir-item.c                  | 10 ++--
 fs/btrfs/extent-tree.c               | 14 +++---
 fs/btrfs/file-item.c                 | 24 +++++-----
 fs/btrfs/inode-item.c                | 12 ++---
 fs/btrfs/ioctl.c                     |  6 +--
 fs/btrfs/print-tree.c                |  6 +--
 fs/btrfs/props.c                     |  2 +-
 fs/btrfs/ref-verify.c                |  2 +-
 fs/btrfs/reflink.c                   |  2 +-
 fs/btrfs/relocation.c                |  2 +-
 fs/btrfs/root-tree.c                 |  4 +-
 fs/btrfs/scrub.c                     |  2 +-
 fs/btrfs/send.c                      | 14 +++---
 fs/btrfs/tests/extent-buffer-tests.c | 10 ++--
 fs/btrfs/tree-checker.c              | 48 +++++++++----------
 fs/btrfs/tree-log.c                  | 30 ++++++------
 fs/btrfs/uuid-tree.c                 | 10 ++--
 fs/btrfs/verity.c                    |  2 +-
 fs/btrfs/volumes.c                   |  6 +--
 fs/btrfs/xattr.c                     |  6 +--
 24 files changed, 191 insertions(+), 218 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8066b524916c..c4e0560d4c11 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -950,7 +950,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
 	leaf = path->nodes[0];
 	slot = path->slots[0];
 
-	item_size = btrfs_item_size_nr(leaf, slot);
+	item_size = btrfs_item_size(leaf, slot);
 	BUG_ON(item_size < sizeof(*ei));
 
 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
@@ -1779,7 +1779,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	}
 
 	eb = path->nodes[0];
-	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+	item_size = btrfs_item_size(eb, path->slots[0]);
 	BUG_ON(item_size < sizeof(*ei));
 
 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
@@ -2085,7 +2085,7 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
 
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
 
-		for (cur = 0; cur < btrfs_item_size_nr(eb, slot); cur += len) {
+		for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) {
 			name_len = btrfs_inode_ref_name_len(eb, iref);
 			/* path must be released before calling iterate()! */
 			btrfs_debug(fs_root->fs_info,
@@ -2141,7 +2141,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 		}
 		btrfs_release_path(path);
 
-		item_size = btrfs_item_size_nr(eb, slot);
+		item_size = btrfs_item_size(eb, slot);
 		ptr = btrfs_item_ptr_offset(eb, slot);
 		cur_offset = 0;
 
@@ -2362,7 +2362,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
 	iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
 						    path->slots[0]);
 	iter->end_ptr = (u32)(iter->item_ptr +
-			btrfs_item_size_nr(path->nodes[0], path->slots[0]));
+			btrfs_item_size(path->nodes[0], path->slots[0]));
 	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 			    struct btrfs_extent_item);
 
@@ -2402,7 +2402,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
 		iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
 							   path->slots[0]);
 		iter->item_ptr = iter->cur_ptr;
-		iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr(
+		iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size(
 				      path->nodes[0], path->slots[0]));
 	}
 
@@ -2467,7 +2467,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
 	iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
 					path->slots[0]);
 	iter->cur_ptr = iter->item_ptr;
-	iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0],
+	iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0],
 						path->slots[0]);
 	return 0;
 }
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dfcc33443f7a..353af95c40c1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2622,8 +2622,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
 
 	if (!nr)
 		return 0;
-	data_len = btrfs_item_offset_nr(l, start) + btrfs_item_size_nr(l, start);
-	data_len = data_len - btrfs_item_offset_nr(l, end);
+	data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
+	data_len = data_len - btrfs_item_offset(l, end);
 	data_len += sizeof(struct btrfs_item) * nr;
 	WARN_ON(data_len < 0);
 	return data_len;
@@ -2700,7 +2700,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
 		if (path->slots[0] == i)
 			push_space += data_size;
 
-		this_item_size = btrfs_item_size_nr(left, i);
+		this_item_size = btrfs_item_size(left, i);
 		if (this_item_size + sizeof(struct btrfs_item) +
 		    push_space > free_space)
 			break;
@@ -2751,8 +2751,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
 	for (i = 0; i < right_nritems; i++) {
-		push_space -= btrfs_token_item_size_nr(&token, i);
-		btrfs_set_token_item_offset_nr(&token, i, push_space);
+		push_space -= btrfs_token_item_size(&token, i);
+		btrfs_set_token_item_offset(&token, i, push_space);
 	}
 
 	left_nritems -= push_items;
@@ -2924,7 +2924,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
 		if (path->slots[0] == i)
 			push_space += data_size;
 
-		this_item_size = btrfs_item_size_nr(right, i);
+		this_item_size = btrfs_item_size(right, i);
 		if (this_item_size + sizeof(struct btrfs_item) + push_space >
 		    free_space)
 			break;
@@ -2946,23 +2946,23 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
 			   push_items * sizeof(struct btrfs_item));
 
 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
-		     btrfs_item_offset_nr(right, push_items - 1);
+		     btrfs_item_offset(right, push_items - 1);
 
 	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
 		     leaf_data_end(left) - push_space,
 		     BTRFS_LEAF_DATA_OFFSET +
-		     btrfs_item_offset_nr(right, push_items - 1),
+		     btrfs_item_offset(right, push_items - 1),
 		     push_space);
 	old_left_nritems = btrfs_header_nritems(left);
 	BUG_ON(old_left_nritems <= 0);
 
 	btrfs_init_map_token(&token, left);
-	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
+	old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
 		u32 ioff;
 
-		ioff = btrfs_token_item_offset_nr(&token, i);
-		btrfs_set_token_item_offset_nr(&token, i,
+		ioff = btrfs_token_item_offset(&token, i);
+		btrfs_set_token_item_offset(&token, i,
 		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
 	}
 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -2973,7 +2973,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
 		       right_nritems);
 
 	if (push_items < right_nritems) {
-		push_space = btrfs_item_offset_nr(right, push_items - 1) -
+		push_space = btrfs_item_offset(right, push_items - 1) -
 						  leaf_data_end(right);
 		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
 				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
@@ -2991,8 +2991,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
 	for (i = 0; i < right_nritems; i++) {
-		push_space = push_space - btrfs_token_item_size_nr(&token, i);
-		btrfs_set_token_item_offset_nr(&token, i, push_space);
+		push_space = push_space - btrfs_token_item_size(&token, i);
+		btrfs_set_token_item_offset(&token, i, push_space);
 	}
 
 	btrfs_mark_buffer_dirty(left);
@@ -3137,8 +3137,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
 	for (i = 0; i < nritems; i++) {
 		u32 ioff;
 
-		ioff = btrfs_token_item_offset_nr(&token, i);
-		btrfs_set_token_item_offset_nr(&token, i, ioff + rt_data_off);
+		ioff = btrfs_token_item_offset(&token, i);
+		btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
 	}
 
 	btrfs_set_header_nritems(l, mid);
@@ -3254,7 +3254,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 
 	l = path->nodes[0];
 	slot = path->slots[0];
-	if (extend && data_size + btrfs_item_size_nr(l, slot) +
+	if (extend && data_size + btrfs_item_size(l, slot) +
 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
 		return -EOVERFLOW;
 
@@ -3423,7 +3423,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	if (btrfs_leaf_free_space(leaf) >= ins_len)
 		return 0;
 
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	item_size = btrfs_item_size(leaf, path->slots[0]);
 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
@@ -3443,7 +3443,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	ret = -EAGAIN;
 	leaf = path->nodes[0];
 	/* if our item isn't there, return now */
-	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+	if (item_size != btrfs_item_size(leaf, path->slots[0]))
 		goto err;
 
 	/* the leaf has  changed, it now has room.  return now */
@@ -3485,8 +3485,8 @@ static noinline int split_item(struct btrfs_path *path,
 	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
 
 	orig_slot = path->slots[0];
-	orig_offset = btrfs_item_offset_nr(leaf, path->slots[0]);
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	orig_offset = btrfs_item_offset(leaf, path->slots[0]);
+	item_size = btrfs_item_size(leaf, path->slots[0]);
 
 	buf = kmalloc(item_size, GFP_NOFS);
 	if (!buf)
@@ -3507,12 +3507,12 @@ static noinline int split_item(struct btrfs_path *path,
 	btrfs_cpu_key_to_disk(&disk_key, new_key);
 	btrfs_set_item_key(leaf, &disk_key, slot);
 
-	btrfs_set_item_offset_nr(leaf, slot, orig_offset);
-	btrfs_set_item_size_nr(leaf, slot, item_size - split_offset);
+	btrfs_set_item_offset(leaf, slot, orig_offset);
+	btrfs_set_item_size(leaf, slot, item_size - split_offset);
 
-	btrfs_set_item_offset_nr(leaf, orig_slot,
+	btrfs_set_item_offset(leaf, orig_slot,
 				 orig_offset + item_size - split_offset);
-	btrfs_set_item_size_nr(leaf, orig_slot, split_offset);
+	btrfs_set_item_size(leaf, orig_slot, split_offset);
 
 	btrfs_set_header_nritems(leaf, nritems + 1);
 
@@ -3584,14 +3584,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
 	leaf = path->nodes[0];
 	slot = path->slots[0];
 
-	old_size = btrfs_item_size_nr(leaf, slot);
+	old_size = btrfs_item_size(leaf, slot);
 	if (old_size == new_size)
 		return;
 
 	nritems = btrfs_header_nritems(leaf);
 	data_end = leaf_data_end(leaf);
 
-	old_data_start = btrfs_item_offset_nr(leaf, slot);
+	old_data_start = btrfs_item_offset(leaf, slot);
 
 	size_diff = old_size - new_size;
 
@@ -3606,8 +3606,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
 
-		ioff = btrfs_token_item_offset_nr(&token, i);
-		btrfs_set_token_item_offset_nr(&token, i, ioff + size_diff);
+		ioff = btrfs_token_item_offset(&token, i);
+		btrfs_set_token_item_offset(&token, i, ioff + size_diff);
 	}
 
 	/* shift the data */
@@ -3650,7 +3650,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
 			fixup_low_keys(path, &disk_key, 1);
 	}
 
-	btrfs_set_item_size_nr(leaf, slot, new_size);
+	btrfs_set_item_size(leaf, slot, new_size);
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3701,8 +3701,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
 
-		ioff = btrfs_token_item_offset_nr(&token, i);
-		btrfs_set_token_item_offset_nr(&token, i, ioff - data_size);
+		ioff = btrfs_token_item_offset(&token, i);
+		btrfs_set_token_item_offset(&token, i, ioff - data_size);
 	}
 
 	/* shift the data */
@@ -3711,8 +3711,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
 		      data_end, old_data - data_end);
 
 	data_end = old_data;
-	old_size = btrfs_item_size_nr(leaf, slot);
-	btrfs_set_item_size_nr(leaf, slot, old_size + data_size);
+	old_size = btrfs_item_size(leaf, slot);
+	btrfs_set_item_size(leaf, slot, old_size + data_size);
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3786,8 +3786,8 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
 		for (i = slot; i < nritems; i++) {
 			u32 ioff;
 
-			ioff = btrfs_token_item_offset_nr(&token, i);
-			btrfs_set_token_item_offset_nr(&token, i,
+			ioff = btrfs_token_item_offset(&token, i);
+			btrfs_set_token_item_offset(&token, i,
 						       ioff - batch->total_data_size);
 		}
 		/* shift the items */
@@ -3808,8 +3808,8 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
 		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
 		btrfs_set_item_key(leaf, &disk_key, slot + i);
 		data_end -= batch->data_sizes[i];
-		btrfs_set_token_item_offset_nr(&token, slot + i, data_end);
-		btrfs_set_token_item_size_nr(&token, slot + i, batch->data_sizes[i]);
+		btrfs_set_token_item_offset(&token, slot + i, data_end);
+		btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
 	}
 
 	btrfs_set_header_nritems(leaf, nritems + batch->nr);
@@ -3916,7 +3916,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
 	u32 item_size;
 
 	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	item_size = btrfs_item_size(leaf, path->slots[0]);
 	ret = setup_leaf_for_split(trans, root, path,
 				   item_size + sizeof(struct btrfs_item));
 	if (ret)
@@ -4025,10 +4025,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	u32 nritems;
 
 	leaf = path->nodes[0];
-	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
+	last_off = btrfs_item_offset(leaf, slot + nr - 1);
 
 	for (i = 0; i < nr; i++)
-		dsize += btrfs_item_size_nr(leaf, slot + i);
+		dsize += btrfs_item_size(leaf, slot + i);
 
 	nritems = btrfs_header_nritems(leaf);
 
@@ -4045,8 +4045,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		for (i = slot + nr; i < nritems; i++) {
 			u32 ioff;
 
-			ioff = btrfs_token_item_offset_nr(&token, i);
-			btrfs_set_token_item_offset_nr(&token, i, ioff + dsize);
+			ioff = btrfs_token_item_offset(&token, i);
+			btrfs_set_token_item_offset(&token, i, ioff + dsize);
 		}
 
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 53fc20482c8d..8f4807b7e1d0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1960,8 +1960,8 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb,
 }
 
 /* struct btrfs_item */
-BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
-BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
 
@@ -1979,7 +1979,7 @@ static inline struct btrfs_item *btrfs_item_nr(int nr)
 static inline u32 btrfs_item_end(const struct extent_buffer *eb,
 				 struct btrfs_item *item)
 {
-	return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
+	return btrfs_raw_item_offset(eb, item) + btrfs_raw_item_size(eb, item);
 }
 
 static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
@@ -1987,59 +1987,32 @@ static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
 	return btrfs_item_end(eb, btrfs_item_nr(nr));
 }
 
-static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr)
-{
-	return btrfs_item_offset(eb, btrfs_item_nr(nr));
-}
-
-static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr)
-{
-	return btrfs_item_size(eb, btrfs_item_nr(nr));
-}
-
-static inline void btrfs_set_item_size_nr(struct extent_buffer *eb, int nr,
-					  u32 val)
-{
-	btrfs_set_item_size(eb, btrfs_item_nr(nr), val);
-}
-
-static inline void btrfs_set_item_offset_nr(struct extent_buffer *eb, int nr,
-					    u32 val)
-{
-	btrfs_set_item_offset(eb, btrfs_item_nr(nr), val);
-}
-
-static inline u32 btrfs_token_item_offset_nr(struct btrfs_map_token *token,
-					     int slot)
-{
-	struct btrfs_item *item = btrfs_item_nr(slot);
-
-	return btrfs_token_item_offset(token, item);
-}
-
-static inline u32 btrfs_token_item_size_nr(struct btrfs_map_token *token,
-					   int slot)
-{
-	struct btrfs_item *item = btrfs_item_nr(slot);
-
-	return btrfs_token_item_size(token, item);
-}
-
-static inline void btrfs_set_token_item_offset_nr(struct btrfs_map_token *token,
-						  int slot, u32 val)
-{
-	struct btrfs_item *item = btrfs_item_nr(slot);
-
-	btrfs_set_token_item_offset(token, item, val);
-}
-
-static inline void btrfs_set_token_item_size_nr(struct btrfs_map_token *token,
-						int slot, u32 val)
-{
-	struct btrfs_item *item = btrfs_item_nr(slot);
-
-	btrfs_set_token_item_size(token, item, val);
-}
+/* Slot-indexed struct btrfs_item accessors built on the raw/token raw helpers.
+ * Expands to function definitions, so invocations take no trailing ';'. */
+#define BTRFS_ITEM_SETGET_FUNCS(member)						\
+static inline u32 btrfs_item_##member(const struct extent_buffer *eb,		\
+				      int slot)					\
+{										\
+	return btrfs_raw_item_##member(eb, btrfs_item_nr(slot));		\
+}										\
+static inline void btrfs_set_item_##member(const struct extent_buffer *eb,	\
+					   int slot, u32 val)			\
+{										\
+	btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val);		\
+}										\
+static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token,	\
+					    int slot)				\
+{										\
+	return btrfs_token_raw_item_##member(token, btrfs_item_nr(slot));	\
+}										\
+static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token,	\
+						 int slot, u32 val)		\
+{										\
+	btrfs_set_token_raw_item_##member(token, btrfs_item_nr(slot), val);	\
+}
+
+BTRFS_ITEM_SETGET_FUNCS(offset)
+BTRFS_ITEM_SETGET_FUNCS(size)
 
 static inline void btrfs_item_key(const struct extent_buffer *eb,
 			   struct btrfs_disk_key *disk_key, int nr)
@@ -2507,7 +2480,7 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
 
 	if (nr == 0)
 		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
-	return btrfs_item_offset_nr(leaf, nr - 1);
+	return btrfs_item_offset(leaf, nr - 1);
 }
 
 /* struct btrfs_file_extent_item */
@@ -2568,7 +2541,7 @@ static inline u32 btrfs_file_extent_inline_item_len(
 						const struct extent_buffer *eb,
 						int nr)
 {
-	return btrfs_item_size_nr(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+	return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
 }
 
 /* btrfs_qgroup_status_item */
@@ -2660,11 +2633,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
 	((type *)(BTRFS_LEAF_DATA_OFFSET + \
-	btrfs_item_offset_nr(leaf, slot)))
+	btrfs_item_offset(leaf, slot)))
 
 #define btrfs_item_ptr_offset(leaf, slot) \
 	((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
-	btrfs_item_offset_nr(leaf, slot)))
+	btrfs_item_offset(leaf, slot)))
 
 static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
 {
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index c85a7d44da79..82769f1c17ee 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -128,7 +128,7 @@ no_valid_dev_replace_entry_found:
 	}
 	slot = path->slots[0];
 	eb = path->nodes[0];
-	item_size = btrfs_item_size_nr(eb, slot);
+	item_size = btrfs_item_size(eb, slot);
 	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
 
 	if (item_size != sizeof(struct btrfs_dev_replace_item)) {
@@ -381,7 +381,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
 	}
 
 	if (ret == 0 &&
-	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
 		/*
 		 * need to delete old one and insert a new one.
 		 * Since no attempt is made to recover any old state, if the
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 7f46c42a26fa..3b532bab0755 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -41,8 +41,8 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 	WARN_ON(ret > 0);
 	leaf = path->nodes[0];
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
-	ASSERT(data_size <= btrfs_item_size_nr(leaf, path->slots[0]));
-	ptr += btrfs_item_size_nr(leaf, path->slots[0]) - data_size;
+	ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0]));
+	ptr += btrfs_item_size(leaf, path->slots[0]) - data_size;
 	return (struct btrfs_dir_item *)ptr;
 }
 
@@ -269,7 +269,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
 	data_size = sizeof(*di) + name_len;
 	leaf = path->nodes[0];
 	slot = path->slots[0];
-	if (data_size + btrfs_item_size_nr(leaf, slot) +
+	if (data_size + btrfs_item_size(leaf, slot) +
 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
 		ret = -EOVERFLOW;
 	} else {
@@ -407,7 +407,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
 	leaf = path->nodes[0];
 	dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
 
-	total_len = btrfs_item_size_nr(leaf, path->slots[0]);
+	total_len = btrfs_item_size(leaf, path->slots[0]);
 	while (cur < total_len) {
 		this_len = sizeof(*dir_item) +
 			btrfs_dir_name_len(leaf, dir_item) +
@@ -443,7 +443,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
 		btrfs_dir_data_len(leaf, di);
-	item_len = btrfs_item_size_nr(leaf, path->slots[0]);
+	item_len = btrfs_item_size(leaf, path->slots[0]);
 	if (sub_item_len == item_len) {
 		ret = btrfs_del_item(trans, root, path);
 	} else {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 25ef6e3fd306..9dd7aaf7cd24 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -171,7 +171,7 @@ search_again:
 
 	if (ret == 0) {
 		leaf = path->nodes[0];
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		item_size = btrfs_item_size(leaf, path->slots[0]);
 		if (item_size >= sizeof(*ei)) {
 			ei = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_extent_item);
@@ -865,7 +865,7 @@ again:
 	}
 
 	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	item_size = btrfs_item_size(leaf, path->slots[0]);
 	if (unlikely(item_size < sizeof(*ei))) {
 		err = -EINVAL;
 		btrfs_print_v0_err(fs_info);
@@ -1007,7 +1007,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
 		__run_delayed_extent_op(extent_op, leaf, ei);
 
 	ptr = (unsigned long)ei + item_offset;
-	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
+	end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
 	if (ptr < end - size)
 		memmove_extent_buffer(leaf, ptr + size, ptr,
 				      end - size - ptr);
@@ -1119,7 +1119,7 @@ void update_inline_extent_backref(struct btrfs_path *path,
 	} else {
 		*last_ref = 1;
 		size =  btrfs_extent_inline_ref_size(type);
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		item_size = btrfs_item_size(leaf, path->slots[0]);
 		ptr = (unsigned long)iref;
 		end = (unsigned long)ei + item_size;
 		if (ptr + size < end)
@@ -1634,7 +1634,7 @@ again:
 	}
 
 	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	item_size = btrfs_item_size(leaf, path->slots[0]);
 
 	if (unlikely(item_size < sizeof(*ei))) {
 		err = -EINVAL;
@@ -2316,7 +2316,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
 		goto out;
 
 	ret = 1;
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	item_size = btrfs_item_size(leaf, path->slots[0]);
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 
 	/* If extent item has more than 1 inline ref then it's shared */
@@ -3068,7 +3068,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	}
 
 	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, extent_slot);
+	item_size = btrfs_item_size(leaf, extent_slot);
 	if (unlikely(item_size < sizeof(*ei))) {
 		ret = -EINVAL;
 		btrfs_print_v0_err(info);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d1cbb64a78f3..0f2e2ab34828 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -208,7 +208,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 
 		csum_offset = (bytenr - found_key.offset) >>
 				fs_info->sectorsize_bits;
-		csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
+		csums_in_item = btrfs_item_size(leaf, path->slots[0]);
 		csums_in_item /= csum_size;
 
 		if (csum_offset == csums_in_item) {
@@ -274,7 +274,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				      struct btrfs_csum_item);
 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-		itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+		itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
 
 		csum_start = key.offset;
 		csum_len = (itemsize / csum_size) * sectorsize;
@@ -291,7 +291,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
 		goto out;
 	}
 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-	itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+	itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
 
 	csum_start = key.offset;
 	csum_len = (itemsize / csum_size) * sectorsize;
@@ -534,7 +534,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 		    key.type == BTRFS_EXTENT_CSUM_KEY) {
 			offset = (start - key.offset) >> fs_info->sectorsize_bits;
 			if (offset * csum_size <
-			    btrfs_item_size_nr(leaf, path->slots[0] - 1))
+			    btrfs_item_size(leaf, path->slots[0] - 1))
 				path->slots[0]--;
 		}
 	}
@@ -559,7 +559,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 		if (key.offset > start)
 			start = key.offset;
 
-		size = btrfs_item_size_nr(leaf, path->slots[0]);
+		size = btrfs_item_size(leaf, path->slots[0]);
 		csum_end = key.offset + (size / csum_size) * fs_info->sectorsize;
 		if (csum_end <= start) {
 			path->slots[0]++;
@@ -750,7 +750,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
 	u32 blocksize_bits = fs_info->sectorsize_bits;
 
 	leaf = path->nodes[0];
-	csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+	csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
 	csum_end <<= blocksize_bits;
 	csum_end += key->offset;
 
@@ -834,7 +834,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		if (key.offset >= end_byte)
 			break;
 
-		csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+		csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
 		csum_end <<= blocksize_bits;
 		csum_end += key.offset;
 
@@ -1002,7 +1002,7 @@ again:
 		item_end = btrfs_item_ptr(leaf, path->slots[0],
 					  struct btrfs_csum_item);
 		item_end = (struct btrfs_csum_item *)((char *)item_end +
-			   btrfs_item_size_nr(leaf, path->slots[0]));
+			   btrfs_item_size(leaf, path->slots[0]));
 		goto found;
 	}
 	ret = PTR_ERR(item);
@@ -1013,7 +1013,7 @@ again:
 		u32 item_size;
 		/* we found one, but it isn't big enough yet */
 		leaf = path->nodes[0];
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		item_size = btrfs_item_size(leaf, path->slots[0]);
 		if ((item_size / csum_size) >=
 		    MAX_CSUM_ITEMS(fs_info, csum_size)) {
 			/* already at max size, make a new one */
@@ -1070,7 +1070,7 @@ again:
 	}
 
 extend_csum:
-	if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
+	if (csum_offset == btrfs_item_size(leaf, path->slots[0]) /
 	    csum_size) {
 		int extend_nr;
 		u64 tmp;
@@ -1125,7 +1125,7 @@ extend_csum:
 		diff = min(diff,
 			   MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
 
-		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
+		diff = diff - btrfs_item_size(leaf, path->slots[0]);
 		diff = min_t(u32, btrfs_leaf_free_space(leaf), diff);
 		diff /= csum_size;
 		diff *= csum_size;
@@ -1162,7 +1162,7 @@ insert:
 csum:
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item +
-				      btrfs_item_size_nr(leaf, path->slots[0]));
+				      btrfs_item_size(leaf, path->slots[0]));
 	item = (struct btrfs_csum_item *)((unsigned char *)item +
 					  csum_offset * csum_size);
 found:
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 65111c484d15..56755ce9a907 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -19,7 +19,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
 	u32 cur_offset = 0;
 	int len;
 
-	item_size = btrfs_item_size_nr(leaf, slot);
+	item_size = btrfs_item_size(leaf, slot);
 	ptr = btrfs_item_ptr_offset(leaf, slot);
 	while (cur_offset < item_size) {
 		ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
@@ -45,7 +45,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
 	u32 cur_offset = 0;
 	int ref_name_len;
 
-	item_size = btrfs_item_size_nr(leaf, slot);
+	item_size = btrfs_item_size(leaf, slot);
 	ptr = btrfs_item_ptr_offset(leaf, slot);
 
 	/*
@@ -139,7 +139,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
 	}
 
 	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	item_size = btrfs_item_size(leaf, path->slots[0]);
 	if (index)
 		*index = btrfs_inode_extref_index(leaf, extref);
 
@@ -208,7 +208,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	item_size = btrfs_item_size(leaf, path->slots[0]);
 
 	if (index)
 		*index = btrfs_inode_ref_index(leaf, ref);
@@ -282,7 +282,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
 
 	leaf = path->nodes[0];
 	ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
-	ptr += btrfs_item_size_nr(leaf, path->slots[0]) - ins_len;
+	ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len;
 	extref = (struct btrfs_inode_extref *)ptr;
 
 	btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
@@ -330,7 +330,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 		if (ref)
 			goto out;
 
-		old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+		old_size = btrfs_item_size(path->nodes[0], path->slots[0]);
 		btrfs_extend_item(path, ins_len);
 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_inode_ref);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index edfecfe62b4b..441d5130a4ec 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2084,7 +2084,7 @@ static noinline int copy_to_sk(struct btrfs_path *path,
 
 	for (i = slot; i < nritems; i++) {
 		item_off = btrfs_item_ptr_offset(leaf, i);
-		item_len = btrfs_item_size_nr(leaf, i);
+		item_len = btrfs_item_size(leaf, i);
 
 		btrfs_item_key_to_cpu(leaf, key, i);
 		if (!key_in_sk(key, sk))
@@ -2538,7 +2538,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
 	btrfs_item_key_to_cpu(leaf, &key, slot);
 
 	item_off = btrfs_item_ptr_offset(leaf, slot);
-	item_len = btrfs_item_size_nr(leaf, slot);
+	item_len = btrfs_item_size(leaf, slot);
 	/* Check if dirid in ROOT_REF corresponds to passed dirid */
 	rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
 	if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
@@ -2740,7 +2740,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
 
 			item_off = btrfs_item_ptr_offset(leaf, slot)
 					+ sizeof(struct btrfs_root_ref);
-			item_len = btrfs_item_size_nr(leaf, slot)
+			item_len = btrfs_item_size(leaf, slot)
 					- sizeof(struct btrfs_root_ref);
 			read_extent_buffer(leaf, subvol_info->name,
 					   item_off, item_len);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 52370af39afe..0775ae9f4419 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -85,7 +85,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
 	struct btrfs_disk_key key;
 	unsigned long end;
 	unsigned long ptr;
-	u32 item_size = btrfs_item_size_nr(eb, slot);
+	u32 item_size = btrfs_item_size(eb, slot);
 	u64 flags;
 	u64 offset;
 	int ref_index = 0;
@@ -227,7 +227,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
 		type = key.type;
 		pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n",
 			i, key.objectid, type, key.offset,
-			btrfs_item_offset_nr(l, i), btrfs_item_size_nr(l, i));
+			btrfs_item_offset(l, i), btrfs_item_size(l, i));
 		switch (type) {
 		case BTRFS_INODE_ITEM_KEY:
 			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
@@ -345,7 +345,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
 		case BTRFS_UUID_KEY_SUBVOL:
 		case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
 			print_uuid_item(l, btrfs_item_ptr_offset(l, i),
-					btrfs_item_size_nr(l, i));
+					btrfs_item_size(l, i));
 			break;
 		}
 	}
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index b1cb5a8c2999..a978676aa627 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -158,7 +158,7 @@ static int iterate_object_props(struct btrfs_root *root,
 
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 		cur = 0;
-		total_len = btrfs_item_size_nr(leaf, slot);
+		total_len = btrfs_item_size(leaf, slot);
 
 		while (cur < total_len) {
 			u32 name_len = btrfs_dir_name_len(leaf, di);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index e2b9f8616501..f34130d90dee 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -435,7 +435,7 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
 	struct btrfs_extent_data_ref *dref;
 	struct btrfs_shared_data_ref *sref;
 	struct extent_buffer *leaf = path->nodes[0];
-	u32 item_size = btrfs_item_size_nr(leaf, slot);
+	u32 item_size = btrfs_item_size(leaf, slot);
 	unsigned long end, ptr;
 	u64 offset, flags, count;
 	int type, ret;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index e0f93b357548..a3930da4eb3f 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -439,7 +439,7 @@ process_slot:
 			break;
 		}
 		next_key_min_offset = key.offset + datal;
-		size = btrfs_item_size_nr(leaf, slot);
+		size = btrfs_item_size(leaf, slot);
 		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
 				   size);
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 33a0ee7ac590..ee0a0efc7efd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3149,7 +3149,7 @@ static int add_tree_block(struct reloc_control *rc,
 	u64 owner = 0;
 
 	eb =  path->nodes[0];
-	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+	item_size = btrfs_item_size(eb, path->slots[0]);
 
 	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
 	    item_size >= sizeof(*ei) + sizeof(*bi)) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index d20166336557..3297368aa359 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -25,7 +25,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
 	u32 len;
 	int need_reset = 0;
 
-	len = btrfs_item_size_nr(eb, slot);
+	len = btrfs_item_size(eb, slot);
 	read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
 			   min_t(u32, len, sizeof(*item)));
 	if (len < sizeof(*item))
@@ -146,7 +146,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	l = path->nodes[0];
 	slot = path->slots[0];
 	ptr = btrfs_item_ptr_offset(l, slot);
-	old_len = btrfs_item_size_nr(l, slot);
+	old_len = btrfs_item_size(l, slot);
 
 	/*
 	 * If this is the first time we update the root item which originated
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 8f6ceea33969..d175c5ab1134 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -758,7 +758,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 
 	eb = path->nodes[0];
 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+	item_size = btrfs_item_size(eb, path->slots[0]);
 
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		do {
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b5d8e498006b..6e2708917fc1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -929,11 +929,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 	if (found_key->type == BTRFS_INODE_REF_KEY) {
 		ptr = (unsigned long)btrfs_item_ptr(eb, slot,
 						    struct btrfs_inode_ref);
-		total = btrfs_item_size_nr(eb, slot);
+		total = btrfs_item_size(eb, slot);
 		elem_size = sizeof(*iref);
 	} else {
 		ptr = btrfs_item_ptr_offset(eb, slot);
-		total = btrfs_item_size_nr(eb, slot);
+		total = btrfs_item_size(eb, slot);
 		elem_size = sizeof(*extref);
 	}
 
@@ -1047,7 +1047,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
 	cur = 0;
 	len = 0;
-	total = btrfs_item_size_nr(eb, slot);
+	total = btrfs_item_size(eb, slot);
 
 	num = 0;
 	while (cur < total) {
@@ -3618,7 +3618,7 @@ static int is_ancestor(struct btrfs_root *root,
 		    key.type != BTRFS_INODE_EXTREF_KEY)
 			break;
 
-		item_size = btrfs_item_size_nr(leaf, slot);
+		item_size = btrfs_item_size(leaf, slot);
 		while (cur_offset < item_size) {
 			u64 parent;
 			u64 parent_gen;
@@ -6562,7 +6562,7 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
 	}
 
 	leaf = path->nodes[0];
-	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	item_size = btrfs_item_size(leaf, path->slots[0]);
 	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 	while (cur_offset < item_size) {
 		extref = (struct btrfs_inode_extref *)(ptr +
@@ -6787,8 +6787,8 @@ static int tree_compare_item(struct btrfs_path *left_path,
 	int len1, len2;
 	unsigned long off1, off2;
 
-	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
-	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+	len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
+	len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
 	if (len1 != len2)
 		return 1;
 
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index bbef99175564..51a8b075c259 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -88,7 +88,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	if (btrfs_item_size_nr(eb, 0) != strlen(split1)) {
+	if (btrfs_item_size(eb, 0) != strlen(split1)) {
 		test_err("invalid len in the first split");
 		ret = -EINVAL;
 		goto out;
@@ -112,7 +112,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	if (btrfs_item_size_nr(eb, 1) != strlen(split2)) {
+	if (btrfs_item_size(eb, 1) != strlen(split2)) {
 		test_err("invalid len in the second split");
 		ret = -EINVAL;
 		goto out;
@@ -143,7 +143,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	if (btrfs_item_size_nr(eb, 0) != strlen(split3)) {
+	if (btrfs_item_size(eb, 0) != strlen(split3)) {
 		test_err("invalid len in the first split");
 		ret = -EINVAL;
 		goto out;
@@ -166,7 +166,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	if (btrfs_item_size_nr(eb, 1) != strlen(split4)) {
+	if (btrfs_item_size(eb, 1) != strlen(split4)) {
 		test_err("invalid len in the second split");
 		ret = -EINVAL;
 		goto out;
@@ -189,7 +189,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	if (btrfs_item_size_nr(eb, 2) != strlen(split2)) {
+	if (btrfs_item_size(eb, 2) != strlen(split2)) {
 		test_err("invalid len in the second split");
 		ret = -EINVAL;
 		goto out;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 7733e8ac0a69..09512d79e687 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -202,7 +202,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
 	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_file_extent_item *fi;
 	u32 sectorsize = fs_info->sectorsize;
-	u32 item_size = btrfs_item_size_nr(leaf, slot);
+	u32 item_size = btrfs_item_size(leaf, slot);
 	u64 extent_end;
 
 	if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
@@ -354,17 +354,17 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
 			key->offset, sectorsize);
 		return -EUCLEAN;
 	}
-	if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) {
+	if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) {
 		generic_err(leaf, slot,
 	"unaligned item size for csum item, have %u should be aligned to %u",
-			btrfs_item_size_nr(leaf, slot), csumsize);
+			btrfs_item_size(leaf, slot), csumsize);
 		return -EUCLEAN;
 	}
 	if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) {
 		u64 prev_csum_end;
 		u32 prev_item_size;
 
-		prev_item_size = btrfs_item_size_nr(leaf, slot - 1);
+		prev_item_size = btrfs_item_size(leaf, slot - 1);
 		prev_csum_end = (prev_item_size / csumsize) * sectorsize;
 		prev_csum_end += prev_key->offset;
 		if (unlikely(prev_csum_end > key->offset)) {
@@ -483,7 +483,7 @@ static int check_dir_item(struct extent_buffer *leaf,
 {
 	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_dir_item *di;
-	u32 item_size = btrfs_item_size_nr(leaf, slot);
+	u32 item_size = btrfs_item_size(leaf, slot);
 	u32 cur = 0;
 
 	if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
@@ -640,7 +640,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
 				  struct btrfs_key *key, int slot)
 {
 	struct btrfs_block_group_item bgi;
-	u32 item_size = btrfs_item_size_nr(leaf, slot);
+	u32 item_size = btrfs_item_size(leaf, slot);
 	u64 flags;
 	u64 type;
 
@@ -912,10 +912,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
 {
 	int num_stripes;
 
-	if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) {
+	if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
 		chunk_err(leaf, chunk, key->offset,
 			"invalid chunk item size: have %u expect [%zu, %u)",
-			btrfs_item_size_nr(leaf, slot),
+			btrfs_item_size(leaf, slot),
 			sizeof(struct btrfs_chunk),
 			BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
 		return -EUCLEAN;
@@ -927,10 +927,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
 		goto out;
 
 	if (unlikely(btrfs_chunk_item_size(num_stripes) !=
-		     btrfs_item_size_nr(leaf, slot))) {
+		     btrfs_item_size(leaf, slot))) {
 		chunk_err(leaf, chunk, key->offset,
 			"invalid chunk item size: have %u expect %lu",
-			btrfs_item_size_nr(leaf, slot),
+			btrfs_item_size(leaf, slot),
 			btrfs_chunk_item_size(num_stripes));
 		return -EUCLEAN;
 	}
@@ -1095,12 +1095,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
 	if (unlikely(ret < 0))
 		return ret;
 
-	if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
-		     btrfs_item_size_nr(leaf, slot) !=
+	if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) &&
+		     btrfs_item_size(leaf, slot) !=
 		     btrfs_legacy_root_item_size())) {
 		generic_err(leaf, slot,
 			    "invalid root item size, have %u expect %zu or %u",
-			    btrfs_item_size_nr(leaf, slot), sizeof(ri),
+			    btrfs_item_size(leaf, slot), sizeof(ri),
 			    btrfs_legacy_root_item_size());
 		return -EUCLEAN;
 	}
@@ -1111,7 +1111,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
 	 * And since we allow geneartion_v2 as 0, it will still pass the check.
 	 */
 	read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
-			   btrfs_item_size_nr(leaf, slot));
+			   btrfs_item_size(leaf, slot));
 
 	/* Generation related */
 	if (unlikely(btrfs_root_generation(&ri) >
@@ -1208,7 +1208,7 @@ static int check_extent_item(struct extent_buffer *leaf,
 	bool is_tree_block = false;
 	unsigned long ptr;	/* Current pointer inside inline refs */
 	unsigned long end;	/* Extent item end */
-	const u32 item_size = btrfs_item_size_nr(leaf, slot);
+	const u32 item_size = btrfs_item_size(leaf, slot);
 	u64 flags;
 	u64 generation;
 	u64 total_refs;		/* Total refs in btrfs_extent_item */
@@ -1432,10 +1432,10 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf,
 	if (key->type == BTRFS_SHARED_DATA_REF_KEY)
 		expect_item_size = sizeof(struct btrfs_shared_data_ref);
 
-	if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) {
+	if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) {
 		generic_err(leaf, slot,
 		"invalid item size, have %u expect %u for key type %u",
-			    btrfs_item_size_nr(leaf, slot),
+			    btrfs_item_size(leaf, slot),
 			    expect_item_size, key->type);
 		return -EUCLEAN;
 	}
@@ -1460,12 +1460,12 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
 {
 	struct btrfs_extent_data_ref *dref;
 	unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
-	const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot);
+	const unsigned long end = ptr + btrfs_item_size(leaf, slot);
 
-	if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) {
+	if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) {
 		generic_err(leaf, slot,
 	"invalid item size, have %u expect aligned to %zu for key type %u",
-			    btrfs_item_size_nr(leaf, slot),
+			    btrfs_item_size(leaf, slot),
 			    sizeof(*dref), key->type);
 		return -EUCLEAN;
 	}
@@ -1507,16 +1507,16 @@ static int check_inode_ref(struct extent_buffer *leaf,
 	if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
 		return -EUCLEAN;
 	/* namelen can't be 0, so item_size == sizeof() is also invalid */
-	if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) {
+	if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) {
 		inode_ref_err(leaf, slot,
 			"invalid item size, have %u expect (%zu, %u)",
-			btrfs_item_size_nr(leaf, slot),
+			btrfs_item_size(leaf, slot),
 			sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
 		return -EUCLEAN;
 	}
 
 	ptr = btrfs_item_ptr_offset(leaf, slot);
-	end = ptr + btrfs_item_size_nr(leaf, slot);
+	end = ptr + btrfs_item_size(leaf, slot);
 	while (ptr < end) {
 		u16 namelen;
 
@@ -1689,7 +1689,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
 		if (slot == 0)
 			item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info);
 		else
-			item_end_expected = btrfs_item_offset_nr(leaf,
+			item_end_expected = btrfs_item_offset(leaf,
 								 slot - 1);
 		if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) {
 			generic_err(leaf, slot,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3da8452f682c..9165486b554e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -386,7 +386,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans,
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 		overwrite_root = 1;
 
-	item_size = btrfs_item_size_nr(eb, slot);
+	item_size = btrfs_item_size(eb, slot);
 	src_ptr = btrfs_item_ptr_offset(eb, slot);
 
 	/* Our caller must have done a search for the key for us. */
@@ -409,7 +409,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans,
 	if (ret == 0) {
 		char *src_copy;
 		char *dst_copy;
-		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+		u32 dst_size = btrfs_item_size(path->nodes[0],
 						  path->slots[0]);
 		if (dst_size != item_size)
 			goto insert;
@@ -503,7 +503,7 @@ insert:
 	/* make sure any existing item is the correct size */
 	if (ret == -EEXIST || ret == -EOVERFLOW) {
 		u32 found_size;
-		found_size = btrfs_item_size_nr(path->nodes[0],
+		found_size = btrfs_item_size(path->nodes[0],
 						path->slots[0]);
 		if (found_size > item_size)
 			btrfs_truncate_item(path, item_size, 1);
@@ -1096,7 +1096,7 @@ again:
 		 * otherwise they must be unlinked as a conflict
 		 */
 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
-		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
 		while (ptr < ptr_end) {
 			victim_ref = (struct btrfs_inode_ref *)ptr;
 			victim_name_len = btrfs_inode_ref_name_len(leaf,
@@ -1155,7 +1155,7 @@ again:
 
 		leaf = path->nodes[0];
 
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		item_size = btrfs_item_size(leaf, path->slots[0]);
 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
 
 		while (cur_offset < item_size) {
@@ -1318,7 +1318,7 @@ again:
 
 	eb = path->nodes[0];
 	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
-	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
+	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
 	while (ref_ptr < ref_end) {
 		char *name = NULL;
 		int namelen;
@@ -1504,7 +1504,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	int ref_struct_size;
 
 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
-	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+	ref_end = ref_ptr + btrfs_item_size(eb, slot);
 
 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
 		struct btrfs_inode_extref *r;
@@ -1678,7 +1678,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
 			break;
 
 		leaf = path->nodes[0];
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		item_size = btrfs_item_size(leaf, path->slots[0]);
 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 		cur_offset = 0;
 
@@ -1732,7 +1732,7 @@ process_slot:
 		    key.type != BTRFS_INODE_REF_KEY)
 			break;
 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
-		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+		ptr_end = ptr + btrfs_item_size(path->nodes[0],
 						   path->slots[0]);
 		while (ptr < ptr_end) {
 			struct btrfs_inode_ref *ref;
@@ -2406,7 +2406,7 @@ process_leaf:
 		}
 
 		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
-		total_size = btrfs_item_size_nr(path->nodes[0], i);
+		total_size = btrfs_item_size(path->nodes[0], i);
 		cur = 0;
 		while (cur < total_size) {
 			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
@@ -3640,7 +3640,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
 
 	if (count == 1) {
 		btrfs_item_key_to_cpu(src, &key, start_slot);
-		item_size = btrfs_item_size_nr(src, start_slot);
+		item_size = btrfs_item_size(src, start_slot);
 		batch.keys = &key;
 		batch.data_sizes = &item_size;
 		batch.total_data_size = item_size;
@@ -3663,7 +3663,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
 			const int slot = start_slot + i;
 
 			btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
-			ins_sizes[i] = btrfs_item_size_nr(src, slot);
+			ins_sizes[i] = btrfs_item_size(src, slot);
 			batch.total_data_size += ins_sizes[i];
 		}
 	}
@@ -4296,7 +4296,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	batch.nr = nr;
 
 	for (i = 0; i < nr; i++) {
-		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+		ins_sizes[i] = btrfs_item_size(src, i + start_slot);
 		batch.total_data_size += ins_sizes[i];
 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
 	}
@@ -5112,7 +5112,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
 	struct btrfs_path *search_path;
 	char *name = NULL;
 	u32 name_len = 0;
-	u32 item_size = btrfs_item_size_nr(eb, slot);
+	u32 item_size = btrfs_item_size(eb, slot);
 	u32 cur_offset = 0;
 	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
 
@@ -6033,7 +6033,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
 		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
 			break;
 
-		item_size = btrfs_item_size_nr(leaf, slot);
+		item_size = btrfs_item_size(leaf, slot);
 		ptr = btrfs_item_ptr_offset(leaf, slot);
 		while (cur_offset < item_size) {
 			struct btrfs_key inode_key;
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 74023c8a783f..b458452a1aaf 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -52,7 +52,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
 
 	eb = path->nodes[0];
 	slot = path->slots[0];
-	item_size = btrfs_item_size_nr(eb, slot);
+	item_size = btrfs_item_size(eb, slot);
 	offset = btrfs_item_ptr_offset(eb, slot);
 	ret = -ENOENT;
 
@@ -125,7 +125,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
 		eb = path->nodes[0];
 		slot = path->slots[0];
 		offset = btrfs_item_ptr_offset(eb, slot);
-		offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
+		offset += btrfs_item_size(eb, slot) - sizeof(subid_le);
 	} else {
 		btrfs_warn(fs_info,
 			   "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
@@ -186,7 +186,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
 	eb = path->nodes[0];
 	slot = path->slots[0];
 	offset = btrfs_item_ptr_offset(eb, slot);
-	item_size = btrfs_item_size_nr(eb, slot);
+	item_size = btrfs_item_size(eb, slot);
 	if (!IS_ALIGNED(item_size, sizeof(u64))) {
 		btrfs_warn(fs_info, "uuid item with illegal size %lu!",
 			   (unsigned long)item_size);
@@ -208,7 +208,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
 		goto out;
 	}
 
-	item_size = btrfs_item_size_nr(eb, slot);
+	item_size = btrfs_item_size(eb, slot);
 	if (item_size == sizeof(subid)) {
 		ret = btrfs_del_item(trans, uuid_root, path);
 		goto out;
@@ -331,7 +331,7 @@ again_search_slot:
 			goto skip;
 
 		offset = btrfs_item_ptr_offset(leaf, slot);
-		item_size = btrfs_item_size_nr(leaf, slot);
+		item_size = btrfs_item_size(leaf, slot);
 		if (!IS_ALIGNED(item_size, sizeof(u64))) {
 			btrfs_warn(fs_info,
 				   "uuid item with illegal size %lu!",
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 4968535dfff0..90eb5c2830a9 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -333,7 +333,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
 		if (key.objectid != btrfs_ino(inode) || key.type != key_type)
 			break;
 
-		item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset;
+		item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset;
 
 		if (copied > 0) {
 			/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fd0ced829edb..945cc6e72125 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4643,7 +4643,7 @@ int btrfs_uuid_scan_kthread(void *data)
 
 		eb = path->nodes[0];
 		slot = path->slots[0];
-		item_size = btrfs_item_size_nr(eb, slot);
+		item_size = btrfs_item_size(eb, slot);
 		if (item_size < sizeof(root_item))
 			goto skip;
 
@@ -7730,7 +7730,7 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device,
 	}
 	slot = path->slots[0];
 	eb = path->nodes[0];
-	item_size = btrfs_item_size_nr(eb, slot);
+	item_size = btrfs_item_size(eb, slot);
 
 	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
 
@@ -7808,7 +7808,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 	}
 
 	if (ret == 0 &&
-	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
 		/* need to delete old one and insert a new one */
 		ret = btrfs_del_item(trans, dev_root, path);
 		if (ret != 0) {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 0f04bb7f3ce4..99abf41b89b9 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -168,7 +168,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
 		const int slot = path->slots[0];
 		struct extent_buffer *leaf = path->nodes[0];
 		const u16 old_data_len = btrfs_dir_data_len(leaf, di);
-		const u32 item_size = btrfs_item_size_nr(leaf, slot);
+		const u32 item_size = btrfs_item_size(leaf, slot);
 		const u32 data_size = sizeof(*di) + name_len + size;
 		unsigned long data_ptr;
 		char *ptr;
@@ -196,7 +196,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
 		}
 
 		ptr = btrfs_item_ptr(leaf, slot, char);
-		ptr += btrfs_item_size_nr(leaf, slot) - data_size;
+		ptr += btrfs_item_size(leaf, slot) - data_size;
 		di = (struct btrfs_dir_item *)ptr;
 		btrfs_set_dir_data_len(leaf, di, size);
 		data_ptr = ((unsigned long)(di + 1)) + name_len;
@@ -333,7 +333,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 			goto next_item;
 
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
-		item_size = btrfs_item_size_nr(leaf, slot);
+		item_size = btrfs_item_size(leaf, slot);
 		cur = 0;
 		while (cur < item_size) {
 			u16 name_len = btrfs_dir_name_len(leaf, di);
-- 
cgit v1.2.3-59-g8ed1b


From 5a08663d01c544bc1d0e552d060ccdda20e9103f Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 21 Oct 2021 14:58:36 -0400
Subject: btrfs: remove the btrfs_item_end() helper

We're only using btrfs_item_end() from btrfs_item_end_nr(), so this can
be collapsed.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8f4807b7e1d0..1d4feb98aa88 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1976,17 +1976,6 @@ static inline struct btrfs_item *btrfs_item_nr(int nr)
 	return (struct btrfs_item *)btrfs_item_nr_offset(nr);
 }
 
-static inline u32 btrfs_item_end(const struct extent_buffer *eb,
-				 struct btrfs_item *item)
-{
-	return btrfs_raw_item_offset(eb, item) + btrfs_raw_item_size(eb, item);
-}
-
-static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
-{
-	return btrfs_item_end(eb, btrfs_item_nr(nr));
-}
-
 #define BTRFS_ITEM_SETGET_FUNCS(member)						\
 static inline u32 btrfs_item_##member(const struct extent_buffer *eb,		\
 				      int slot)					\
@@ -2014,6 +2003,11 @@ static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token,
 BTRFS_ITEM_SETGET_FUNCS(offset)
 BTRFS_ITEM_SETGET_FUNCS(size);
 
+static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
+{
+	return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
+}
+
 static inline void btrfs_item_key(const struct extent_buffer *eb,
 			   struct btrfs_disk_key *disk_key, int nr)
 {
-- 
cgit v1.2.3-59-g8ed1b


From dc2e724e0fc070b63fad892389e047909a1de07b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 21 Oct 2021 14:58:37 -0400
Subject: btrfs: rename btrfs_item_end_nr to btrfs_item_data_end

The name btrfs_item_end_nr() is a bit of a misnomer, as it's actually
the offset of the end of the data the item points to.  In fact, all of
the helpers that we use together with btrfs_item_end_nr() have "data" in
their name, like BTRFS_LEAF_DATA_SIZE() and leaf_data().  Rename it to
btrfs_item_data_end() to make it clear what this helper is giving us.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c        | 10 +++++-----
 fs/btrfs/ctree.h        |  2 +-
 fs/btrfs/tree-checker.c |  8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 353af95c40c1..94b2be118b50 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2720,7 +2720,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
 	/* push left to right */
 	right_nritems = btrfs_header_nritems(right);
 
-	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
+	push_space = btrfs_item_data_end(left, left_nritems - push_items);
 	push_space -= leaf_data_end(left);
 
 	/* make room in the right data area */
@@ -3120,7 +3120,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
 
 	nritems = nritems - mid;
 	btrfs_set_header_nritems(right, nritems);
-	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
+	data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
 
 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
 			   btrfs_item_nr_offset(mid),
@@ -3131,7 +3131,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
 		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
 		     leaf_data_end(l), data_copy_size);
 
-	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
+	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
 
 	btrfs_init_map_token(&token, right);
 	for (i = 0; i < nritems; i++) {
@@ -3683,7 +3683,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
 		BUG();
 	}
 	slot = path->slots[0];
-	old_data = btrfs_item_end_nr(leaf, slot);
+	old_data = btrfs_item_data_end(leaf, slot);
 
 	BUG_ON(slot < 0);
 	if (slot >= nritems) {
@@ -3770,7 +3770,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
 
 	btrfs_init_map_token(&token, leaf);
 	if (slot != nritems) {
-		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+		unsigned int old_data = btrfs_item_data_end(leaf, slot);
 
 		if (old_data < data_end) {
 			btrfs_print_leaf(leaf);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1d4feb98aa88..1f2a50c22b61 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2003,7 +2003,7 @@ static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token,
 BTRFS_ITEM_SETGET_FUNCS(offset)
 BTRFS_ITEM_SETGET_FUNCS(size);
 
-static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
 {
 	return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
 }
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 09512d79e687..72e1c942197d 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1691,10 +1691,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
 		else
 			item_end_expected = btrfs_item_offset(leaf,
 								 slot - 1);
-		if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) {
+		if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) {
 			generic_err(leaf, slot,
 				"unexpected item end, have %u expect %u",
-				btrfs_item_end_nr(leaf, slot),
+				btrfs_item_data_end(leaf, slot),
 				item_end_expected);
 			return -EUCLEAN;
 		}
@@ -1704,11 +1704,11 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
 		 * just in case all the items are consistent to each other, but
 		 * all point outside of the leaf.
 		 */
-		if (unlikely(btrfs_item_end_nr(leaf, slot) >
+		if (unlikely(btrfs_item_data_end(leaf, slot) >
 			     BTRFS_LEAF_DATA_SIZE(fs_info))) {
 			generic_err(leaf, slot,
 			"slot end outside of leaf, have %u expect range [0, %u]",
-				btrfs_item_end_nr(leaf, slot),
+				btrfs_item_data_end(leaf, slot),
 				BTRFS_LEAF_DATA_SIZE(fs_info));
 			return -EUCLEAN;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From eab67c06456850621146465d122e58d64e269cab Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Thu, 4 Nov 2021 17:00:12 -0700
Subject: btrfs: send: remove unused found_type parameter to
 lookup_dir_item_inode()

As far as I can tell, this was never used. No functional change.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6e2708917fc1..877276f34d72 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1688,8 +1688,7 @@ out:
  */
 static int lookup_dir_item_inode(struct btrfs_root *root,
 				 u64 dir, const char *name, int name_len,
-				 u64 *found_inode,
-				 u8 *found_type)
+				 u64 *found_inode)
 {
 	int ret = 0;
 	struct btrfs_dir_item *di;
@@ -1712,7 +1711,6 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
 		goto out;
 	}
 	*found_inode = key.objectid;
-	*found_type = btrfs_dir_type(path->nodes[0], di);
 
 out:
 	btrfs_free_path(path);
@@ -1835,7 +1833,6 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 	int ret = 0;
 	u64 gen;
 	u64 other_inode = 0;
-	u8 other_type = 0;
 
 	if (!sctx->parent_root)
 		goto out;
@@ -1863,7 +1860,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 	}
 
 	ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
-			&other_inode, &other_type);
+				    &other_inode);
 	if (ret < 0 && ret != -ENOENT)
 		goto out;
 	if (ret) {
@@ -1908,7 +1905,6 @@ static int did_overwrite_ref(struct send_ctx *sctx,
 	int ret = 0;
 	u64 gen;
 	u64 ow_inode;
-	u8 other_type;
 
 	if (!sctx->parent_root)
 		goto out;
@@ -1932,7 +1928,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
 
 	/* check if the ref was overwritten by another ref */
 	ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
-			&ow_inode, &other_type);
+				    &ow_inode);
 	if (ret < 0 && ret != -ENOENT)
 		goto out;
 	if (ret) {
-- 
cgit v1.2.3-59-g8ed1b


From b1dea4e7322d034c02a46d03a0da14ed1872fb6a Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Thu, 4 Nov 2021 17:00:13 -0700
Subject: btrfs: send: remove unused type parameter to iterate_dir_item_t

Again, I don't think this was ever used since iterate_dir_item() is only
used for xattrs. No functional change.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 877276f34d72..6bdcb9d481d5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1002,7 +1002,7 @@ out:
 typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
 				  const char *name, int name_len,
 				  const char *data, int data_len,
-				  u8 type, void *ctx);
+				  void *ctx);
 
 /*
  * Helper function to iterate the entries in ONE btrfs_dir_item.
@@ -1027,7 +1027,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 	u32 total;
 	int slot;
 	int num;
-	u8 type;
 
 	/*
 	 * Start with a small buffer (1 page). If later we end up needing more
@@ -1053,10 +1052,9 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 	while (cur < total) {
 		name_len = btrfs_dir_name_len(eb, di);
 		data_len = btrfs_dir_data_len(eb, di);
-		type = btrfs_dir_type(eb, di);
 		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
 
-		if (type == BTRFS_FT_XATTR) {
+		if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
 			if (name_len > XATTR_NAME_MAX) {
 				ret = -ENAMETOOLONG;
 				goto out;
@@ -1106,7 +1104,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 		cur += len;
 
 		ret = iterate(num, &di_key, buf, name_len, buf + name_len,
-				data_len, type, ctx);
+			      data_len, ctx);
 		if (ret < 0)
 			goto out;
 		if (ret) {
@@ -4643,9 +4641,8 @@ out:
 }
 
 static int __process_new_xattr(int num, struct btrfs_key *di_key,
-			       const char *name, int name_len,
-			       const char *data, int data_len,
-			       u8 type, void *ctx)
+			       const char *name, int name_len, const char *data,
+			       int data_len, void *ctx)
 {
 	int ret;
 	struct send_ctx *sctx = ctx;
@@ -4689,8 +4686,7 @@ out:
 
 static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
 				   const char *name, int name_len,
-				   const char *data, int data_len,
-				   u8 type, void *ctx)
+				   const char *data, int data_len, void *ctx)
 {
 	int ret;
 	struct send_ctx *sctx = ctx;
@@ -4735,10 +4731,8 @@ struct find_xattr_ctx {
 	int found_data_len;
 };
 
-static int __find_xattr(int num, struct btrfs_key *di_key,
-			const char *name, int name_len,
-			const char *data, int data_len,
-			u8 type, void *vctx)
+static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
+			int name_len, const char *data, int data_len, void *vctx)
 {
 	struct find_xattr_ctx *ctx = vctx;
 
@@ -4788,7 +4782,7 @@ static int find_xattr(struct btrfs_root *root,
 static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
 				       const char *name, int name_len,
 				       const char *data, int data_len,
-				       u8 type, void *ctx)
+				       void *ctx)
 {
 	int ret;
 	struct send_ctx *sctx = ctx;
@@ -4800,12 +4794,12 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
 			 &found_data_len);
 	if (ret == -ENOENT) {
 		ret = __process_new_xattr(num, di_key, name, name_len, data,
-				data_len, type, ctx);
+					  data_len, ctx);
 	} else if (ret >= 0) {
 		if (data_len != found_data_len ||
 		    memcmp(data, found_data, data_len)) {
 			ret = __process_new_xattr(num, di_key, name, name_len,
-					data, data_len, type, ctx);
+						  data, data_len, ctx);
 		} else {
 			ret = 0;
 		}
@@ -4818,7 +4812,7 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
 static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
 					   const char *name, int name_len,
 					   const char *data, int data_len,
-					   u8 type, void *ctx)
+					   void *ctx)
 {
 	int ret;
 	struct send_ctx *sctx = ctx;
@@ -4827,7 +4821,7 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
 			 name, name_len, NULL, NULL);
 	if (ret == -ENOENT)
 		ret = __process_deleted_xattr(num, di_key, name, name_len, data,
-				data_len, type, ctx);
+					      data_len, ctx);
 	else if (ret >= 0)
 		ret = 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From fd8808097ad23ad8ae79f002b7b2a62e50f2c7eb Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Tue, 21 Sep 2021 12:33:23 +0800
Subject: btrfs: switch seeding_dev in init_new_device to bool

Declare int seeding_dev as a bool. Also, move its declaration a line
below to adjust packing.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 945cc6e72125..836e1b661fc9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2590,8 +2590,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	u64 orig_super_total_bytes;
 	u64 orig_super_num_devices;
-	int seeding_dev = 0;
 	int ret = 0;
+	bool seeding_dev = false;
 	bool locked = false;
 
 	if (sb_rdonly(sb) && !fs_devices->seeding)
@@ -2608,7 +2608,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	}
 
 	if (fs_devices->seeding) {
-		seeding_dev = 1;
+		seeding_dev = true;
 		down_write(&sb->s_umount);
 		mutex_lock(&uuid_mutex);
 		locked = true;
-- 
cgit v1.2.3-59-g8ed1b


From 849eae5e57a703105aa6cdce0d860ab95f44d81c Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Tue, 9 Nov 2021 17:51:58 +0800
Subject: btrfs: consolidate device_list_mutex in prepare_sprout to its parent

btrfs_prepare_sprout() splices seed devices into its own struct fs_devices,
so that its parent function btrfs_init_new_device() can add the new sprout
device to fs_info->fs_devices.

Both btrfs_prepare_sprout() and btrfs_init_new_device() need
device_list_mutex. But they are holding it separately, thus creating a
small race window. Close it and hold device_list_mutex across both
functions btrfs_init_new_device() and btrfs_prepare_sprout().

Split btrfs_prepare_sprout() into btrfs_init_sprout() and
btrfs_setup_sprout(). This split is essential because device_list_mutex
must not be held for allocations in btrfs_init_sprout() but must be held
for btrfs_setup_sprout(). So now a common device_list_mutex can be used
between btrfs_init_new_device() and btrfs_setup_sprout().

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 69 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 836e1b661fc9..53753e04af14 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2431,21 +2431,15 @@ struct btrfs_device *btrfs_find_device_by_devspec(
 	return device;
 }
 
-/*
- * does all the dirty work required for changing file system's UUID.
- */
-static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
+static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_fs_devices *old_devices;
 	struct btrfs_fs_devices *seed_devices;
-	struct btrfs_super_block *disk_super = fs_info->super_copy;
-	struct btrfs_device *device;
-	u64 super_flags;
 
 	lockdep_assert_held(&uuid_mutex);
 	if (!fs_devices->seeding)
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 
 	/*
 	 * Private copy of the seed devices, anchored at
@@ -2453,7 +2447,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
 	 */
 	seed_devices = alloc_fs_devices(NULL, NULL);
 	if (IS_ERR(seed_devices))
-		return PTR_ERR(seed_devices);
+		return seed_devices;
 
 	/*
 	 * It's necessary to retain a copy of the original seed fs_devices in
@@ -2464,7 +2458,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
 	old_devices = clone_fs_devices(fs_devices);
 	if (IS_ERR(old_devices)) {
 		kfree(seed_devices);
-		return PTR_ERR(old_devices);
+		return old_devices;
 	}
 
 	list_add(&old_devices->fs_list, &fs_uuids);
@@ -2475,7 +2469,41 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
 	INIT_LIST_HEAD(&seed_devices->alloc_list);
 	mutex_init(&seed_devices->device_list_mutex);
 
-	mutex_lock(&fs_devices->device_list_mutex);
+	return seed_devices;
+}
+
+/*
+ * Splice seed devices into the sprout fs_devices.
+ * Generate a new fsid for the sprouted read-write filesystem.
+ */
+static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
+			       struct btrfs_fs_devices *seed_devices)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_super_block *disk_super = fs_info->super_copy;
+	struct btrfs_device *device;
+	u64 super_flags;
+
+	/*
+	 * We are updating the fsid, the thread leading to device_list_add()
+	 * could race, so uuid_mutex is needed.
+	 */
+	lockdep_assert_held(&uuid_mutex);
+
+	/*
+	 * The threads listed below may traverse dev_list but can do that without
+	 * device_list_mutex:
+	 * - All device ops and balance - as we are in btrfs_exclop_start.
+	 * - Various dev_list readers - are using RCU.
+	 * - btrfs_ioctl_fitrim() - is using RCU.
+	 *
+	 * For-read threads as below are using device_list_mutex:
+	 * - Readonly scrub btrfs_scrub_dev()
+	 * - Readonly scrub btrfs_scrub_progress()
+	 * - btrfs_get_dev_stats()
+	 */
+	lockdep_assert_held(&fs_devices->device_list_mutex);
+
 	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
 			      synchronize_rcu);
 	list_for_each_entry(device, &seed_devices->devices, dev_list)
@@ -2491,13 +2519,10 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
 	generate_random_uuid(fs_devices->fsid);
 	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
-	mutex_unlock(&fs_devices->device_list_mutex);
 
 	super_flags = btrfs_super_flags(disk_super) &
 		      ~BTRFS_SUPER_FLAG_SEEDING;
 	btrfs_set_super_flags(disk_super, super_flags);
-
-	return 0;
 }
 
 /*
@@ -2588,6 +2613,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	struct super_block *sb = fs_info->sb;
 	struct rcu_string *name;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_fs_devices *seed_devices;
 	u64 orig_super_total_bytes;
 	u64 orig_super_num_devices;
 	int ret = 0;
@@ -2671,18 +2697,25 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 
 	if (seeding_dev) {
 		btrfs_clear_sb_rdonly(sb);
-		ret = btrfs_prepare_sprout(fs_info);
-		if (ret) {
+
+		/* GFP_KERNEL allocation must not be under device_list_mutex */
+		seed_devices = btrfs_init_sprout(fs_info);
+		if (IS_ERR(seed_devices)) {
+			ret = PTR_ERR(seed_devices);
 			btrfs_abort_transaction(trans, ret);
 			goto error_trans;
 		}
+	}
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	if (seeding_dev) {
+		btrfs_setup_sprout(fs_info, seed_devices);
 		btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
 						device);
 	}
 
 	device->fs_devices = fs_devices;
 
-	mutex_lock(&fs_devices->device_list_mutex);
 	mutex_lock(&fs_info->chunk_mutex);
 	list_add_rcu(&device->dev_list, &fs_devices->devices);
 	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
@@ -2744,7 +2777,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 
 		/*
 		 * fs_devices now represents the newly sprouted filesystem and
-		 * its fsid has been changed by btrfs_prepare_sprout
+		 * its fsid has been changed by btrfs_sprout_splice().
 		 */
 		btrfs_sysfs_update_sprout_fsid(fs_devices);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From dfba78dc1c3bd6a61669970cfb61e766322c9231 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 10 Nov 2021 10:05:21 +0000
Subject: btrfs: reduce the scope of the tree log mutex during transaction
 commit

In the transaction commit path we are acquiring the tree log mutex too
early and we have a stale comment because:

1) It mentions a function named btrfs_commit_tree_roots(), which does not
   exist anymore; it was the old name of commit_cowonly_roots(), renamed
   a very long time ago by commit 5d4f98a28c7d33 ("Btrfs: Mixed back
   reference  (FORWARD ROLLING FORMAT CHANGE)");

2) It mentions that we need to acquire the tree log mutex at that point
   to ensure we have no running log writers. That is not correct anymore,
   for many years at least, since we are guaranteed that we do not have
   any log writers at that point simply because we have set the state of
   the transaction to TRANS_STATE_COMMIT_DOING and have waited for all
   writers to complete - meaning no one can log until we change the state
   of the transaction to TRANS_STATE_UNBLOCKED. Any attempts to join the
   transaction or start a new one will block until we do that state
   transition;

3) The comment mentions a "trans mutex", which has not existed since 2011,
   when commit a4abeea41adf ("Btrfs: kill trans_mutex") removed it;

4) The current use of the tree log mutex is to ensure proper serialization
   of super block writes - if someone started a new transaction and uses it
   for logging, it will wait for the previous transaction to write its
   super block before writing the super block when attempting to sync the
   log.

So acquire the tree log mutex only when it's absolutely needed, before
setting the transaction state to TRANS_STATE_UNBLOCKED, fix and move the
stale comment, add some assertions and new comments where appropriate.

Also, this has no effect on concurrency or performance, since the new
start of the critical section is still when the transaction is in the
state TRANS_STATE_COMMIT_DOING.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 76 ++++++++++++++++++++++++++++----------------------
 1 file changed, 42 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1c3a1189c0bd..eb1c916d4884 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -162,6 +162,12 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
 	struct btrfs_root *root, *tmp;
 	struct btrfs_caching_control *caching_ctl, *next;
 
+	/*
+	 * At this point no one can be using this transaction to modify any tree
+	 * and no one can start another transaction to modify any tree either.
+	 */
+	ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
+
 	down_write(&fs_info->commit_root_sem);
 	list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
 				 dirty_list) {
@@ -1236,6 +1242,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
 	struct extent_buffer *eb;
 	int ret;
 
+	/*
+	 * At this point no one can be using this transaction to modify any tree
+	 * and no one can start another transaction to modify any tree either.
+	 */
+	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+
 	eb = btrfs_lock_root_node(fs_info->tree_root);
 	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
 			      0, &eb, BTRFS_NESTING_COW);
@@ -1327,7 +1339,8 @@ void btrfs_add_dead_root(struct btrfs_root *root)
 }
 
 /*
- * update all the cowonly tree roots on disk
+ * Update each subvolume root and its relocation root, if it exists, in the tree
+ * of tree roots. Also free log roots if they exist.
  */
 static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
 {
@@ -1336,6 +1349,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
 	int i;
 	int ret;
 
+	/*
+	 * At this point no one can be using this transaction to modify any tree
+	 * and no one can start another transaction to modify any tree either.
+	 */
+	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+
 	spin_lock(&fs_info->fs_roots_radix_lock);
 	while (1) {
 		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
@@ -1348,6 +1367,14 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
 			struct btrfs_root *root = gang[i];
 			int ret2;
 
+			/*
+			 * At this point we can neither have tasks logging inodes
+			 * from a root nor trying to commit a log tree.
+			 */
+			ASSERT(atomic_read(&root->log_writers) == 0);
+			ASSERT(atomic_read(&root->log_commit[0]) == 0);
+			ASSERT(atomic_read(&root->log_commit[1]) == 0);
+
 			radix_tree_tag_clear(&fs_info->fs_roots_radix,
 					(unsigned long)root->root_key.objectid,
 					BTRFS_ROOT_TRANS_TAG);
@@ -1472,12 +1499,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
-	/*
-	 * We are going to commit transaction, see btrfs_commit_transaction()
-	 * comment for reason locking tree_log_mutex
-	 */
-	mutex_lock(&fs_info->tree_log_mutex);
-
 	ret = commit_fs_roots(trans);
 	if (ret)
 		goto out;
@@ -1513,8 +1534,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
 			"Error while writing out transaction for qgroup");
 
 out:
-	mutex_unlock(&fs_info->tree_log_mutex);
-
 	/*
 	 * Force parent root to be updated, as we recorded it before so its
 	 * last_trans == cur_transid.
@@ -2246,24 +2265,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
 	WARN_ON(cur_trans != trans->transaction);
 
-	/* btrfs_commit_tree_roots is responsible for getting the
-	 * various roots consistent with each other.  Every pointer
-	 * in the tree of tree roots has to point to the most up to date
-	 * root for every subvolume and other tree.  So, we have to keep
-	 * the tree logging code from jumping in and changing any
-	 * of the trees.
-	 *
-	 * At this point in the commit, there can't be any tree-log
-	 * writers, but a little lower down we drop the trans mutex
-	 * and let new people in.  By holding the tree_log_mutex
-	 * from now until after the super is written, we avoid races
-	 * with the tree-log code.
-	 */
-	mutex_lock(&fs_info->tree_log_mutex);
-
 	ret = commit_fs_roots(trans);
 	if (ret)
-		goto unlock_tree_log;
+		goto unlock_reloc;
 
 	/*
 	 * Since the transaction is done, we can apply the pending changes
@@ -2282,11 +2286,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	 */
 	ret = btrfs_qgroup_account_extents(trans);
 	if (ret < 0)
-		goto unlock_tree_log;
+		goto unlock_reloc;
 
 	ret = commit_cowonly_roots(trans);
 	if (ret)
-		goto unlock_tree_log;
+		goto unlock_reloc;
 
 	/*
 	 * The tasks which save the space cache and inode cache may also
@@ -2294,7 +2298,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	 */
 	if (TRANS_ABORTED(cur_trans)) {
 		ret = cur_trans->aborted;
-		goto unlock_tree_log;
+		goto unlock_reloc;
 	}
 
 	cur_trans = fs_info->running_transaction;
@@ -2327,6 +2331,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
 	btrfs_trans_release_chunk_metadata(trans);
 
+	/*
+	 * Before changing the transaction state to TRANS_STATE_UNBLOCKED and
+	 * setting fs_info->running_transaction to NULL, lock tree_log_mutex to
+	 * make sure that before we commit our superblock, no other task can
+	 * start a new transaction and commit a log tree before we commit our
+	 * superblock. Anyone trying to commit a log tree locks this mutex before
+	 * writing its superblock.
+	 */
+	mutex_lock(&fs_info->tree_log_mutex);
+
 	spin_lock(&fs_info->trans_lock);
 	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	fs_info->running_transaction = NULL;
@@ -2339,10 +2353,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	if (ret) {
 		btrfs_handle_fs_error(fs_info, ret,
 				      "Error while writing out transaction");
-		/*
-		 * reloc_mutex has been unlocked, tree_log_mutex is still held
-		 * but we can't jump to unlock_tree_log causing double unlock
-		 */
 		mutex_unlock(&fs_info->tree_log_mutex);
 		goto scrub_continue;
 	}
@@ -2404,8 +2414,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
 	return ret;
 
-unlock_tree_log:
-	mutex_unlock(&fs_info->tree_log_mutex);
 unlock_reloc:
 	mutex_unlock(&fs_info->reloc_mutex);
 scrub_continue:
-- 
cgit v1.2.3-59-g8ed1b


From d21deec5e7e6f9b02259cff171cbf1e53fb98429 Mon Sep 17 00:00:00 2001
From: Su Yue <l@damenly.su>
Date: Wed, 10 Nov 2021 14:42:17 +0800
Subject: btrfs: remove unused parameter fs_devices from btrfs_init_workqueues

Since commit ba8a9d079543 ("Btrfs: delete the entire async bio submission
framework") removed submit workqueues, the parameter fs_devices is not used
anymore.

Remove it, no functional changes.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Su Yue <l@damenly.su>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b3f2e2232326..6408948b3e2c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2291,8 +2291,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
 	mutex_init(&fs_info->qgroup_rescan_lock);
 }
 
-static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
-		struct btrfs_fs_devices *fs_devices)
+static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 {
 	u32 max_active = fs_info->thread_pool_size;
 	unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
@@ -3423,7 +3422,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		fs_info->subpage_info = subpage_info;
 	}
 
-	ret = btrfs_init_workqueues(fs_info, fs_devices);
+	ret = btrfs_init_workqueues(fs_info);
 	if (ret) {
 		err = ret;
 		goto fail_sb_buffer;
-- 
cgit v1.2.3-59-g8ed1b


From 16beac87e95e2fb278b552397c8260637f8a63f7 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Thu, 11 Nov 2021 14:14:38 +0900
Subject: btrfs: zoned: cache reported zone during mount

When mounting a device, we are reporting the zones twice: once for
checking the zone attributes in btrfs_get_dev_zone_info and once for
loading block groups' zone info in
btrfs_load_block_group_zone_info(). With a lot of block groups, that
leads to a lot of REPORT ZONE commands and slows down the mount
process.

This patch introduces a zone info cache in struct
btrfs_zoned_device_info. The cache is populated while in
btrfs_get_dev_zone_info() and used for
btrfs_load_block_group_zone_info() to reduce the number of REPORT ZONE
commands. The zone cache is then released after loading the block
groups, as it would not be very effective at run time.

Benchmark: Mount an HDD with 57,007 block groups
Before patch: 171.368 seconds
After patch: 64.064 seconds

While it still takes a minute due to the slowness of loading all the
block groups, the patch reduces the mount time to about 1/3 of what it
was.

Link: https://lore.kernel.org/linux-btrfs/CAHQ7scUiLtcTqZOMMY5kbWUBOhGRwKo6J6wYPT5WY+C=cD49nQ@mail.gmail.com/
Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices")
CC: stable@vger.kernel.org
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dev-replace.c |  2 +-
 fs/btrfs/disk-io.c     |  2 ++
 fs/btrfs/volumes.c     |  2 +-
 fs/btrfs/zoned.c       | 86 ++++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/zoned.h       |  8 +++--
 5 files changed, 87 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 82769f1c17ee..66fa61cb3f23 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -322,7 +322,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
 	device->fs_devices = fs_info->fs_devices;
 
-	ret = btrfs_get_dev_zone_info(device);
+	ret = btrfs_get_dev_zone_info(device, false);
 	if (ret)
 		goto error;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6408948b3e2c..67533b13e1eb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3570,6 +3570,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		goto fail_sysfs;
 	}
 
+	btrfs_free_zone_cache(fs_info);
+
 	if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
 	    !btrfs_check_rw_degradable(fs_info, NULL)) {
 		btrfs_warn(fs_info,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 53753e04af14..cafd490da072 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2669,7 +2669,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	device->fs_info = fs_info;
 	device->bdev = bdev;
 
-	ret = btrfs_get_dev_zone_info(device);
+	ret = btrfs_get_dev_zone_info(device, false);
 	if (ret)
 		goto error_free_device;
 
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 678a29469511..b06059a5db2a 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -5,6 +5,7 @@
 #include <linux/blkdev.h>
 #include <linux/sched/mm.h>
 #include <linux/atomic.h>
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "volumes.h"
 #include "zoned.h"
@@ -213,6 +214,8 @@ static int emulate_report_zones(struct btrfs_device *device, u64 pos,
 static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 			       struct blk_zone *zones, unsigned int *nr_zones)
 {
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	u32 zno;
 	int ret;
 
 	if (!*nr_zones)
@@ -224,6 +227,34 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 		return 0;
 	}
 
+	/* Check cache */
+	if (zinfo->zone_cache) {
+		unsigned int i;
+
+		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+		zno = pos >> zinfo->zone_size_shift;
+		/*
+		 * We cannot report zones beyond the zone end. So, it is OK to
+		 * cap *nr_zones to at the end.
+		 */
+		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
+
+		for (i = 0; i < *nr_zones; i++) {
+			struct blk_zone *zone_info;
+
+			zone_info = &zinfo->zone_cache[zno + i];
+			if (!zone_info->len)
+				break;
+		}
+
+		if (i == *nr_zones) {
+			/* Cache hit on all the zones */
+			memcpy(zones, zinfo->zone_cache + zno,
+			       sizeof(*zinfo->zone_cache) * *nr_zones);
+			return 0;
+		}
+	}
+
 	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
 				  copy_zone_info_cb, zones);
 	if (ret < 0) {
@@ -237,6 +268,11 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 	if (!ret)
 		return -EIO;
 
+	/* Populate cache */
+	if (zinfo->zone_cache)
+		memcpy(zinfo->zone_cache + zno, zones,
+		       sizeof(*zinfo->zone_cache) * *nr_zones);
+
 	return 0;
 }
 
@@ -300,7 +336,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
 		if (!device->bdev)
 			continue;
 
-		ret = btrfs_get_dev_zone_info(device);
+		ret = btrfs_get_dev_zone_info(device, true);
 		if (ret)
 			break;
 	}
@@ -309,7 +345,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
 	return ret;
 }
 
-int btrfs_get_dev_zone_info(struct btrfs_device *device)
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
 	struct btrfs_zoned_device_info *zone_info = NULL;
@@ -339,6 +375,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 	if (!zone_info)
 		return -ENOMEM;
 
+	device->zone_info = zone_info;
+
 	if (!bdev_is_zoned(bdev)) {
 		if (!fs_info->zone_size) {
 			ret = calculate_emulated_zone_size(fs_info);
@@ -407,6 +445,23 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 		goto out;
 	}
 
+	/*
+	 * Enable zone cache only for a zoned device. On a non-zoned device, we
+	 * fill the zone info with emulated CONVENTIONAL zones, so no need to
+	 * use the cache.
+	 */
+	if (populate_cache && bdev_is_zoned(device->bdev)) {
+		zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
+						zone_info->nr_zones);
+		if (!zone_info->zone_cache) {
+			btrfs_err_in_rcu(device->fs_info,
+				"zoned: failed to allocate zone cache for %s",
+				rcu_str_deref(device->name));
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
 	/* Get zones type */
 	nactive = 0;
 	while (sector < nr_sectors) {
@@ -505,8 +560,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 
 	kfree(zones);
 
-	device->zone_info = zone_info;
-
 	switch (bdev_zoned_model(bdev)) {
 	case BLK_ZONED_HM:
 		model = "host-managed zoned";
@@ -539,11 +592,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 out:
 	kfree(zones);
 out_free_zone_info:
-	bitmap_free(zone_info->active_zones);
-	bitmap_free(zone_info->empty_zones);
-	bitmap_free(zone_info->seq_zones);
-	kfree(zone_info);
-	device->zone_info = NULL;
+	btrfs_destroy_dev_zone_info(device);
 
 	return ret;
 }
@@ -558,6 +607,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
 	bitmap_free(zone_info->active_zones);
 	bitmap_free(zone_info->seq_zones);
 	bitmap_free(zone_info->empty_zones);
+	vfree(zone_info->zone_cache);
 	kfree(zone_info);
 	device->zone_info = NULL;
 }
@@ -1975,3 +2025,21 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
 		fs_info->data_reloc_bg = 0;
 	spin_unlock(&fs_info->relocation_bg_lock);
 }
+
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+
+	if (!btrfs_is_zoned(fs_info))
+		return;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		if (device->zone_info) {
+			vfree(device->zone_info->zone_cache);
+			device->zone_info->zone_cache = NULL;
+		}
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index e53ab7b96437..4344f4818389 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -28,6 +28,7 @@ struct btrfs_zoned_device_info {
 	unsigned long *seq_zones;
 	unsigned long *empty_zones;
 	unsigned long *active_zones;
+	struct blk_zone *zone_cache;
 	struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
 };
 
@@ -35,7 +36,7 @@ struct btrfs_zoned_device_info {
 int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 		       struct blk_zone *zone);
 int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
-int btrfs_get_dev_zone_info(struct btrfs_device *device);
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
 void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
 int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
@@ -76,6 +77,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
 void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
 			     u64 length);
 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 				     struct blk_zone *zone)
@@ -88,7 +90,8 @@ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_i
 	return 0;
 }
 
-static inline int btrfs_get_dev_zone_info(struct btrfs_device *device)
+static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
+					  bool populate_cache)
 {
 	return 0;
 }
@@ -232,6 +235,7 @@ static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
 
 static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
 
+static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
-- 
cgit v1.2.3-59-g8ed1b


From 9f35f76d7df66507b698ac7298fb6c175a6aae7d Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 9 Nov 2021 10:12:01 -0500
Subject: btrfs: handle priority ticket failures in their respective helpers

Currently the error case for the priority tickets is handled where we
deal with all of the tickets, priority and non-priority.  This is OK in
general, but it makes for some awkward locking.  We take and drop the
space_info->lock back to back because of these different types of
tickets.

Rework the code to handle priority ticket failures in their respective
helpers.  This allows us to be less wonky with our space_info->lock
usage, and means that the main handler simply has to check
ticket->error, as the ticket is guaranteed to be off any list and
completely handled by the time it exits one of the handlers.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/space-info.c | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 48d77f360a24..9d6048f54097 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1260,7 +1260,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 				int states_nr)
 {
 	u64 to_reclaim;
-	int flush_state;
+	int flush_state = 0;
 
 	spin_lock(&space_info->lock);
 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
@@ -1268,10 +1268,9 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 		spin_unlock(&space_info->lock);
 		return;
 	}
-	spin_unlock(&space_info->lock);
 
-	flush_state = 0;
-	do {
+	while (flush_state < states_nr) {
+		spin_unlock(&space_info->lock);
 		flush_space(fs_info, space_info, to_reclaim, states[flush_state],
 			    false);
 		flush_state++;
@@ -1280,23 +1279,38 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 			spin_unlock(&space_info->lock);
 			return;
 		}
-		spin_unlock(&space_info->lock);
-	} while (flush_state < states_nr);
+	}
+
+	/*
+	 * We must run try_granting_tickets here because we could be a large
+	 * ticket in front of a smaller ticket that can now be satisfied with
+	 * the available space.
+	 */
+	ticket->error = -ENOSPC;
+	remove_ticket(space_info, ticket);
+	btrfs_try_granting_tickets(fs_info, space_info);
+	spin_unlock(&space_info->lock);
 }
 
 static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
 					struct btrfs_space_info *space_info,
 					struct reserve_ticket *ticket)
 {
+	spin_lock(&space_info->lock);
 	while (!space_info->full) {
+		spin_unlock(&space_info->lock);
 		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
 		spin_lock(&space_info->lock);
 		if (ticket->bytes == 0) {
 			spin_unlock(&space_info->lock);
 			return;
 		}
-		spin_unlock(&space_info->lock);
 	}
+
+	ticket->error = -ENOSPC;
+	remove_ticket(space_info, ticket);
+	btrfs_try_granting_tickets(fs_info, space_info);
+	spin_unlock(&space_info->lock);
 }
 
 static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
@@ -1378,25 +1392,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
 		break;
 	}
 
-	spin_lock(&space_info->lock);
 	ret = ticket->error;
-	if (ticket->bytes || ticket->error) {
-		/*
-		 * We were a priority ticket, so we need to delete ourselves
-		 * from the list.  Because we could have other priority tickets
-		 * behind us that require less space, run
-		 * btrfs_try_granting_tickets() to see if their reservations can
-		 * now be made.
-		 */
-		if (!list_empty(&ticket->list)) {
-			remove_ticket(space_info, ticket);
-			btrfs_try_granting_tickets(fs_info, space_info);
-		}
-
-		if (!ret)
-			ret = -ENOSPC;
-	}
-	spin_unlock(&space_info->lock);
 	ASSERT(list_empty(&ticket->list));
 	/*
 	 * Check that we can't have an error set if the reservation succeeded,
-- 
cgit v1.2.3-59-g8ed1b


From 9cd8dcdc5e5c459bcd01a97c8c8d6bc2c8d73fdf Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 9 Nov 2021 10:12:02 -0500
Subject: btrfs: check for priority ticket granting before flushing

Since we're dropping locks before we enter the priority flushing loops
we could have had our ticket granted before we got the space_info->lock.
So add this check to avoid doing some extra flushing in the priority
flushing cases.

The case in priority_reclaim_metadata_space() is an optimization.
Consider that we came in to reserve, we didn't have the space, so we
added our ticket to the list.  But at the same time somebody was waiting
on the space_info lock to add space and call
btrfs_try_granting_tickets(), so we drop the lock, our ticket gets
satisfied, and by the time we come in to do our loop we have already
been satisfied.

This is the priority reclaim path, so to_reclaim could be !0 still
because we may have only satisfied the priority tickets and still left
non priority tickets on the list.  We would then have to_reclaim but
->bytes == 0.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ add note about the optimization ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/space-info.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 9d6048f54097..dd6509795532 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1264,7 +1264,13 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 
 	spin_lock(&space_info->lock);
 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
-	if (!to_reclaim) {
+	/*
+	 * This is the priority reclaim path, so to_reclaim could be >0 still
+	 * because we may have only satisified the priority tickets and still
+	 * left non priority tickets on the list.  We would then have
+	 * to_reclaim but ->bytes == 0.
+	 */
+	if (ticket->bytes == 0) {
 		spin_unlock(&space_info->lock);
 		return;
 	}
@@ -1297,6 +1303,13 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
 					struct reserve_ticket *ticket)
 {
 	spin_lock(&space_info->lock);
+
+	/* We could have been granted before we got here. */
+	if (ticket->bytes == 0) {
+		spin_unlock(&space_info->lock);
+		return;
+	}
+
 	while (!space_info->full) {
 		spin_unlock(&space_info->lock);
 		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
-- 
cgit v1.2.3-59-g8ed1b


From 1b0309eaa426242e168cf6a51dd707962d81578b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 9 Nov 2021 10:12:03 -0500
Subject: btrfs: check ticket->steal in steal_from_global_rsv

We're going to use this helper in the priority flushing loop, move this
check into the helper to simplify the logic.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/space-info.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index dd6509795532..20f6c0858410 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -844,6 +844,9 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	u64 min_bytes;
 
+	if (!ticket->steal)
+		return false;
+
 	if (global_rsv->space_info != space_info)
 		return false;
 
@@ -899,8 +902,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
 		ticket = list_first_entry(&space_info->tickets,
 					  struct reserve_ticket, list);
 
-		if (!aborted && ticket->steal &&
-		    steal_from_global_rsv(fs_info, space_info, ticket))
+		if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
 			return true;
 
 		if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
-- 
cgit v1.2.3-59-g8ed1b


From ee6adbfd6a2c15a71fb26d9321c97bef09ae0534 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 9 Nov 2021 10:12:04 -0500
Subject: btrfs: make BTRFS_RESERVE_FLUSH_EVICT use the global rsv stealing
 code

I forgot to convert this over when I introduced the global reserve
stealing code to the space flushing code.  Evict was simply trying to
make its reservation and then if it failed it would steal from the
global rsv, which is racy because it's outside of the normal ticketing
code.

Fix this by setting ticket->steal if we are BTRFS_RESERVE_FLUSH_EVICT,
and then make the priority flushing path do the steal for us.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c      | 15 ++++++---------
 fs/btrfs/space-info.c | 16 +++++++++++++---
 2 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e87c6cccf045..0b89704f7e21 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5523,7 +5523,6 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 							struct btrfs_block_rsv *rsv)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	struct btrfs_trans_handle *trans;
 	u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
 	int ret;
@@ -5538,18 +5537,16 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 	 * above.  We reserve our extra bit here because we generate a ton of
 	 * delayed refs activity by truncating.
 	 *
-	 * If we cannot make our reservation we'll attempt to steal from the
-	 * global reserve, because we really want to be able to free up space.
+	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
+	 * if we fail to make this reservation we can re-try without the
+	 * delayed_refs_extra so we can make some forward progress.
 	 */
 	ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
 				     BTRFS_RESERVE_FLUSH_EVICT);
 	if (ret) {
-		/*
-		 * Try to steal from the global reserve if there is space for
-		 * it.
-		 */
-		if (btrfs_check_space_for_delayed_refs(fs_info) ||
-		    btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
+		ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
+					     BTRFS_RESERVE_FLUSH_EVICT);
+		if (ret) {
 			btrfs_warn(fs_info,
 				   "could not allocate space for delete; will truncate on mount");
 			return ERR_PTR(-ENOSPC);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 20f6c0858410..a546cd411de4 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1289,13 +1289,17 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 		}
 	}
 
+	/* Attempt to steal from the global rsv if we can. */
+	if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
+		ticket->error = -ENOSPC;
+		remove_ticket(space_info, ticket);
+	}
+
 	/*
 	 * We must run try_granting_tickets here because we could be a large
 	 * ticket in front of a smaller ticket that can now be satisfied with
 	 * the available space.
 	 */
-	ticket->error = -ENOSPC;
-	remove_ticket(space_info, ticket);
 	btrfs_try_granting_tickets(fs_info, space_info);
 	spin_unlock(&space_info->lock);
 }
@@ -1449,6 +1453,12 @@ static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
 		space_info->clamp = min(space_info->clamp + 1, 8);
 }
 
+static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
+{
+	return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+		flush == BTRFS_RESERVE_FLUSH_EVICT);
+}
+
 /**
  * Try to reserve bytes from the block_rsv's space
  *
@@ -1522,7 +1532,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
 		ticket.error = 0;
 		space_info->reclaim_size += ticket.bytes;
 		init_waitqueue_head(&ticket.wait);
-		ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
+		ticket.steal = can_steal(flush);
 		if (trace_btrfs_reserve_ticket_enabled())
 			start_ns = ktime_get_ns();
 
-- 
cgit v1.2.3-59-g8ed1b


From 6dbdd578cd4fcb88ad9e57777a140e3349323e28 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 9 Nov 2021 10:12:05 -0500
Subject: btrfs: remove global rsv stealing logic for orphan cleanup

This is very old code from before we were stealing from the global reserve
during evict.  We have proper ways to steal from the global reserve
while we're evicting, so rip out this code as it's no longer necessary.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/space-info.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index a546cd411de4..d277c7111b95 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1606,16 +1606,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
 				 enum btrfs_reserve_flush_enum flush)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	int ret;
 
 	ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
-	if (ret == -ENOSPC &&
-	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
-		if (block_rsv != global_rsv &&
-		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
-			ret = 0;
-	}
 	if (ret == -ENOSPC) {
 		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
 					      block_rsv->space_info->flags,
-- 
cgit v1.2.3-59-g8ed1b


From 54230013d41fb6eb231755dfec8615db9dcfb020 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 9 Nov 2021 10:12:06 -0500
Subject: btrfs: get rid of root->orphan_cleanup_state

Now that we don't care about the stage of the orphan_cleanup_state,
simply replace it with a bit on ->state to make sure we don't call the
orphan cleanup every time we wander into this root.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h   | 9 ++-------
 fs/btrfs/disk-io.c | 1 -
 fs/btrfs/inode.c   | 4 +---
 3 files changed, 3 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1f2a50c22b61..004cf3f67ce5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -511,11 +511,6 @@ struct btrfs_discard_ctl {
 	atomic64_t discard_bytes_saved;
 };
 
-enum btrfs_orphan_cleanup_state {
-	ORPHAN_CLEANUP_STARTED	= 1,
-	ORPHAN_CLEANUP_DONE	= 2,
-};
-
 void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
 
 /* fs_info */
@@ -1110,6 +1105,8 @@ enum {
 	BTRFS_ROOT_HAS_LOG_TREE,
 	/* Qgroup flushing is in progress */
 	BTRFS_ROOT_QGROUP_FLUSHING,
+	/* We started the orphan cleanup for this root. */
+	BTRFS_ROOT_ORPHAN_CLEANUP,
 };
 
 /*
@@ -1178,8 +1175,6 @@ struct btrfs_root {
 	spinlock_t log_extents_lock[2];
 	struct list_head logged_list[2];
 
-	int orphan_cleanup_state;
-
 	spinlock_t inode_lock;
 	/* red-black tree that keeps track of in-memory inodes */
 	struct rb_root inode_tree;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 67533b13e1eb..d94a1ca856aa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1144,7 +1144,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->node = NULL;
 	root->commit_root = NULL;
 	root->state = 0;
-	root->orphan_cleanup_state = 0;
 
 	root->last_trans = 0;
 	root->free_objectid = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0b89704f7e21..b4ef478e0de3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3475,7 +3475,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 	u64 last_objectid = 0;
 	int ret = 0, nr_unlink = 0;
 
-	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
+	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
 		return 0;
 
 	path = btrfs_alloc_path();
@@ -3633,8 +3633,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 	/* release the path since we're done with it */
 	btrfs_release_path(path);
 
-	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
-
 	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
 		trans = btrfs_join_transaction(root);
 		if (!IS_ERR(trans))
-- 
cgit v1.2.3-59-g8ed1b


From 9270501c163b67f48fd5fb2f98b1b48d6530f460 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 9 Nov 2021 10:12:07 -0500
Subject: btrfs: change root to fs_info for btrfs_reserve_metadata_bytes

We used to need the root for btrfs_reserve_metadata_bytes to check the
orphan cleanup state, but we no longer need that, we simply need the
fs_info.  Change btrfs_reserve_metadata_bytes() to use the fs_info, and
change both btrfs_block_rsv_refill() and btrfs_block_rsv_add() to do the
same as they simply call btrfs_reserve_metadata_bytes() and then
manipulate the block_rsv that is being used.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c    |  2 +-
 fs/btrfs/block-rsv.c      | 10 +++++-----
 fs/btrfs/block-rsv.h      |  4 ++--
 fs/btrfs/delalloc-space.c |  2 +-
 fs/btrfs/delayed-inode.c  |  2 +-
 fs/btrfs/delayed-ref.c    |  3 +--
 fs/btrfs/inode.c          |  4 ++--
 fs/btrfs/props.c          |  5 +++--
 fs/btrfs/relocation.c     | 17 +++++++++--------
 fs/btrfs/root-tree.c      |  2 +-
 fs/btrfs/space-info.c     |  3 +--
 fs/btrfs/space-info.h     |  2 +-
 fs/btrfs/transaction.c    |  4 ++--
 13 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6ab864655090..d56fc1b8bb99 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -3790,7 +3790,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
 	}
 
 	if (!ret) {
-		ret = btrfs_block_rsv_add(fs_info->chunk_root,
+		ret = btrfs_block_rsv_add(fs_info,
 					  &fs_info->chunk_block_rsv,
 					  bytes, BTRFS_RESERVE_NO_FLUSH);
 		if (!ret)
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 04a6226e0388..9d162b5adfd8 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -208,7 +208,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
 	kfree(rsv);
 }
 
-int btrfs_block_rsv_add(struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 			enum btrfs_reserve_flush_enum flush)
 {
@@ -217,7 +217,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 	if (num_bytes == 0)
 		return 0;
 
-	ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
 	if (!ret)
 		btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
 
@@ -241,7 +241,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
 	return ret;
 }
 
-int btrfs_block_rsv_refill(struct btrfs_root *root,
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
 			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
 			   enum btrfs_reserve_flush_enum flush)
 {
@@ -262,7 +262,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 	if (!ret)
 		return 0;
 
-	ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
 	if (!ret) {
 		btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
 		return 0;
@@ -523,7 +523,7 @@ again:
 				block_rsv->type, ret);
 	}
 try_reserve:
-	ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
+	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
 					   BTRFS_RESERVE_NO_FLUSH);
 	if (!ret)
 		return block_rsv;
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 0b6ae5302837..07d61c2c5d28 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -57,11 +57,11 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
 				   unsigned short type);
 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
 			  struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 			enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
-int btrfs_block_rsv_refill(struct btrfs_root *root,
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
 			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
 			   enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 40c4d6ba3fb9..fb46a28f5065 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -334,7 +334,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
 	if (ret)
 		return ret;
-	ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
+	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
 	if (ret) {
 		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
 		return ret;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index e164766dcc38..6f134f2c5e68 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -629,7 +629,7 @@ static int btrfs_delayed_inode_reserve_metadata(
 					  BTRFS_QGROUP_RSV_META_PREALLOC, true);
 		if (ret < 0)
 			return ret;
-		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+		ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
 					  BTRFS_RESERVE_NO_FLUSH);
 		/* NO_FLUSH could only fail with -ENOSPC */
 		ASSERT(ret == 0 || ret == -ENOSPC);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cca7e85e32dd..da9d20813147 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -191,8 +191,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 	if (!num_bytes)
 		return 0;
 
-	ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
-					   num_bytes, flush);
+	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
 	if (ret)
 		return ret;
 	btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b4ef478e0de3..91f7ed27e421 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5539,10 +5539,10 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 	 * if we fail to make this reservation we can re-try without the
 	 * delayed_refs_extra so we can make some forward progress.
 	 */
-	ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
+	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
 				     BTRFS_RESERVE_FLUSH_EVICT);
 	if (ret) {
-		ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
+		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
 					     BTRFS_RESERVE_FLUSH_EVICT);
 		if (ret) {
 			btrfs_warn(fs_info,
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index a978676aa627..1a6d2d5b4b33 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -377,8 +377,9 @@ static int inherit_props(struct btrfs_trans_handle *trans,
 		 */
 		if (need_reserve) {
 			num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
-			ret = btrfs_block_rsv_add(root, trans->block_rsv,
-					num_bytes, BTRFS_RESERVE_NO_FLUSH);
+			ret = btrfs_block_rsv_add(fs_info, trans->block_rsv,
+						  num_bytes,
+						  BTRFS_RESERVE_NO_FLUSH);
 			if (ret)
 				return ret;
 		}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ee0a0efc7efd..a455a1ead0d6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1736,7 +1736,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	memset(&next_key, 0, sizeof(next_key));
 
 	while (1) {
-		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
+		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
+					     min_reserved,
 					     BTRFS_RESERVE_FLUSH_LIMIT);
 		if (ret)
 			goto out;
@@ -1855,7 +1856,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
 again:
 	if (!err) {
 		num_bytes = rc->merging_rsv_size;
-		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
 					  BTRFS_RESERVE_FLUSH_ALL);
 		if (ret)
 			err = ret;
@@ -2323,8 +2324,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
 	 * If we get an enospc just kick back -EAGAIN so we know to drop the
 	 * transaction and try to refill when we can flush all the things.
 	 */
-	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
-				BTRFS_RESERVE_FLUSH_LIMIT);
+	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
+				     BTRFS_RESERVE_FLUSH_LIMIT);
 	if (ret) {
 		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
 		while (tmp <= rc->reserved_bytes)
@@ -3550,7 +3551,7 @@ int prepare_to_relocate(struct reloc_control *rc)
 	rc->reserved_bytes = 0;
 	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
 			      RELOCATION_RESERVED_NODES;
-	ret = btrfs_block_rsv_refill(rc->extent_root,
+	ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
 				     rc->block_rsv, rc->block_rsv->size,
 				     BTRFS_RESERVE_FLUSH_ALL);
 	if (ret)
@@ -3598,9 +3599,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 
 	while (1) {
 		rc->reserved_bytes = 0;
-		ret = btrfs_block_rsv_refill(rc->extent_root,
-					rc->block_rsv, rc->block_rsv->size,
-					BTRFS_RESERVE_FLUSH_ALL);
+		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
+					     rc->block_rsv->size,
+					     BTRFS_RESERVE_FLUSH_ALL);
 		if (ret) {
 			err = ret;
 			break;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 3297368aa359..3d68d2dcd83e 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -502,7 +502,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 	num_bytes = btrfs_calc_insert_metadata_size(fs_info, items);
 	rsv->space_info = btrfs_find_space_info(fs_info,
 					    BTRFS_BLOCK_GROUP_METADATA);
-	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+	ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes,
 				  BTRFS_RESERVE_FLUSH_ALL);
 
 	if (ret == -ENOSPC && use_global_rsv)
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d277c7111b95..dc4b4b0372d1 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1600,12 +1600,11 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
  * regain reservations will be made and this will fail if there is not enough
  * space already.
  */
-int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 				 struct btrfs_block_rsv *block_rsv,
 				 u64 orig_bytes,
 				 enum btrfs_reserve_flush_enum flush)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 
 	ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index cb5056472e79..d841fed73492 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -123,7 +123,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
 			   struct btrfs_space_info *info, u64 bytes,
 			   int dump_block_groups);
-int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 				 struct btrfs_block_rsv *block_rsv,
 				 u64 orig_bytes,
 				 enum btrfs_reserve_flush_enum flush);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index eb1c916d4884..8b6a90fafcd4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -634,7 +634,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 			reloc_reserved = true;
 		}
 
-		ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
+		ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush);
 		if (ret)
 			goto reserve_fail;
 		if (delayed_refs_bytes) {
@@ -1597,7 +1597,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_reloc_pre_snapshot(pending, &to_reserve);
 
 	if (to_reserve > 0) {
-		pending->error = btrfs_block_rsv_add(root,
+		pending->error = btrfs_block_rsv_add(fs_info,
 						     &pending->block_rsv,
 						     to_reserve,
 						     BTRFS_RESERVE_NO_FLUSH);
-- 
cgit v1.2.3-59-g8ed1b


From 83f1b68002c208329412cf9f998c90b3326828d2 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 12 Nov 2021 13:33:14 +0800
Subject: btrfs: remove unnecessary @nr_written parameters

We use @nr_written to record how many pages have been started by
btrfs_run_delalloc_range().

Currently there are only two cases that would populate @nr_written:

- Inline extent creation
- Compressed write

But both cases will also set @page_started to one.

In fact, in writepage_delalloc() we have the following code, showing
that @nr_written is really only utilized for above two cases:

	/* did the fill delalloc function already unlock and start
	 * the IO?
	 */
	if (page_started) {
		/*
		 * we've unlocked the page, so we can't update
		 * the mapping's writeback index, just update
		 * nr_to_write.
		 */
		wbc->nr_to_write -= nr_written;
		return 1;
	}

But for such cases, writepage_delalloc() will return 1, and exit
__extent_writepage() without going through __extent_writepage_io().

Thus this means, inside __extent_writepage_io(), we always get
@nr_written as 0.

So this patch is going to remove the unnecessary parameter from the
following functions:

- writepage_delalloc()

  As @nr_written passed in is always the initial value 0.

  Although inside that function, we still need a local @nr_written
  to update wbc->nr_to_write.

- __extent_writepage_io()

  As explained above, @nr_written passed in can only be 0.

  This also means we can remove one update_nr_written() call.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9234d96a7fd5..dee4f043c7d8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3785,12 +3785,13 @@ static void update_nr_written(struct writeback_control *wbc,
  * This returns < 0 if there were errors (page still locked)
  */
 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
-		struct page *page, struct writeback_control *wbc,
-		unsigned long *nr_written)
+		struct page *page, struct writeback_control *wbc)
 {
 	const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
 	u64 delalloc_start = page_offset(page);
 	u64 delalloc_to_write = 0;
+	/* How many pages are started by btrfs_run_delalloc_range() */
+	unsigned long nr_written = 0;
 	int ret;
 	int page_started = 0;
 
@@ -3806,7 +3807,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 			continue;
 		}
 		ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
-				delalloc_end, &page_started, nr_written, wbc);
+				delalloc_end, &page_started, &nr_written, wbc);
 		if (ret) {
 			btrfs_page_set_error(inode->root->fs_info, page,
 					     page_offset(page), PAGE_SIZE);
@@ -3829,16 +3830,13 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 					 thresh);
 	}
 
-	/* did the fill delalloc function already unlock and start
-	 * the IO?
-	 */
+	/* Did btrfs_run_dealloc_range() already unlock and start the IO? */
 	if (page_started) {
 		/*
-		 * we've unlocked the page, so we can't update
-		 * the mapping's writeback index, just update
-		 * nr_to_write.
+		 * We've unlocked the page, so we can't update the mapping's
+		 * writeback index, just update nr_to_write.
 		 */
-		wbc->nr_to_write -= *nr_written;
+		wbc->nr_to_write -= nr_written;
 		return 1;
 	}
 
@@ -3910,7 +3908,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 				 struct writeback_control *wbc,
 				 struct extent_page_data *epd,
 				 loff_t i_size,
-				 unsigned long nr_written,
 				 int *nr_ret)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -3929,7 +3926,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 	if (ret) {
 		/* Fixup worker will requeue */
 		redirty_page_for_writepage(wbc, page);
-		update_nr_written(wbc, nr_written);
 		unlock_page(page);
 		return 1;
 	}
@@ -3938,7 +3934,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 	 * we don't want to touch the inode after unlocking the page,
 	 * so we update the mapping writeback index now
 	 */
-	update_nr_written(wbc, nr_written + 1);
+	update_nr_written(wbc, 1);
 
 	while (cur <= end) {
 		u64 disk_bytenr;
@@ -4076,7 +4072,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	size_t pg_offset;
 	loff_t i_size = i_size_read(inode);
 	unsigned long end_index = i_size >> PAGE_SHIFT;
-	unsigned long nr_written = 0;
 
 	trace___extent_writepage(page, inode, wbc);
 
@@ -4105,7 +4100,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	}
 
 	if (!epd->extent_locked) {
-		ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written);
+		ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
 		if (ret == 1)
 			return 0;
 		if (ret)
@@ -4113,7 +4108,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	}
 
 	ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
-				    nr_written, &nr);
+				    &nr);
 	if (ret == 1)
 		return 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From 950575c023aabfeac506cae02917c32eae1f553e Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 18 Nov 2021 16:33:14 -0500
Subject: btrfs: only use ->max_extent_size if it is set in the bitmap

While adding self tests for my space index change I was hitting a
problem where the space indexed tree wasn't returning the expected
->max_extent_size.  This is because we will skip searching any entry
that doesn't have ->bytes >= the amount of bytes we want.  However we'll
still set the max_extent_size based on that entry.  The problem is if we
don't search the bitmap we won't have ->max_extent_size set properly, so
we can't really trust it.

This doesn't really result in a problem per se, it can just result in us
not finding a contiguous area that may exist.  Fix the max_extent_size
helper to return ->bytes if ->max_extent_size isn't set, and add a big
comment explaining why we're doing this.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f3fee88c8ee0..543394acec44 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1870,9 +1870,33 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 	return -1;
 }
 
+/*
+ * This is a little subtle.  We *only* have ->max_extent_size set if we actually
+ * searched through the bitmap and figured out the largest ->max_extent_size,
+ * otherwise it's 0.  In the case that it's 0 we don't want to tell the
+ * allocator the wrong thing, we want to use the actual real max_extent_size
+ * we've found already if it's larger, or we want to use ->bytes.
+ *
+ * This matters because find_free_space() will skip entries who's ->bytes is
+ * less than the required bytes.  So if we didn't search down this bitmap, we
+ * may pick some previous entry that has a smaller ->max_extent_size than we
+ * have.  For example, assume we have two entries, one that has
+ * ->max_extent_size set to 4k and ->bytes set to 1M.  A second entry hasn't set
+ * ->max_extent_size yet, has ->bytes set to 8k and it's contiguous.  We will
+ *  call into find_free_space(), and return with max_extent_size == 4k, because
+ *  that first bitmap entry had ->max_extent_size set, but the second one did
+ *  not.  If instead we returned 8k we'd come in searching for 8k, and find the
+ *  8k contiguous range.
+ *
+ *  Consider the other case, we have 2 8k chunks in that second entry and still
+ *  don't have ->max_extent_size set.  We'll return 16k, and the next time the
+ *  allocator comes in it'll fully search our second bitmap, and this time it'll
+ *  get an uptodate value of 8k as the maximum chunk size.  Then we'll get the
+ *  right allocation the next loop through.
+ */
 static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
 {
-	if (entry->bitmap)
+	if (entry->bitmap && entry->max_extent_size)
 		return entry->max_extent_size;
 	return entry->bytes;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 59c7b566a3b653fe7865cef007c053fd88de8317 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 18 Nov 2021 16:33:15 -0500
Subject: btrfs: index free space entries on size

Currently we index free space on offset only, because usually we have a
hint from the allocator that we want to honor for locality reasons.
However if we fail to use this hint we have to go back to a brute force
search through the free space entries to find a large enough extent.

With sufficiently fragmented free space this becomes quite expensive, as
we have to linearly search all of the free space entries to find if we
have a part that's long enough.

To fix this add a cached rb tree to index based on free space entry
bytes.  This will allow us to quickly look up the largest chunk in the
free space tree for this block group, and stop searching once we've
found an entry that is too small to satisfy our allocation.  We simply
choose to use this tree if we're searching from the beginning of the
block group, as we know we do not care about locality at that point.

I wrote an allocator test that creates a 10TiB ram backed null block
device and then fallocates random files until the file system is full.
I then go through and delete all of the odd files.  Then I spawn 8
threads that fallocate 64MiB files (1/2 our extent size cap) until the
file system is full again.  I use bcc's funclatency to measure the
latency of find_free_extent.  The baseline results are

     nsecs               : count     distribution
         0 -> 1          : 0        |                                        |
         2 -> 3          : 0        |                                        |
         4 -> 7          : 0        |                                        |
         8 -> 15         : 0        |                                        |
        16 -> 31         : 0        |                                        |
        32 -> 63         : 0        |                                        |
        64 -> 127        : 0        |                                        |
       128 -> 255        : 0        |                                        |
       256 -> 511        : 10356    |****                                    |
       512 -> 1023       : 58242    |*************************               |
      1024 -> 2047       : 74418    |********************************        |
      2048 -> 4095       : 90393    |****************************************|
      4096 -> 8191       : 79119    |***********************************     |
      8192 -> 16383      : 35614    |***************                         |
     16384 -> 32767      : 13418    |*****                                   |
     32768 -> 65535      : 12811    |*****                                   |
     65536 -> 131071     : 17090    |*******                                 |
    131072 -> 262143     : 26465    |***********                             |
    262144 -> 524287     : 40179    |*****************                       |
    524288 -> 1048575    : 55469    |************************                |
   1048576 -> 2097151    : 48807    |*********************                   |
   2097152 -> 4194303    : 26744    |***********                             |
   4194304 -> 8388607    : 35351    |***************                         |
   8388608 -> 16777215   : 13918    |******                                  |
  16777216 -> 33554431   : 21       |                                        |

avg = 908079 nsecs, total: 580889071441 nsecs, count: 639690

And the patch results are

     nsecs               : count     distribution
         0 -> 1          : 0        |                                        |
         2 -> 3          : 0        |                                        |
         4 -> 7          : 0        |                                        |
         8 -> 15         : 0        |                                        |
        16 -> 31         : 0        |                                        |
        32 -> 63         : 0        |                                        |
        64 -> 127        : 0        |                                        |
       128 -> 255        : 0        |                                        |
       256 -> 511        : 6883     |**                                      |
       512 -> 1023       : 54346    |*********************                   |
      1024 -> 2047       : 79170    |********************************        |
      2048 -> 4095       : 98890    |****************************************|
      4096 -> 8191       : 81911    |*********************************       |
      8192 -> 16383      : 27075    |**********                              |
     16384 -> 32767      : 14668    |*****                                   |
     32768 -> 65535      : 13251    |*****                                   |
     65536 -> 131071     : 15340    |******                                  |
    131072 -> 262143     : 26715    |**********                              |
    262144 -> 524287     : 43274    |*****************                       |
    524288 -> 1048575    : 53870    |*********************                   |
   1048576 -> 2097151    : 55368    |**********************                  |
   2097152 -> 4194303    : 41036    |****************                        |
   4194304 -> 8388607    : 24927    |**********                              |
   8388608 -> 16777215   : 33       |                                        |
  16777216 -> 33554431   : 9        |                                        |

avg = 623599 nsecs, total: 397259314759 nsecs, count: 637042

There's a little variation in the number of calls done because of timing
of the threads with metadata requirements, but the avg, total, and
counts are relatively consistent between runs (usually within 2-5% of
each other).  As you can see here we have around a 30% decrease in
average latency with a 30% decrease in overall time spent in
find_free_extent.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 181 +++++++++++++++++++++++++++++++++-----------
 fs/btrfs/free-space-cache.h |   2 +
 2 files changed, 139 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 543394acec44..3920beb02131 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1580,6 +1580,50 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
 	return 0;
 }
 
+/*
+ * This is a little subtle.  We *only* have ->max_extent_size set if we actually
+ * searched through the bitmap and figured out the largest ->max_extent_size,
+ * otherwise it's 0.  In the case that it's 0 we don't want to tell the
+ * allocator the wrong thing, we want to use the actual real max_extent_size
+ * we've found already if it's larger, or we want to use ->bytes.
+ *
+ * This matters because find_free_space() will skip entries who's ->bytes is
+ * less than the required bytes.  So if we didn't search down this bitmap, we
+ * may pick some previous entry that has a smaller ->max_extent_size than we
+ * have.  For example, assume we have two entries, one that has
+ * ->max_extent_size set to 4K and ->bytes set to 1M.  A second entry hasn't set
+ * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.  We will
+ *  call into find_free_space(), and return with max_extent_size == 4K, because
+ *  that first bitmap entry had ->max_extent_size set, but the second one did
+ *  not.  If instead we returned 8K we'd come in searching for 8K, and find the
+ *  8K contiguous range.
+ *
+ *  Consider the other case, we have 2 8K chunks in that second entry and still
+ *  don't have ->max_extent_size set.  We'll return 16K, and the next time the
+ *  allocator comes in it'll fully search our second bitmap, and this time it'll
+ *  get an uptodate value of 8K as the maximum chunk size.  Then we'll get the
+ *  right allocation the next loop through.
+ */
+static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
+{
+	if (entry->bitmap && entry->max_extent_size)
+		return entry->max_extent_size;
+	return entry->bytes;
+}
+
+/*
+ * We want the largest entry to be leftmost, so this is inverted from what you'd
+ * normally expect.
+ */
+static bool entry_less(struct rb_node *node, const struct rb_node *parent)
+{
+	const struct btrfs_free_space *entry, *exist;
+
+	entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+	exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
+	return get_max_extent_size(exist) < get_max_extent_size(entry);
+}
+
 /*
  * searches the tree for the given offset.
  *
@@ -1708,6 +1752,7 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl,
 		    struct btrfs_free_space *info)
 {
 	rb_erase(&info->offset_index, &ctl->free_space_offset);
+	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
 	ctl->free_extents--;
 
 	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
@@ -1734,6 +1779,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
 	if (ret)
 		return ret;
 
+	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
+
 	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
 		ctl->discardable_extents[BTRFS_STAT_CURR]++;
 		ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
@@ -1744,6 +1791,22 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
 	return ret;
 }
 
+static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
+				struct btrfs_free_space *info)
+{
+	ASSERT(info->bitmap);
+
+	/*
+	 * If our entry is empty it's because we're on a cluster and we don't
+	 * want to re-link it into our ctl bytes index.
+	 */
+	if (RB_EMPTY_NODE(&info->bytes_index))
+		return;
+
+	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
+	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
+}
+
 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
 				       struct btrfs_free_space *info,
 				       u64 offset, u64 bytes)
@@ -1762,6 +1825,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
 	if (info->max_extent_size > ctl->unit)
 		info->max_extent_size = 0;
 
+	relink_bitmap_entry(ctl, info);
+
 	if (start && test_bit(start - 1, info->bitmap))
 		extent_delta++;
 
@@ -1797,9 +1862,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
 
 	bitmap_set(info->bitmap, start, count);
 
+	/*
+	 * We set some bytes, we have no idea what the max extent size is
+	 * anymore.
+	 */
+	info->max_extent_size = 0;
 	info->bytes += bytes;
 	ctl->free_space += bytes;
 
+	relink_bitmap_entry(ctl, info);
+
 	if (start && test_bit(start - 1, info->bitmap))
 		extent_delta--;
 
@@ -1867,44 +1939,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 
 	*bytes = (u64)(max_bits) * ctl->unit;
 	bitmap_info->max_extent_size = *bytes;
+	relink_bitmap_entry(ctl, bitmap_info);
 	return -1;
 }
 
-/*
- * This is a little subtle.  We *only* have ->max_extent_size set if we actually
- * searched through the bitmap and figured out the largest ->max_extent_size,
- * otherwise it's 0.  In the case that it's 0 we don't want to tell the
- * allocator the wrong thing, we want to use the actual real max_extent_size
- * we've found already if it's larger, or we want to use ->bytes.
- *
- * This matters because find_free_space() will skip entries who's ->bytes is
- * less than the required bytes.  So if we didn't search down this bitmap, we
- * may pick some previous entry that has a smaller ->max_extent_size than we
- * have.  For example, assume we have two entries, one that has
- * ->max_extent_size set to 4k and ->bytes set to 1M.  A second entry hasn't set
- * ->max_extent_size yet, has ->bytes set to 8k and it's contiguous.  We will
- *  call into find_free_space(), and return with max_extent_size == 4k, because
- *  that first bitmap entry had ->max_extent_size set, but the second one did
- *  not.  If instead we returned 8k we'd come in searching for 8k, and find the
- *  8k contiguous range.
- *
- *  Consider the other case, we have 2 8k chunks in that second entry and still
- *  don't have ->max_extent_size set.  We'll return 16k, and the next time the
- *  allocator comes in it'll fully search our second bitmap, and this time it'll
- *  get an uptodate value of 8k as the maximum chunk size.  Then we'll get the
- *  right allocation the next loop through.
- */
-static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
-{
-	if (entry->bitmap && entry->max_extent_size)
-		return entry->max_extent_size;
-	return entry->bytes;
-}
-
 /* Cache the size of the max extent in bytes */
 static struct btrfs_free_space *
 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
-		unsigned long align, u64 *max_extent_size)
+		unsigned long align, u64 *max_extent_size, bool use_bytes_index)
 {
 	struct btrfs_free_space *entry;
 	struct rb_node *node;
@@ -1914,16 +1956,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 
 	if (!ctl->free_space_offset.rb_node)
 		goto out;
+again:
+	if (use_bytes_index) {
+		node = rb_first_cached(&ctl->free_space_bytes);
+	} else {
+		entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
+					   0, 1);
+		if (!entry)
+			goto out;
+		node = &entry->offset_index;
+	}
 
-	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
-	if (!entry)
-		goto out;
+	for (; node; node = rb_next(node)) {
+		if (use_bytes_index)
+			entry = rb_entry(node, struct btrfs_free_space,
+					 bytes_index);
+		else
+			entry = rb_entry(node, struct btrfs_free_space,
+					 offset_index);
 
-	for (node = &entry->offset_index; node; node = rb_next(node)) {
-		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+		/*
+		 * If we are using the bytes index then all subsequent entries
+		 * in this tree are going to be < bytes, so simply set the max
+		 * extent size and exit the loop.
+		 *
+		 * If we're using the offset index then we need to keep going
+		 * through the rest of the tree.
+		 */
 		if (entry->bytes < *bytes) {
 			*max_extent_size = max(get_max_extent_size(entry),
 					       *max_extent_size);
+			if (use_bytes_index)
+				break;
 			continue;
 		}
 
@@ -1940,6 +2004,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 			tmp = entry->offset;
 		}
 
+		/*
+		 * We don't break here if we're using the bytes index because we
+		 * may have another entry that has the correct alignment that is
+		 * the right size, so we don't want to miss that possibility.
+		 * At worst this adds another loop through the logic, but if we
+		 * broke here we could prematurely ENOSPC.
+		 */
 		if (entry->bytes < *bytes + align_off) {
 			*max_extent_size = max(get_max_extent_size(entry),
 					       *max_extent_size);
@@ -1947,6 +2018,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 		}
 
 		if (entry->bitmap) {
+			struct rb_node *old_next = rb_next(node);
 			u64 size = *bytes;
 
 			ret = search_bitmap(ctl, entry, &tmp, &size, true);
@@ -1959,6 +2031,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
 					max(get_max_extent_size(entry),
 					    *max_extent_size);
 			}
+
+			/*
+			 * The bitmap may have gotten re-arranged in the space
+			 * index here because the max_extent_size may have been
+			 * updated.  Start from the beginning again if this
+			 * happened.
+			 */
+			if (use_bytes_index && old_next != rb_next(node))
+				goto again;
 			continue;
 		}
 
@@ -2107,12 +2188,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
 
 	bitmap_set_bits(ctl, info, offset, bytes_to_set);
 
-	/*
-	 * We set some bytes, we have no idea what the max extent size is
-	 * anymore.
-	 */
-	info->max_extent_size = 0;
-
 	return bytes_to_set;
 
 }
@@ -2510,6 +2585,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
 	info->bytes = bytes;
 	info->trim_state = trim_state;
 	RB_CLEAR_NODE(&info->offset_index);
+	RB_CLEAR_NODE(&info->bytes_index);
 
 	spin_lock(&ctl->tree_lock);
 
@@ -2823,6 +2899,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
 	ctl->start = block_group->start;
 	ctl->private = block_group;
 	ctl->op = &free_space_op;
+	ctl->free_space_bytes = RB_ROOT_CACHED;
 	INIT_LIST_HEAD(&ctl->trimming_ranges);
 	mutex_init(&ctl->cache_writeout_mutex);
 
@@ -2888,6 +2965,8 @@ static void __btrfs_return_cluster_to_free_space(
 		}
 		tree_insert_offset(&ctl->free_space_offset,
 				   entry->offset, &entry->offset_index, bitmap);
+		rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
+			      entry_less);
 	}
 	cluster->root = RB_ROOT;
 	spin_unlock(&cluster->lock);
@@ -2989,12 +3068,14 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
 	u64 align_gap = 0;
 	u64 align_gap_len = 0;
 	enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+	bool use_bytes_index = (offset == block_group->start);
 
 	ASSERT(!btrfs_is_zoned(block_group->fs_info));
 
 	spin_lock(&ctl->tree_lock);
 	entry = find_free_space(ctl, &offset, &bytes_search,
-				block_group->full_stripe_len, max_extent_size);
+				block_group->full_stripe_len, max_extent_size,
+				use_bytes_index);
 	if (!entry)
 		goto out;
 
@@ -3278,6 +3359,17 @@ again:
 
 	cluster->window_start = start * ctl->unit + entry->offset;
 	rb_erase(&entry->offset_index, &ctl->free_space_offset);
+	rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
+
+	/*
+	 * We need to know if we're currently on the normal space index when we
+	 * manipulate the bitmap so that we know we need to remove and re-insert
+	 * it into the space_index tree.  Clear the bytes_index node here so the
+	 * bitmap manipulation helpers know not to mess with the space_index
+	 * until this bitmap entry is added back into the normal cache.
+	 */
+	RB_CLEAR_NODE(&entry->bytes_index);
+
 	ret = tree_insert_offset(&cluster->root, entry->offset,
 				 &entry->offset_index, 1);
 	ASSERT(!ret); /* -EEXIST; Logic error */
@@ -3368,6 +3460,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
 			continue;
 
 		rb_erase(&entry->offset_index, &ctl->free_space_offset);
+		rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
 		ret = tree_insert_offset(&cluster->root, entry->offset,
 					 &entry->offset_index, 0);
 		total_size += entry->bytes;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 1f23088d43f9..dd982d204d2d 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -22,6 +22,7 @@ enum btrfs_trim_state {
 
 struct btrfs_free_space {
 	struct rb_node offset_index;
+	struct rb_node bytes_index;
 	u64 offset;
 	u64 bytes;
 	u64 max_extent_size;
@@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap(
 struct btrfs_free_space_ctl {
 	spinlock_t tree_lock;
 	struct rb_root free_space_offset;
+	struct rb_root_cached free_space_bytes;
 	u64 free_space;
 	int extents_thresh;
 	int free_extents;
-- 
cgit v1.2.3-59-g8ed1b


From bbf27275f246a105a1e906f22a4f814f89bf4b55 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 18 Nov 2021 16:33:16 -0500
Subject: btrfs: add self test for bytes_index free space cache

I noticed a few corner cases when looking at my bytes_index patch for
obvious bugs, so add a bunch of tests to validate proper behavior of the
bytes_index tree.  A couple of basic tests to make sure it puts things
in the correct order, and then more complicated tests to make sure it
re-arranges bitmap entries properly and does the right thing when we try
to make allocations.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tests/free-space-tests.c | 181 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 8f05c1eb833f..87ae4aed3b06 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -824,6 +824,184 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
 	return 0;
 }
 
+static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl,
+				   struct btrfs_free_space *info)
+{
+	return true;
+}
+
+static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
+{
+	const struct btrfs_free_space_op test_free_space_ops = {
+		.use_bitmap = bytes_index_use_bitmap,
+	};
+	const struct btrfs_free_space_op *orig_free_space_ops;
+	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+	u64 offset, max_extent_size, bytes;
+	int ret, i;
+
+	test_msg("running bytes index tests");
+
+	/* First just validate that it does everything in order. */
+	offset = 0;
+	for (i = 0; i < 10; i++) {
+		bytes = (i + 1) * SZ_1M;
+		ret = test_add_free_space_entry(cache, offset, bytes, 0);
+		if (ret) {
+			test_err("couldn't add extent entry %d\n", ret);
+			return ret;
+		}
+		offset += bytes + sectorsize;
+	}
+
+	for (node = rb_first_cached(&ctl->free_space_bytes), i = 9; node;
+	     node = rb_next(node), i--) {
+		entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+		bytes = (i + 1) * SZ_1M;
+		if (entry->bytes != bytes) {
+			test_err("invalid bytes index order, found %llu expected %llu",
+				 entry->bytes, bytes);
+			return -EINVAL;
+		}
+	}
+
+	/* Now validate bitmaps do the correct thing. */
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	for (i = 0; i < 2; i++) {
+		offset = i * BITS_PER_BITMAP * sectorsize;
+		bytes = (i + 1) * SZ_1M;
+		ret = test_add_free_space_entry(cache, offset, bytes, 1);
+		if (ret) {
+			test_err("couldn't add bitmap entry");
+			return ret;
+		}
+	}
+
+	for (node = rb_first_cached(&ctl->free_space_bytes), i = 1; node;
+	     node = rb_next(node), i--) {
+		entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+		bytes = (i + 1) * SZ_1M;
+		if (entry->bytes != bytes) {
+			test_err("invalid bytes index order, found %llu expected %llu",
+				 entry->bytes, bytes);
+			return -EINVAL;
+		}
+	}
+
+	/* Now validate bitmaps with different ->max_extent_size. */
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	orig_free_space_ops = cache->free_space_ctl->op;
+	cache->free_space_ctl->op = &test_free_space_ops;
+
+	ret = test_add_free_space_entry(cache, 0, sectorsize, 1);
+	if (ret) {
+		test_err("couldn't add bitmap entry");
+		return ret;
+	}
+
+	offset = BITS_PER_BITMAP * sectorsize;
+	ret = test_add_free_space_entry(cache, offset, sectorsize, 1);
+	if (ret) {
+		test_err("couldn't add bitmap_entry");
+		return ret;
+	}
+
+	/*
+	 * Now set a bunch of sectorsize extents in the first entry so it's
+	 * ->bytes is large.
+	 */
+	for (i = 2; i < 20; i += 2) {
+		offset = sectorsize * i;
+		ret = btrfs_add_free_space(cache, offset, sectorsize);
+		if (ret) {
+			test_err("error populating sparse bitmap %d", ret);
+			return ret;
+		}
+	}
+
+	/*
+	 * Now set a contiguous extent in the second bitmap so its
+	 * ->max_extent_size is larger than the first bitmaps.
+	 */
+	offset = (BITS_PER_BITMAP * sectorsize) + sectorsize;
+	ret = btrfs_add_free_space(cache, offset, sectorsize);
+	if (ret) {
+		test_err("error adding contiguous extent %d", ret);
+		return ret;
+	}
+
+	/*
+	 * Since we don't set ->max_extent_size unless we search everything
+	 * should be indexed on bytes.
+	 */
+	entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+			 struct btrfs_free_space, bytes_index);
+	if (entry->bytes != (10 * sectorsize)) {
+		test_err("error, wrong entry in the first slot in bytes_index");
+		return -EINVAL;
+	}
+
+	max_extent_size = 0;
+	offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 3,
+					    0, &max_extent_size);
+	if (offset != 0) {
+		test_err("found space to alloc even though we don't have enough space");
+		return -EINVAL;
+	}
+
+	if (max_extent_size != (2 * sectorsize)) {
+		test_err("got the wrong max_extent size %llu expected %llu",
+			 max_extent_size, (unsigned long long)(2 * sectorsize));
+		return -EINVAL;
+	}
+
+	/*
+	 * The search should have re-arranged the bytes index to use the
+	 * ->max_extent_size, validate it's now what we expect it to be.
+	 */
+	entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+			 struct btrfs_free_space, bytes_index);
+	if (entry->bytes != (2 * sectorsize)) {
+		test_err("error, the bytes index wasn't recalculated properly");
+		return -EINVAL;
+	}
+
+	/* Add another sectorsize to re-arrange the tree back to ->bytes. */
+	offset = (BITS_PER_BITMAP * sectorsize) - sectorsize;
+	ret = btrfs_add_free_space(cache, offset, sectorsize);
+	if (ret) {
+		test_err("error adding extent to the sparse entry %d", ret);
+		return ret;
+	}
+
+	entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+			 struct btrfs_free_space, bytes_index);
+	if (entry->bytes != (11 * sectorsize)) {
+		test_err("error, wrong entry in the first slot in bytes_index");
+		return -EINVAL;
+	}
+
+	/*
+	 * Now make sure we find our correct entry after searching that will
+	 * result in a re-arranging of the tree.
+	 */
+	max_extent_size = 0;
+	offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 2,
+					    0, &max_extent_size);
+	if (offset != (BITS_PER_BITMAP * sectorsize)) {
+		test_err("error, found %llu instead of %llu for our alloc",
+			 offset,
+			 (unsigned long long)(BITS_PER_BITMAP * sectorsize));
+		return -EINVAL;
+	}
+
+	cache->free_space_ctl->op = orig_free_space_ops;
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	return 0;
+}
+
 int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
 {
 	struct btrfs_fs_info *fs_info;
@@ -871,6 +1049,9 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
 		goto out;
 
 	ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize);
+	if (ret)
+		goto out;
+	ret = test_bytes_index(cache, sectorsize);
 out:
 	btrfs_free_dummy_block_group(cache);
 	btrfs_free_dummy_root(root);
-- 
cgit v1.2.3-59-g8ed1b


From 167c0bd3775dc4ffedfd179c370994226d02a5c9 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Mon, 22 Nov 2021 17:16:46 +0200
Subject: btrfs: get next entry in tree_search_offset before doing checks

This is a small optimisation since the current 'entry' is already
checked in the if () {} else if {} construct above the loop. In essence
the first iteration of the final while loop is redundant. To eliminate
this extra check simply get the next entry at the beginning of the loop.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 3920beb02131..bbe0b36a452a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1730,6 +1730,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 		return NULL;
 
 	while (1) {
+		n = rb_next(&entry->offset_index);
+		if (!n)
+			return NULL;
+		entry = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (entry->bitmap) {
 			if (entry->offset + BITS_PER_BITMAP *
 			    ctl->unit > offset)
@@ -1738,11 +1742,6 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 			if (entry->offset + entry->bytes > offset)
 				break;
 		}
-
-		n = rb_next(&entry->offset_index);
-		if (!n)
-			return NULL;
-		entry = rb_entry(n, struct btrfs_free_space, offset_index);
 	}
 	return entry;
 }
-- 
cgit v1.2.3-59-g8ed1b


From bf08387fb462ac7ab7dd4d3e4ac32728585bf625 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 19 Nov 2021 14:19:33 +0800
Subject: btrfs: don't check stripe length if the profile is not stripe based

[BUG]
When debugging calc_bio_boundaries(), I found that even for RAID1
metadata, we're following stripe length to calculate stripe boundary.

  # mkfs.btrfs -m raid1 -d raid1 /dev/test/scratch[12]
  # mount /dev/test/scratch /mnt/btrfs
  # xfs_io -f -c "pwrite 0 64K" /mnt/btrfs/file
  # umount

The above very basic operations will make calc_bio_boundaries() report
the following result:

  submit_extent_page: r/i=1/1 file_offset=22036480 len_to_stripe_boundary=49152
  submit_extent_page: r/i=1/1 file_offset=30474240 len_to_stripe_boundary=65536
  ...
  submit_extent_page: r/i=1/1 file_offset=30523392 len_to_stripe_boundary=16384
  submit_extent_page: r/i=1/1 file_offset=30457856 len_to_stripe_boundary=16384
  submit_extent_page: r/i=5/257 file_offset=0 len_to_stripe_boundary=65536
  submit_extent_page: r/i=5/257 file_offset=65536 len_to_stripe_boundary=65536
  submit_extent_page: r/i=1/1 file_offset=30490624 len_to_stripe_boundary=49152
  submit_extent_page: r/i=1/1 file_offset=30507008 len_to_stripe_boundary=32768

Where "r/i" is the rootid and inode, 1/1 means they are metadata.
The remaining names match the member used in kernel.

Even though all data/metadata are using RAID1, we're still following
stripe length.

[CAUSE]
This behavior is caused by a wrong condition in btrfs_get_io_geometry():

	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		/* Fill using stripe_len */
		len = min_t(u64, em->len - offset, max_len);
	} else {
		len = em->len - offset;
	}

This means, only for SINGLE we will not follow stripe_len.

However, profiles like RAID1* and DUP don't need to bother with
stripe_len.

This can lead to unnecessary bio split for RAID1*/DUP profiles, and can
even be a blockage for future zoned RAID support.

[FIX]
Introduce one single-use macro, BTRFS_BLOCK_GROUP_STRIPE_MASK, and
change the condition to only calculate the length using stripe length
for stripe based profiles.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cafd490da072..f463dd5b8da3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -34,6 +34,10 @@
 #include "discard.h"
 #include "zoned.h"
 
+#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
+					 BTRFS_BLOCK_GROUP_RAID10 | \
+					 BTRFS_BLOCK_GROUP_RAID56_MASK)
+
 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	[BTRFS_RAID_RAID10] = {
 		.sub_stripes	= 2,
@@ -6347,7 +6351,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
 	stripe_offset = offset - stripe_offset;
 	data_stripes = nr_data_stripes(map);
 
-	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+	/* Only stripe based profiles needs to check against stripe length. */
+	if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
 		u64 max_len = stripe_len - stripe_offset;
 
 		/*
-- 
cgit v1.2.3-59-g8ed1b


From f1a8fc626586fcc62dd7eb44ebeddb4517784015 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 23 Nov 2021 09:23:42 +0200
Subject: btrfs: eliminate if in main loop in tree_search_offset

Reshuffle the code inside the first loop of tree_search_offset so that
one if() is eliminated and the code becomes more linear.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index bbe0b36a452a..a45017b12185 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1636,15 +1636,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 		   u64 offset, int bitmap_only, int fuzzy)
 {
 	struct rb_node *n = ctl->free_space_offset.rb_node;
-	struct btrfs_free_space *entry, *prev = NULL;
+	struct btrfs_free_space *entry = NULL, *prev = NULL;
 
 	/* find entry that is closest to the 'offset' */
-	while (1) {
-		if (!n) {
-			entry = NULL;
-			break;
-		}
-
+	while (n) {
 		entry = rb_entry(n, struct btrfs_free_space, offset_index);
 		prev = entry;
 
@@ -1654,6 +1649,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 			n = n->rb_right;
 		else
 			break;
+
+		entry = NULL;
 	}
 
 	if (bitmap_only) {
-- 
cgit v1.2.3-59-g8ed1b


From 0af4769da6b29a5e091f3e404e8d5426bdeccac7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:27 -0400
Subject: btrfs: remove unused BTRFS_FS_BARRIER flag

This is no longer used, the -o nobarrier is handled by
BTRFS_MOUNT_NOBARRIER.  Remove the flag.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h   | 1 -
 fs/btrfs/disk-io.c | 1 -
 2 files changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 004cf3f67ce5..f9e9f08175be 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -548,7 +548,6 @@ struct btrfs_swapfile_pin {
 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
 
 enum {
-	BTRFS_FS_BARRIER,
 	BTRFS_FS_CLOSING_START,
 	BTRFS_FS_CLOSING_DONE,
 	BTRFS_FS_LOG_RECOVERING,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d94a1ca856aa..de323878e85c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2956,7 +2956,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 
 	extent_io_tree_init(fs_info, &fs_info->excluded_extents,
 			    IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
-	set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
 
 	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
-- 
cgit v1.2.3-59-g8ed1b


From fdfbf020664b92336ac4f554ffec2d2f890960b9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:28 -0400
Subject: btrfs: rework async transaction committing

Currently we do this awful thing where we get another ref on a trans
handle, async off that handle and commit the transaction from that work.
Because we do this we have to mess with current->journal_info and the
freeze counting stuff.

We already have an async thing to kick for the transaction commit, the
transaction kthread.  Replace this work struct with a flag on the
fs_info to tell the kthread to go ahead and commit even if it's before
our timeout.  Then we can drastically simplify the async transaction
commit path.

Note: this can be simplified and functionality based on the pending
operation COMMIT.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ add note ]
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h       |  3 +++
 fs/btrfs/disk-io.c     |  3 ++-
 fs/btrfs/ioctl.c       |  7 +-----
 fs/btrfs/transaction.c | 64 +++++++++-----------------------------------------
 fs/btrfs/transaction.h |  2 +-
 5 files changed, 18 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f9e9f08175be..6474e3094dae 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -595,6 +595,9 @@ enum {
 	/* Indicate whether there are any tree modification log users */
 	BTRFS_FS_TREE_MOD_LOG_USERS,
 
+	/* Indicate that we want the transaction kthread to commit right now. */
+	BTRFS_FS_COMMIT_TRANS,
+
 #if BITS_PER_LONG == 32
 	/* Indicate if we have error/warn message printed on 32bit systems */
 	BTRFS_FS_32BIT_ERROR,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index de323878e85c..9683b507be50 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1934,7 +1934,8 @@ static int transaction_kthread(void *arg)
 		}
 
 		delta = ktime_get_seconds() - cur->start_time;
-		if (cur->state < TRANS_STATE_COMMIT_START &&
+		if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
+		    cur->state < TRANS_STATE_COMMIT_START &&
 		    delta < fs_info->commit_interval) {
 			spin_unlock(&fs_info->trans_lock);
 			delay -= msecs_to_jiffies((delta - 1) * 1000);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 441d5130a4ec..af77098fdf35 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3622,7 +3622,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
 {
 	struct btrfs_trans_handle *trans;
 	u64 transid;
-	int ret;
 
 	trans = btrfs_attach_transaction_barrier(root);
 	if (IS_ERR(trans)) {
@@ -3634,11 +3633,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
 		goto out;
 	}
 	transid = trans->transid;
-	ret = btrfs_commit_transaction_async(trans);
-	if (ret) {
-		btrfs_end_transaction(trans);
-		return ret;
-	}
+	btrfs_commit_transaction_async(trans);
 out:
 	if (argp)
 		if (copy_to_user(argp, &transid, sizeof(transid)))
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8b6a90fafcd4..3538980a5dda 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1880,50 +1880,14 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 	return ret;
 }
 
-/*
- * commit transactions asynchronously. once btrfs_commit_transaction_async
- * returns, any subsequent transaction will not be allowed to join.
- */
-struct btrfs_async_commit {
-	struct btrfs_trans_handle *newtrans;
-	struct work_struct work;
-};
-
-static void do_async_commit(struct work_struct *work)
-{
-	struct btrfs_async_commit *ac =
-		container_of(work, struct btrfs_async_commit, work);
-
-	/*
-	 * We've got freeze protection passed with the transaction.
-	 * Tell lockdep about it.
-	 */
-	if (ac->newtrans->type & __TRANS_FREEZABLE)
-		__sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS);
-
-	current->journal_info = ac->newtrans;
-
-	btrfs_commit_transaction(ac->newtrans);
-	kfree(ac);
-}
-
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
+void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_async_commit *ac;
 	struct btrfs_transaction *cur_trans;
 
-	ac = kmalloc(sizeof(*ac), GFP_NOFS);
-	if (!ac)
-		return -ENOMEM;
-
-	INIT_WORK(&ac->work, do_async_commit);
-	ac->newtrans = btrfs_join_transaction(trans->root);
-	if (IS_ERR(ac->newtrans)) {
-		int err = PTR_ERR(ac->newtrans);
-		kfree(ac);
-		return err;
-	}
+	/* Kick the transaction kthread. */
+	set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+	wake_up_process(fs_info->transaction_kthread);
 
 	/* take transaction reference */
 	cur_trans = trans->transaction;
@@ -1931,14 +1895,6 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
 
 	btrfs_end_transaction(trans);
 
-	/*
-	 * Tell lockdep we've released the freeze rwsem, since the
-	 * async commit thread will be the one to unlock it.
-	 */
-	if (ac->newtrans->type & __TRANS_FREEZABLE)
-		__sb_writers_release(fs_info->sb, SB_FREEZE_FS);
-
-	schedule_work(&ac->work);
 	/*
 	 * Wait for the current transaction commit to start and block
 	 * subsequent transaction joins
@@ -1946,14 +1902,9 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
 	wait_event(fs_info->transaction_blocked_wait,
 		   cur_trans->state >= TRANS_STATE_COMMIT_START ||
 		   TRANS_ABORTED(cur_trans));
-	if (current->journal_info == trans)
-		current->journal_info = NULL;
-
 	btrfs_put_transaction(cur_trans);
-	return 0;
 }
 
-
 static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2219,6 +2170,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	wait_event(cur_trans->writer_wait,
 		   atomic_read(&cur_trans->num_writers) == 1);
 
+	/*
+	 * We've started the commit, clear the flag in case we were triggered to
+	 * do an async commit but somebody else started before the transaction
+	 * kthread could do the work.
+	 */
+	clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+
 	if (TRANS_ABORTED(cur_trans)) {
 		ret = cur_trans->aborted;
 		goto scrub_continue;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ba45065f9451..e4b9b251a29e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -217,7 +217,7 @@ void btrfs_add_dead_root(struct btrfs_root *root);
 int btrfs_defrag_root(struct btrfs_root *root);
 int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
+void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
 bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
 void btrfs_throttle(struct btrfs_fs_info *fs_info);
-- 
cgit v1.2.3-59-g8ed1b


From 2e4e97abac4c95f8b87b2912ea013f7836a6f10b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:29 -0400
Subject: btrfs: pass fs_info to trace_btrfs_transaction_commit

The root on the trans->root can be anything, and generally we're
committing from the transaction kthread so it's usually the tree_root.
Change this to just take an fs_info, and to maintain compatibility
simply put the ROOT_TREE_OBJECTID as the root objectid for the
tracepoint.  This will allow us to remove trans->root.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c           |  2 +-
 fs/btrfs/transaction.c       |  4 ++--
 include/trace/events/btrfs.h | 10 +++++-----
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9683b507be50..edf837ddbfea 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4989,7 +4989,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
 		spin_unlock(&fs_info->trans_lock);
 
 		btrfs_put_transaction(t);
-		trace_btrfs_transaction_commit(fs_info->tree_root);
+		trace_btrfs_transaction_commit(fs_info);
 		spin_lock(&fs_info->trans_lock);
 	}
 	spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3538980a5dda..0f1df04052a7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1956,7 +1956,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
 	btrfs_put_transaction(cur_trans);
 	btrfs_put_transaction(cur_trans);
 
-	trace_btrfs_transaction_commit(trans->root);
+	trace_btrfs_transaction_commit(fs_info);
 
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
@@ -2361,7 +2361,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	if (trans->type & __TRANS_FREEZABLE)
 		sb_end_intwrite(fs_info->sb);
 
-	trace_btrfs_transaction_commit(trans->root);
+	trace_btrfs_transaction_commit(fs_info);
 
 	btrfs_scrub_continue(fs_info);
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 8f58fd95efc7..0d729664b4b4 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -182,18 +182,18 @@ FLUSH_STATES
 
 TRACE_EVENT(btrfs_transaction_commit,
 
-	TP_PROTO(const struct btrfs_root *root),
+	TP_PROTO(const struct btrfs_fs_info *fs_info),
 
-	TP_ARGS(root),
+	TP_ARGS(fs_info),
 
 	TP_STRUCT__entry_btrfs(
 		__field(	u64,  generation		)
 		__field(	u64,  root_objectid		)
 	),
 
-	TP_fast_assign_btrfs(root->fs_info,
-		__entry->generation	= root->fs_info->generation;
-		__entry->root_objectid	= root->root_key.objectid;
+	TP_fast_assign_btrfs(fs_info,
+		__entry->generation	= fs_info->generation;
+		__entry->root_objectid	= BTRFS_ROOT_TREE_OBJECTID;
 	),
 
 	TP_printk_btrfs("root=%llu(%s) gen=%llu",
-- 
cgit v1.2.3-59-g8ed1b


From 7a60751a33d9fdd1e17e77bd379aefe4ede44aa7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:30 -0400
Subject: btrfs: remove trans_handle->root

Nobody is using this anymore, remove it.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 1 -
 fs/btrfs/transaction.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0f1df04052a7..5b8f71aeccc9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -698,7 +698,6 @@ again:
 
 	h->transid = cur_trans->transid;
 	h->transaction = cur_trans;
-	h->root = root;
 	refcount_set(&h->use_count, 1);
 	h->fs_info = root->fs_info;
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e4b9b251a29e..1852ed9de7fd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -135,7 +135,6 @@ struct btrfs_trans_handle {
 	bool removing_chunk;
 	bool reloc_reserved;
 	bool in_fsync;
-	struct btrfs_root *root;
 	struct btrfs_fs_info *fs_info;
 	struct list_head new_bgs;
 };
-- 
cgit v1.2.3-59-g8ed1b


From 98cc42227a1b9b0fb6e89729d08f87f9356bb846 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:31 -0400
Subject: btrfs: pass the root to add_keyed_refs

We pass in the path, but use btrfs_next_item() using the root we
searched with.  Pass the root down to add_keyed_refs() instead of the
fs_info so we can continue to use the same root we searched with.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index c4e0560d4c11..5667e282fd57 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1049,12 +1049,12 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
  *
  * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
  */
-static int add_keyed_refs(struct btrfs_fs_info *fs_info,
+static int add_keyed_refs(struct btrfs_root *extent_root,
 			  struct btrfs_path *path, u64 bytenr,
 			  int info_level, struct preftrees *preftrees,
 			  struct share_check *sc)
 {
-	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_fs_info *fs_info = extent_root->fs_info;
 	int ret;
 	int slot;
 	struct extent_buffer *leaf;
@@ -1170,6 +1170,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct ulist *roots, const u64 *extent_item_pos,
 			     struct share_check *sc, bool ignore_offset)
 {
+	struct btrfs_root *root = fs_info->extent_root;
 	struct btrfs_key key;
 	struct btrfs_path *path;
 	struct btrfs_delayed_ref_root *delayed_refs = NULL;
@@ -1211,7 +1212,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 again:
 	head = NULL;
 
-	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret == 0);
@@ -1271,7 +1272,7 @@ again:
 					      &info_level, &preftrees, sc);
 			if (ret)
 				goto out;
-			ret = add_keyed_refs(fs_info, path, bytenr, info_level,
+			ret = add_keyed_refs(root, path, bytenr, info_level,
 					     &preftrees, sc);
 			if (ret)
 				goto out;
-- 
cgit v1.2.3-59-g8ed1b


From 9665ebd5dba6a2c36cf67827ab074680c4fc2197 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:32 -0400
Subject: btrfs: move comment in find_parent_nodes()

This comment was much closer to the related code when it was originally
added, but has slowly migrated north far from its ancestral lands.  Move
it back down with its people.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 5667e282fd57..4d4046099e54 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1204,11 +1204,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	if (time_seq == BTRFS_SEQ_LAST)
 		path->skip_locking = 1;
 
-	/*
-	 * grab both a lock on the path and a lock on the delayed ref head.
-	 * We need both to get a consistent picture of how the refs look
-	 * at a specified point in time
-	 */
 again:
 	head = NULL;
 
@@ -1224,8 +1219,10 @@ again:
 	if (trans && time_seq != BTRFS_SEQ_LAST) {
 #endif
 		/*
-		 * look if there are updates for this ref queued and lock the
-		 * head
+		 * We have a specific time_seq we care about and trans which
+		 * means we have the path lock, we need to grab the ref head and
+		 * lock it so we have a consistent view of the refs at the given
+		 * time.
 		 */
 		delayed_refs = &trans->transaction->delayed_refs;
 		spin_lock(&delayed_refs->lock);
-- 
cgit v1.2.3-59-g8ed1b


From e0b7661d44dab027febaf37f6f7766a3007af932 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:33 -0400
Subject: btrfs: remove SANITY_TESTS check from find_parent_nodes

We define __TRANS_DUMMY always, so this extra ifdef stuff is not needed.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 4d4046099e54..cebdb75ac6bb 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1212,12 +1212,8 @@ again:
 		goto out;
 	BUG_ON(ret == 0);
 
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 	if (trans && likely(trans->type != __TRANS_DUMMY) &&
 	    time_seq != BTRFS_SEQ_LAST) {
-#else
-	if (trans && time_seq != BTRFS_SEQ_LAST) {
-#endif
 		/*
 		 * We have a specific time_seq we care about and trans which
 		 * means we have the path lock, we need to grab the ref head and
-- 
cgit v1.2.3-59-g8ed1b


From fcba0120edf88328524a4878d1d6f4ad39f2ec81 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:34 -0400
Subject: btrfs: remove BUG_ON() in find_parent_nodes()

We search for an extent entry with .offset = -1, which shouldn't be a
thing, but corruption happens.  Add an ASSERT() for the developers,
return -EUCLEAN for mortals.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index cebdb75ac6bb..01e1a0897195 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1210,7 +1210,12 @@ again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
-	BUG_ON(ret == 0);
+	if (ret == 0) {
+		/* This shouldn't happen, indicates a bug or fs corruption. */
+		ASSERT(ret != 0);
+		ret = -EUCLEAN;
+		goto out;
+	}
 
 	if (trans && likely(trans->type != __TRANS_DUMMY) &&
 	    time_seq != BTRFS_SEQ_LAST) {
-- 
cgit v1.2.3-59-g8ed1b


From 9f05c09d6baef789726346397438cca4ec43c3ee Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:35 -0400
Subject: btrfs: remove BUG_ON(!eie) in find_parent_nodes

If we're looking for leafs that point to a data extent we want to record
the extent items that point at our bytenr.  At this point we have the
reference and we know for a fact that this leaf should have a reference
to our bytenr.  However if there's some sort of corruption we may not
find any references to our leaf, and thus could end up with eie == NULL.
Replace this BUG_ON() with an ASSERT() and then return -EUCLEAN for the
mortals.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 01e1a0897195..c6320c43319a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1359,10 +1359,18 @@ again:
 				goto out;
 			if (!ret && extent_item_pos) {
 				/*
-				 * we've recorded that parent, so we must extend
-				 * its inode list here
+				 * We've recorded that parent, so we must extend
+				 * its inode list here.
+				 *
+				 * However if there was corruption we may not
+				 * have found an eie, return an error in this
+				 * case.
 				 */
-				BUG_ON(!eie);
+				ASSERT(eie);
+				if (!eie) {
+					ret = -EUCLEAN;
+					goto out;
+				}
 				while (eie->next)
 					eie = eie->next;
 				eie->next = ref->inode_list;
-- 
cgit v1.2.3-59-g8ed1b


From dfe8aec4520bff8e69c8704421ef7f47c65ca778 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:36 -0400
Subject: btrfs: add a btrfs_block_group_root() helper

With extent tree v2 we will have a separate root to hold the block group
items.  Add a btrfs_block_group_root() that will return the appropriate
root given the flags of the fs, and convert all functions that need to
modify block group items to use the helper.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c | 19 ++++++++++---------
 fs/btrfs/disk-io.h     |  5 +++++
 2 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index d56fc1b8bb99..d3d4200e2afc 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -841,7 +841,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ret;
 
-	root = fs_info->extent_root;
+	root = btrfs_block_group_root(fs_info);
 	key.objectid = block_group->start;
 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	key.offset = block_group->length;
@@ -1106,6 +1106,7 @@ out:
 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
 {
+	struct btrfs_root *root = btrfs_block_group_root(fs_info);
 	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -1139,8 +1140,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 	num_items = 3 + map->num_stripes;
 	free_extent_map(em);
 
-	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
-							   num_items);
+	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
 }
 
 /*
@@ -1678,7 +1678,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
 				  struct btrfs_path *path,
 				  struct btrfs_key *key)
 {
-	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *root = btrfs_block_group_root(fs_info);
 	int ret;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
@@ -2165,6 +2165,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
 
 int btrfs_read_block_groups(struct btrfs_fs_info *info)
 {
+	struct btrfs_root *root = btrfs_block_group_root(info);
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_block_group *cache;
@@ -2173,7 +2174,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	int need_clear = 0;
 	u64 cache_gen;
 
-	if (!info->extent_root)
+	if (!root)
 		return fill_dummy_bgs(info);
 
 	key.objectid = 0;
@@ -2276,7 +2277,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_block_group_item bgi;
-	struct btrfs_root *root;
+	struct btrfs_root *root = btrfs_block_group_root(fs_info);
 	struct btrfs_key key;
 
 	spin_lock(&block_group->lock);
@@ -2289,7 +2290,6 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
 	key.offset = block_group->length;
 	spin_unlock(&block_group->lock);
 
-	root = fs_info->extent_root;
 	return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
 }
 
@@ -2543,12 +2543,13 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = btrfs_block_group_root(fs_info);
 	u64 alloc_flags;
 	int ret;
 	bool dirty_bg_running;
 
 	do {
-		trans = btrfs_join_transaction(fs_info->extent_root);
+		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
 
@@ -2653,7 +2654,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	int ret;
-	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *root = btrfs_block_group_root(fs_info);
 	unsigned long bi;
 	struct extent_buffer *leaf;
 	struct btrfs_block_group_item bgi;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a2b5db4ba262..baca29523d35 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -103,6 +103,11 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
 	return NULL;
 }
 
+static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
+{
+	return fs_info->extent_root;
+}
+
 void btrfs_put_root(struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
-- 
cgit v1.2.3-59-g8ed1b


From 76d76e789d1f251f66d3bc3bd53e582ca0625622 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:37 -0400
Subject: btrfs: make remove_extent_backref pass the root

With extent tree v2 we'll have a different extent root based on where
the bytenr is located, so adjust the remove_extent_backref() helper and
its helpers to pass the extent_root around.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9dd7aaf7cd24..49bc6de979de 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -593,6 +593,7 @@ fail:
 }
 
 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
 					   struct btrfs_path *path,
 					   int refs_to_drop, int *last_ref)
 {
@@ -626,7 +627,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
 	num_refs -= refs_to_drop;
 
 	if (num_refs == 0) {
-		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
+		ret = btrfs_del_item(trans, root, path);
 		*last_ref = 1;
 	} else {
 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
@@ -1174,6 +1175,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
 }
 
 static int remove_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
 				 struct btrfs_path *path,
 				 struct btrfs_extent_inline_ref *iref,
 				 int refs_to_drop, int is_data, int *last_ref)
@@ -1185,11 +1187,11 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
 					     last_ref);
 	} else if (is_data) {
-		ret = remove_extent_data_ref(trans, path, refs_to_drop,
+		ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
 					     last_ref);
 	} else {
 		*last_ref = 1;
-		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
+		ret = btrfs_del_item(trans, root, path);
 	}
 	return ret;
 }
@@ -2996,9 +2998,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				goto err_dump;
 			}
 			/* Must be SHARED_* item, remove the backref first */
-			ret = remove_extent_backref(trans, path, NULL,
-						    refs_to_drop,
-						    is_data, &last_ref);
+			ret = remove_extent_backref(trans, extent_root, path,
+						    NULL, refs_to_drop, is_data,
+						    &last_ref);
 			if (ret) {
 				btrfs_abort_transaction(trans, ret);
 				goto out;
@@ -3122,8 +3124,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			btrfs_mark_buffer_dirty(leaf);
 		}
 		if (found_extent) {
-			ret = remove_extent_backref(trans, path, iref,
-						    refs_to_drop, is_data,
+			ret = remove_extent_backref(trans, extent_root, path,
+						    iref, refs_to_drop, is_data,
 						    &last_ref);
 			if (ret) {
 				btrfs_abort_transaction(trans, ret);
-- 
cgit v1.2.3-59-g8ed1b


From 8e1d02909185bddc76e98e680d7fea38be0e87da Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:38 -0400
Subject: btrfs: use chunk_root in find_free_extent_update_loop

We're only using this to start the transaction with to possibly allocate
a chunk.  It doesn't really matter which root to use, but with extent
tree v2 we'll need a bytenr to look up an extent root which makes the
usage of the extent_root awkward here.  Simply change it to the
chunk_root.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 49bc6de979de..ae25ca54cbe8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3977,7 +3977,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
 					struct find_free_extent_ctl *ffe_ctl,
 					bool full_search)
 {
-	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *root = fs_info->chunk_root;
 	int ret;
 
 	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
-- 
cgit v1.2.3-59-g8ed1b


From 826582cabc22e741ef3d420fd53b83de988b5647 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:39 -0400
Subject: btrfs: do not special case the extent root for switch commit roots

This is a leftover from when we used to independently swap the extent
root's commit root and the fs tree commit roots.  At the time I simply
changed the helper to a list_add.  There's actually no reason to not add
the extent root to the switch commit root at this point, we don't care
about the order we do the switching since it's all done under the
commit_root_sem.

If we re-mark the extent root dirty after adding it to the
switch_commits list we'll see that BTRFS_ROOT_DIRTY isn't set and then
list_move it back onto the dirty list, and then we'll redo the tree
update and everything will be ok.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5b8f71aeccc9..c9f85c412b23 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1278,9 +1278,8 @@ again:
 		root = list_entry(next, struct btrfs_root, dirty_list);
 		clear_bit(BTRFS_ROOT_DIRTY, &root->state);
 
-		if (root != fs_info->extent_root)
-			list_add_tail(&root->dirty_list,
-				      &trans->transaction->switch_commits);
+		list_add_tail(&root->dirty_list,
+			      &trans->transaction->switch_commits);
 		ret = update_cowonly_root(trans, root);
 		if (ret)
 			return ret;
@@ -1310,9 +1309,6 @@ again:
 	if (!list_empty(&fs_info->dirty_cowonly_roots))
 		goto again;
 
-	list_add_tail(&fs_info->extent_root->dirty_list,
-		      &trans->transaction->switch_commits);
-
 	/* Update dev-replace pointer once everything is committed */
 	fs_info->dev_replace.committed_cursor_left =
 		fs_info->dev_replace.cursor_left_last_write_of_item;
-- 
cgit v1.2.3-59-g8ed1b


From 3478c732520a01561bb49f8c5d469f02419fd922 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:40 -0400
Subject: btrfs: remove unnecessary extent root check in btrfs_defrag_leaves

We only defrag leaves on roots that have SHAREABLE set, so we don't need
to check if we're the extent root as it doesn't have SHAREABLE set.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-defrag.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 7c45d960b53c..b6cf39f4e7e4 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -27,14 +27,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	int next_key_ret = 0;
 	u64 last_ret = 0;
 
-	if (root->fs_info->extent_root == root) {
-		/*
-		 * there's recursion here right now in the tree locking,
-		 * we can't defrag the extent root without deadlock
-		 */
-		goto out;
-	}
-
 	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 		goto out;
 
-- 
cgit v1.2.3-59-g8ed1b


From fd51eb2f07c7deb8af97ab699e6239e03aa7c5f6 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:41 -0400
Subject: btrfs: don't use the extent root in btrfs_chunk_alloc_add_chunk_item

We're just using the extent_root to set the chunk owner to
root_key->objectid, which is BTRFS_EXTENT_TREE_OBJECTID, so use that
directly instead of using the root.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f463dd5b8da3..4f77268619f3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5541,7 +5541,6 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_block_group *bg)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_root *extent_root = fs_info->extent_root;
 	struct btrfs_root *chunk_root = fs_info->chunk_root;
 	struct btrfs_key key;
 	struct btrfs_chunk *chunk;
@@ -5613,7 +5612,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
 	}
 
 	btrfs_set_stack_chunk_length(chunk, bg->length);
-	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+	btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
 	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
 	btrfs_set_stack_chunk_type(chunk, map->type);
 	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
-- 
cgit v1.2.3-59-g8ed1b


From 30a9da5d8d494dda15d4b79a8596445140d3aac9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:42 -0400
Subject: btrfs: don't use extent_root in iterate_extent_inodes

We are going to have many extent_roots soon, and we don't need a root
here necessarily as we're not modifying anything, we're just getting the
trans handle so we can have an accurate view of references, so use the
tree_root here.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index c6320c43319a..b51f8f90f226 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1969,7 +1969,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 			extent_item_objectid);
 
 	if (!search_commit_root) {
-		trans = btrfs_attach_transaction(fs_info->extent_root);
+		trans = btrfs_attach_transaction(fs_info->tree_root);
 		if (IS_ERR(trans)) {
 			if (PTR_ERR(trans) != -ENOENT &&
 			    PTR_ERR(trans) != -EROFS)
-- 
cgit v1.2.3-59-g8ed1b


From ce5603d015edc44faa0b46157f9ab7a9355a3e28 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:43 -0400
Subject: btrfs: don't use the extent_root in flush_space

We only need the root to start a transaction, and since it's a global
root we can pick anything, change to the tree_root as we'll have a lot
of extent roots in the future.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/space-info.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index dc4b4b0372d1..79fe0ad17acf 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -617,7 +617,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
 		       struct btrfs_space_info *space_info, u64 num_bytes,
 		       enum btrfs_flush_state state, bool for_preempt)
 {
-	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_trans_handle *trans;
 	int nr;
 	int ret = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 2e608bd1dd51ee962bded788caee4aa812729e87 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:44 -0400
Subject: btrfs: init root block_rsv at init root time

In the future we're going to have multiple csum and extent root trees,
so init each root's block_rsv at setup_root time based on its root key
objectid.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-rsv.c | 39 +++++++++++++++++++++++----------------
 fs/btrfs/block-rsv.h |  1 +
 fs/btrfs/disk-io.c   | 15 ++++++++-------
 3 files changed, 32 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 9d162b5adfd8..454f3c629dd8 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -412,6 +412,29 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	spin_unlock(&sinfo->lock);
 }
 
+void btrfs_init_root_block_rsv(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	switch (root->root_key.objectid) {
+	case BTRFS_CSUM_TREE_OBJECTID:
+	case BTRFS_EXTENT_TREE_OBJECTID:
+		root->block_rsv = &fs_info->delayed_refs_rsv;
+		break;
+	case BTRFS_ROOT_TREE_OBJECTID:
+	case BTRFS_DEV_TREE_OBJECTID:
+	case BTRFS_QUOTA_TREE_OBJECTID:
+		root->block_rsv = &fs_info->global_block_rsv;
+		break;
+	case BTRFS_CHUNK_TREE_OBJECTID:
+		root->block_rsv = &fs_info->chunk_block_rsv;
+		break;
+	default:
+		root->block_rsv = NULL;
+		break;
+	}
+}
+
 void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_space_info *space_info;
@@ -426,22 +449,6 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->delayed_block_rsv.space_info = space_info;
 	fs_info->delayed_refs_rsv.space_info = space_info;
 
-	/*
-	 * Our various recovery options can leave us with NULL roots, so check
-	 * here and just bail before we go dereferencing NULLs everywhere.
-	 */
-	if (!fs_info->extent_root || !fs_info->csum_root ||
-	    !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root)
-		return;
-
-	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
-	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
-	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
-	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
-	if (fs_info->quota_root)
-		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
-	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
-
 	btrfs_update_global_block_rsv(fs_info);
 }
 
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 07d61c2c5d28..3b67ff08d434 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -50,6 +50,7 @@ struct btrfs_block_rsv {
 };
 
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+void btrfs_init_root_block_rsv(struct btrfs_root *root);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
 					      unsigned short type);
 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index edf837ddbfea..ff5ec8218501 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1140,7 +1140,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 			 u64 objectid)
 {
 	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+
+	memset(&root->root_key, 0, sizeof(root->root_key));
+	memset(&root->root_item, 0, sizeof(root->root_item));
+	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
 	root->fs_info = fs_info;
+	root->root_key.objectid = objectid;
 	root->node = NULL;
 	root->commit_root = NULL;
 	root->state = 0;
@@ -1151,7 +1156,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->nr_ordered_extents = 0;
 	root->inode_tree = RB_ROOT;
 	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
-	root->block_rsv = NULL;
+
+	btrfs_init_root_block_rsv(root);
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->root_list);
@@ -1189,6 +1195,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->log_transid = 0;
 	root->log_transid_committed = -1;
 	root->last_log_commit = 0;
+	root->anon_dev = 0;
 	if (!dummy) {
 		extent_io_tree_init(fs_info, &root->dirty_log_pages,
 				    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
@@ -1196,12 +1203,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 				    IO_TREE_LOG_CSUM_RANGE, NULL);
 	}
 
-	memset(&root->root_key, 0, sizeof(root->root_key));
-	memset(&root->root_item, 0, sizeof(root->root_item));
-	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
-	root->root_key.objectid = objectid;
-	root->anon_dev = 0;
-
 	spin_lock_init(&root->root_item_lock);
 	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
 #ifdef CONFIG_BTRFS_DEBUG
-- 
cgit v1.2.3-59-g8ed1b


From 29cbcf401793f4e2c871c846edc2191731df2c41 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:45 -0400
Subject: btrfs: stop accessing ->extent_root directly

When we start having multiple extent roots we'll need to use a helper to
get to the correct extent_root.  Rename fs_info->extent_root to
_extent_root and convert all of the users of the extent root to using
the btrfs_extent_root() helper.  This will allow us to easily clean up
the remaining direct accesses in the future.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c                | 16 ++++++++++------
 fs/btrfs/block-group.c            |  3 ++-
 fs/btrfs/block-rsv.c              |  4 +++-
 fs/btrfs/ctree.h                  |  2 +-
 fs/btrfs/disk-io.c                | 15 ++++++++-------
 fs/btrfs/disk-io.h                |  8 +++++++-
 fs/btrfs/extent-tree.c            | 40 ++++++++++++++++++++++++---------------
 fs/btrfs/free-space-tree.c        |  3 ++-
 fs/btrfs/qgroup.c                 |  5 ++++-
 fs/btrfs/ref-verify.c             |  6 ++++--
 fs/btrfs/relocation.c             |  4 ++--
 fs/btrfs/scrub.c                  |  6 ++++--
 fs/btrfs/tests/free-space-tests.c |  2 +-
 fs/btrfs/tests/qgroup-tests.c     |  2 +-
 fs/btrfs/transaction.c            |  2 +-
 fs/btrfs/zoned.c                  |  3 ++-
 16 files changed, 77 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b51f8f90f226..c9ee579bc5a6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1170,7 +1170,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct ulist *roots, const u64 *extent_item_pos,
 			     struct share_check *sc, bool ignore_offset)
 {
-	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
 	struct btrfs_key key;
 	struct btrfs_path *path;
 	struct btrfs_delayed_ref_root *delayed_refs = NULL;
@@ -1747,6 +1747,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 			struct btrfs_path *path, struct btrfs_key *found_key,
 			u64 *flags_ret)
 {
+	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
 	int ret;
 	u64 flags;
 	u64 size = 0;
@@ -1762,11 +1763,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	key.objectid = logical;
 	key.offset = (u64)-1;
 
-	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		return ret;
 
-	ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+	ret = btrfs_previous_extent_item(extent_root, path, 0);
 	if (ret) {
 		if (ret > 0)
 			ret = -ENOENT;
@@ -2335,6 +2336,7 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(
 int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
 {
 	struct btrfs_fs_info *fs_info = iter->fs_info;
+	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
 	struct btrfs_path *path = iter->path;
 	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
@@ -2345,7 +2347,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
 	key.offset = (u64)-1;
 	iter->bytenr = bytenr;
 
-	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		return ret;
 	if (ret == 0) {
@@ -2388,7 +2390,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
 
 	/* If there is no inline backref, go search for keyed backref */
 	if (iter->cur_ptr >= iter->end_ptr) {
-		ret = btrfs_next_item(fs_info->extent_root, path);
+		ret = btrfs_next_item(extent_root, path);
 
 		/* No inline nor keyed ref */
 		if (ret > 0) {
@@ -2432,6 +2434,7 @@ release:
 int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
 {
 	struct extent_buffer *eb = btrfs_backref_get_eb(iter);
+	struct btrfs_root *extent_root;
 	struct btrfs_path *path = iter->path;
 	struct btrfs_extent_inline_ref *iref;
 	int ret;
@@ -2462,7 +2465,8 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
 	}
 
 	/* We're at keyed items, there is no inline item, go to the next one */
-	ret = btrfs_next_item(iter->fs_info->extent_root, iter->path);
+	extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr);
+	ret = btrfs_next_item(extent_root, iter->path);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index d3d4200e2afc..7d6372103aaf 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -514,7 +514,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 {
 	struct btrfs_block_group *block_group = caching_ctl->block_group;
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_root *extent_root;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
@@ -529,6 +529,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 		return -ENOMEM;
 
 	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
+	extent_root = btrfs_extent_root(fs_info, last);
 
 #ifdef CONFIG_BTRFS_DEBUG
 	/*
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 454f3c629dd8..b216c4c7292a 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -6,6 +6,7 @@
 #include "space-info.h"
 #include "transaction.h"
 #include "block-group.h"
+#include "disk-io.h"
 
 /*
  * HOW DO BLOCK RESERVES WORK
@@ -351,6 +352,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
 	struct btrfs_space_info *sinfo = block_rsv->space_info;
+	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, 0);
 	u64 num_bytes;
 	unsigned min_items;
 
@@ -359,7 +361,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	 * checksum tree and the root tree.  If the fs is empty we want to set
 	 * it to a minimal amount for safety.
 	 */
-	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
+	num_bytes = btrfs_root_used(&extent_root->root_item) +
 		btrfs_root_used(&fs_info->csum_root->root_item) +
 		btrfs_root_used(&fs_info->tree_root->root_item);
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6474e3094dae..aa4f0ad558d4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -621,7 +621,7 @@ enum btrfs_exclusive_operation {
 struct btrfs_fs_info {
 	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
 	unsigned long flags;
-	struct btrfs_root *extent_root;
+	struct btrfs_root *_extent_root;
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ff5ec8218501..1b69f8e52939 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1557,7 +1557,7 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
 	if (objectid == BTRFS_ROOT_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->tree_root);
 	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->extent_root);
+		return btrfs_grab_root(fs_info->_extent_root);
 	if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->chunk_root);
 	if (objectid == BTRFS_DEV_TREE_OBJECTID)
@@ -1630,7 +1630,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_free_ref_cache(fs_info);
 	kfree(fs_info->balance_ctl);
 	kfree(fs_info->delayed_root);
-	btrfs_put_root(fs_info->extent_root);
+	btrfs_put_root(fs_info->_extent_root);
 	btrfs_put_root(fs_info->tree_root);
 	btrfs_put_root(fs_info->chunk_root);
 	btrfs_put_root(fs_info->dev_root);
@@ -2008,6 +2008,7 @@ static void backup_super_roots(struct btrfs_fs_info *info)
 {
 	const int next_backup = info->backup_root_index;
 	struct btrfs_root_backup *root_backup;
+	struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
 
 	root_backup = info->super_for_commit->super_roots + next_backup;
 
@@ -2032,11 +2033,11 @@ static void backup_super_roots(struct btrfs_fs_info *info)
 	btrfs_set_backup_chunk_root_level(root_backup,
 			       btrfs_header_level(info->chunk_root->node));
 
-	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+	btrfs_set_backup_extent_root(root_backup, extent_root->node->start);
 	btrfs_set_backup_extent_root_gen(root_backup,
-			       btrfs_header_generation(info->extent_root->node));
+			       btrfs_header_generation(extent_root->node));
 	btrfs_set_backup_extent_root_level(root_backup,
-			       btrfs_header_level(info->extent_root->node));
+			       btrfs_header_level(extent_root->node));
 
 	/*
 	 * we might commit during log recovery, which happens before we set
@@ -2166,7 +2167,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
 	free_root_extent_buffers(info->tree_root);
 
 	free_root_extent_buffers(info->dev_root);
-	free_root_extent_buffers(info->extent_root);
+	free_root_extent_buffers(info->_extent_root);
 	free_root_extent_buffers(info->csum_root);
 	free_root_extent_buffers(info->quota_root);
 	free_root_extent_buffers(info->uuid_root);
@@ -2456,7 +2457,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 		}
 	} else {
 		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-		fs_info->extent_root = root;
+		fs_info->_extent_root = root;
 	}
 
 	location.objectid = BTRFS_DEV_TREE_OBJECTID;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index baca29523d35..e2824c6ada72 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -103,9 +103,15 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
 	return NULL;
 }
 
+static inline struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info,
+						   u64 bytenr)
+{
+	return fs_info->_extent_root;
+}
+
 static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
 {
-	return fs_info->extent_root;
+	return btrfs_extent_root(fs_info, 0);
 }
 
 void btrfs_put_root(struct btrfs_root *root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ae25ca54cbe8..8e5dcb9928f5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -87,6 +87,7 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
 /* simple helper to search for an existing data extent at a given offset */
 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
 {
+	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -98,7 +99,7 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
 	key.objectid = start;
 	key.offset = len;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
-	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -116,6 +117,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
 			     u64 offset, int metadata, u64 *refs, u64 *flags)
 {
+	struct btrfs_root *extent_root;
 	struct btrfs_delayed_ref_head *head;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_path *path;
@@ -153,7 +155,8 @@ search_again:
 	else
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 
-	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+	extent_root = btrfs_extent_root(fs_info, bytenr);
+	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out_free;
 
@@ -443,7 +446,7 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
 					   u64 root_objectid,
 					   u64 owner, u64 offset)
 {
-	struct btrfs_root *root = trans->fs_info->extent_root;
+	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
 	struct btrfs_key key;
 	struct btrfs_extent_data_ref *ref;
 	struct extent_buffer *leaf;
@@ -519,7 +522,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
 					   u64 root_objectid, u64 owner,
 					   u64 offset, int refs_to_add)
 {
-	struct btrfs_root *root = trans->fs_info->extent_root;
+	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	u32 size;
@@ -686,7 +689,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
 					  u64 bytenr, u64 parent,
 					  u64 root_objectid)
 {
-	struct btrfs_root *root = trans->fs_info->extent_root;
+	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
 	struct btrfs_key key;
 	int ret;
 
@@ -710,6 +713,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
 					  u64 bytenr, u64 parent,
 					  u64 root_objectid)
 {
+	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
 	struct btrfs_key key;
 	int ret;
 
@@ -722,8 +726,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
 		key.offset = root_objectid;
 	}
 
-	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
-				      path, &key, 0);
+	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
 	btrfs_release_path(path);
 	return ret;
 }
@@ -788,7 +791,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 				 u64 owner, u64 offset, int insert)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	struct btrfs_extent_item *ei;
@@ -1574,6 +1577,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 				 struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_root *root;
 	struct btrfs_key key;
 	struct btrfs_path *path;
 	struct btrfs_extent_item *ei;
@@ -1603,8 +1607,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 		key.offset = head->num_bytes;
 	}
 
+	root = btrfs_extent_root(fs_info, key.objectid);
 again:
-	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0) {
 		err = ret;
 		goto out;
@@ -2287,7 +2292,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
 					bool strict)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
 	struct extent_buffer *leaf;
 	struct btrfs_extent_data_ref *ref;
 	struct btrfs_extent_inline_ref *iref;
@@ -2922,7 +2927,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *info = trans->fs_info;
 	struct btrfs_key key;
 	struct btrfs_path *path;
-	struct btrfs_root *extent_root = info->extent_root;
+	struct btrfs_root *extent_root;
 	struct extent_buffer *leaf;
 	struct btrfs_extent_item *ei;
 	struct btrfs_extent_inline_ref *iref;
@@ -2938,6 +2943,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	int last_ref = 0;
 	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
 
+	extent_root = btrfs_extent_root(info, bytenr);
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -4572,6 +4579,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				      struct btrfs_key *ins, int ref_mod)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_root *extent_root;
 	int ret;
 	struct btrfs_extent_item *extent_item;
 	struct btrfs_extent_inline_ref *iref;
@@ -4591,8 +4599,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
-				      ins, size);
+	extent_root = btrfs_extent_root(fs_info, ins->objectid);
+	ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
 	if (ret) {
 		btrfs_free_path(path);
 		return ret;
@@ -4644,6 +4652,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 				     struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_root *extent_root;
 	int ret;
 	struct btrfs_extent_item *extent_item;
 	struct btrfs_key extent_key;
@@ -4675,8 +4684,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
-				      &extent_key, size);
+	extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
+	ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
+				      size);
 	if (ret) {
 		btrfs_free_path(path);
 		return ret;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 3abec44c6255..4e9bf940a566 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1046,7 +1046,7 @@ out:
 static int populate_free_space_tree(struct btrfs_trans_handle *trans,
 				    struct btrfs_block_group *block_group)
 {
-	struct btrfs_root *extent_root = trans->fs_info->extent_root;
+	struct btrfs_root *extent_root;
 	struct btrfs_path *path, *path2;
 	struct btrfs_key key;
 	u64 start, end;
@@ -1080,6 +1080,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.offset = 0;
 
+	extent_root = btrfs_extent_root(trans->fs_info, key.objectid);
 	ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
 	if (ret < 0)
 		goto out_locked;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 071f7334f818..8928275823a1 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3161,6 +3161,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
 			      struct btrfs_path *path)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_root *extent_root;
 	struct btrfs_key found;
 	struct extent_buffer *scratch_leaf = NULL;
 	struct ulist *roots = NULL;
@@ -3170,7 +3171,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
 	int ret;
 
 	mutex_lock(&fs_info->qgroup_rescan_lock);
-	ret = btrfs_search_slot_for_read(fs_info->extent_root,
+	extent_root = btrfs_extent_root(fs_info,
+				fs_info->qgroup_rescan_progress.objectid);
+	ret = btrfs_search_slot_for_read(extent_root,
 					 &fs_info->qgroup_rescan_progress,
 					 path, 1, 0);
 
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index f34130d90dee..a248f46cfe72 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -972,6 +972,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
 /* Walk down all roots and build the ref tree, meant to be called at mount */
 int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
 {
+	struct btrfs_root *extent_root;
 	struct btrfs_path *path;
 	struct extent_buffer *eb;
 	int tree_block_level = 0;
@@ -985,7 +986,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
 	if (!path)
 		return -ENOMEM;
 
-	eb = btrfs_read_lock_root_node(fs_info->extent_root);
+	extent_root = btrfs_extent_root(fs_info, 0);
+	eb = btrfs_read_lock_root_node(extent_root);
 	level = btrfs_header_level(eb);
 	path->nodes[level] = eb;
 	path->slots[level] = 0;
@@ -998,7 +1000,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
 		 * would have had to added a ref key item which may appear on a
 		 * different leaf from the original extent item.
 		 */
-		ret = walk_down_tree(fs_info->extent_root, path, level,
+		ret = walk_down_tree(extent_root, path, level,
 				     &bytenr, &num_bytes, &tree_block_level);
 		if (ret)
 			break;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a455a1ead0d6..681fb2544a05 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3964,7 +3964,7 @@ static const char *stage_to_string(int stage)
 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
 {
 	struct btrfs_block_group *bg;
-	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
 	struct reloc_control *rc;
 	struct inode *inode;
 	struct btrfs_path *path;
@@ -4215,7 +4215,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 		goto out_end;
 	}
 
-	rc->extent_root = fs_info->extent_root;
+	rc->extent_root = btrfs_extent_root(fs_info, 0);
 
 	set_reloc_control(rc);
 
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d175c5ab1134..a0017351e558 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2897,7 +2897,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 						  u64 logic_end)
 {
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start);
 	struct btrfs_root *csum_root = fs_info->csum_root;
 	struct btrfs_extent_item *extent;
 	struct btrfs_io_context *bioc = NULL;
@@ -3168,7 +3168,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 {
 	struct btrfs_path *path, *ppath;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *root;
 	struct btrfs_root *csum_root = fs_info->csum_root;
 	struct btrfs_extent_item *extent;
 	struct blk_plug plug;
@@ -3262,6 +3262,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 		   atomic_read(&sctx->bios_in_flight) == 0);
 	scrub_blocked_if_needed(fs_info);
 
+	root = btrfs_extent_root(fs_info, logical);
+
 	/* FIXME it might be better to start readahead at commit root */
 	key.objectid = logical;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 87ae4aed3b06..13428dec04d3 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -1036,7 +1036,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	root->fs_info->extent_root = root;
+	root->fs_info->_extent_root = root;
 
 	ret = test_extents(cache);
 	if (ret)
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 19ba7d5b7d8f..88e19781e83f 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -455,7 +455,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
 	}
 
 	/* We are using this root as our extent root */
-	root->fs_info->extent_root = root;
+	root->fs_info->_extent_root = root;
 
 	/*
 	 * Some of the paths we test assume we have a filled out fs_info, so we
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c9f85c412b23..cc83ae624b4b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -419,7 +419,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
 
 	if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 	    root->last_trans < trans->transid) || force) {
-		WARN_ON(root == fs_info->extent_root);
+		WARN_ON(root == fs_info->_extent_root);
 		WARN_ON(!force && root->commit_root != root->node);
 
 		/*
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index b06059a5db2a..5ef430f02fd2 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1154,7 +1154,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
 				   u64 *offset_ret)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
-	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *root;
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
@@ -1169,6 +1169,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
 	key.type = 0;
 	key.offset = 0;
 
+	root = btrfs_extent_root(fs_info, key.objectid);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	/* We should not find the exact match */
 	if (!ret)
-- 
cgit v1.2.3-59-g8ed1b


From 84d2d6c7016513f59d98da30da486af3f5244b04 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:46 -0400
Subject: btrfs: fix csum assert to check objectid of the root

In the future we may have multiple csum roots, so simply check the
objectid is for a csum root instead of checking against ->csum_root.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file-item.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 0f2e2ab34828..3acc8b2b5b84 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -801,7 +801,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 	const u32 csum_size = fs_info->csum_size;
 	u32 blocksize_bits = fs_info->sectorsize_bits;
 
-	ASSERT(root == fs_info->csum_root ||
+	ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
 	       root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
 
 	path = btrfs_alloc_path();
-- 
cgit v1.2.3-59-g8ed1b


From 056c83111648a92233f0445bb4a6c1aeafe6be98 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:47 -0400
Subject: btrfs: set BTRFS_FS_STATE_NO_CSUMS if we fail to load the csum root

We have a few places where we skip doing csums if we mounted with one of
the rescue options that ignores bad csum roots.  In the future when
there are multiple csum roots it'll be costly to check and see if there
are any missing csum roots, so simply add a flag to indicate the fs
should skip loading csums in case of errors.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 3 ++-
 fs/btrfs/ctree.h       | 2 ++
 fs/btrfs/disk-io.c     | 5 +++++
 fs/btrfs/file-item.c   | 3 ++-
 fs/btrfs/inode.c       | 4 ++--
 5 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 32da97c3c19d..e776956d5bc9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -157,7 +157,8 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
 	struct compressed_bio *cb = bio->bi_private;
 	u8 *cb_sum = cb->sums;
 
-	if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
+	if ((inode->flags & BTRFS_INODE_NODATASUM) ||
+	    test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
 		return 0;
 
 	shash->tfm = fs_info->csum_shash;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aa4f0ad558d4..1d4cdd1d153c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -143,6 +143,8 @@ enum {
 	BTRFS_FS_STATE_DEV_REPLACING,
 	/* The btrfs_fs_info created for self-tests */
 	BTRFS_FS_STATE_DUMMY_FS_INFO,
+
+	BTRFS_FS_STATE_NO_CSUMS,
 };
 
 #define BTRFS_BACKREF_REV_MAX		256
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1b69f8e52939..19cea74a848f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2482,11 +2482,16 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
 				ret = PTR_ERR(root);
 				goto out;
+			} else {
+				set_bit(BTRFS_FS_STATE_NO_CSUMS,
+					&fs_info->fs_state);
 			}
 		} else {
 			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
 			fs_info->csum_root = root;
 		}
+	} else {
+		set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
 	}
 
 	/*
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 3acc8b2b5b84..2517b0050b99 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -376,7 +376,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
 	const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
 	int count = 0;
 
-	if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+	if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
+	    test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
 		return BLK_STS_OK;
 
 	/*
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 91f7ed27e421..2ea28f8a4a43 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2516,7 +2516,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
 	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
 	skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
-		   !fs_info->csum_root;
+		test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
 
 	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
 		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
@@ -3314,7 +3314,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		return 0;
 
-	if (!root->fs_info->csum_root)
+	if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
 		return 0;
 
 	ASSERT(page_offset(page) <= start &&
-- 
cgit v1.2.3-59-g8ed1b


From fc28b25e1f428651133326300d9572b42aae6d8f Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:48 -0400
Subject: btrfs: stop accessing ->csum_root directly

We are going to have multiple csum roots in the future, so convert all
users of ->csum_root to btrfs_csum_root() and rename ->csum_root to
->_csum_root so we can easily find remaining users in the future.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-rsv.c   |  8 +++++---
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/disk-io.c     | 15 ++++++++-------
 fs/btrfs/disk-io.h     |  6 ++++++
 fs/btrfs/extent-tree.c | 11 ++++++++---
 fs/btrfs/file-item.c   |  4 +++-
 fs/btrfs/inode.c       | 11 ++++++++---
 fs/btrfs/relocation.c  |  4 +++-
 fs/btrfs/scrub.c       |  7 +++++--
 fs/btrfs/tree-log.c    | 19 +++++++++++++------
 10 files changed, 60 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index b216c4c7292a..21ac60ec19f6 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -353,6 +353,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
 	struct btrfs_space_info *sinfo = block_rsv->space_info;
 	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, 0);
+	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, 0);
 	u64 num_bytes;
 	unsigned min_items;
 
@@ -362,7 +363,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	 * it to a minimal amount for safety.
 	 */
 	num_bytes = btrfs_root_used(&extent_root->root_item) +
-		btrfs_root_used(&fs_info->csum_root->root_item) +
+		btrfs_root_used(&csum_root->root_item) +
 		btrfs_root_used(&fs_info->tree_root->root_item);
 
 	/*
@@ -476,8 +477,9 @@ static struct btrfs_block_rsv *get_block_rsv(
 	struct btrfs_block_rsv *block_rsv = NULL;
 
 	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
-	    (root == fs_info->csum_root && trans->adding_csums) ||
-	    (root == fs_info->uuid_root))
+	    (root == fs_info->uuid_root) ||
+	    (trans->adding_csums &&
+	     root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID))
 		block_rsv = trans->block_rsv;
 
 	if (!block_rsv)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1d4cdd1d153c..5a6d53649eee 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -628,7 +628,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
 	struct btrfs_root *fs_root;
-	struct btrfs_root *csum_root;
+	struct btrfs_root *_csum_root;
 	struct btrfs_root *quota_root;
 	struct btrfs_root *uuid_root;
 	struct btrfs_root *free_space_root;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 19cea74a848f..4b4a8dae0b1c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1563,7 +1563,7 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
 	if (objectid == BTRFS_DEV_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->dev_root);
 	if (objectid == BTRFS_CSUM_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->csum_root);
+		return btrfs_grab_root(fs_info->_csum_root);
 	if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->quota_root) ?
 			fs_info->quota_root : ERR_PTR(-ENOENT);
@@ -1634,7 +1634,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_put_root(fs_info->tree_root);
 	btrfs_put_root(fs_info->chunk_root);
 	btrfs_put_root(fs_info->dev_root);
-	btrfs_put_root(fs_info->csum_root);
+	btrfs_put_root(fs_info->_csum_root);
 	btrfs_put_root(fs_info->quota_root);
 	btrfs_put_root(fs_info->uuid_root);
 	btrfs_put_root(fs_info->free_space_root);
@@ -2009,6 +2009,7 @@ static void backup_super_roots(struct btrfs_fs_info *info)
 	const int next_backup = info->backup_root_index;
 	struct btrfs_root_backup *root_backup;
 	struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
+	struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
 
 	root_backup = info->super_for_commit->super_roots + next_backup;
 
@@ -2058,11 +2059,11 @@ static void backup_super_roots(struct btrfs_fs_info *info)
 	btrfs_set_backup_dev_root_level(root_backup,
 				       btrfs_header_level(info->dev_root->node));
 
-	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+	btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
 	btrfs_set_backup_csum_root_gen(root_backup,
-			       btrfs_header_generation(info->csum_root->node));
+				       btrfs_header_generation(csum_root->node));
 	btrfs_set_backup_csum_root_level(root_backup,
-			       btrfs_header_level(info->csum_root->node));
+					 btrfs_header_level(csum_root->node));
 
 	btrfs_set_backup_total_bytes(root_backup,
 			     btrfs_super_total_bytes(info->super_copy));
@@ -2168,7 +2169,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
 
 	free_root_extent_buffers(info->dev_root);
 	free_root_extent_buffers(info->_extent_root);
-	free_root_extent_buffers(info->csum_root);
+	free_root_extent_buffers(info->_csum_root);
 	free_root_extent_buffers(info->quota_root);
 	free_root_extent_buffers(info->uuid_root);
 	free_root_extent_buffers(info->fs_root);
@@ -2488,7 +2489,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 			}
 		} else {
 			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-			fs_info->csum_root = root;
+			fs_info->_csum_root = root;
 		}
 	} else {
 		set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index e2824c6ada72..a4d1788acd24 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -109,6 +109,12 @@ static inline struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info
 	return fs_info->_extent_root;
 }
 
+static inline struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info,
+						 u64 bytenr)
+{
+	return fs_info->_csum_root;
+}
+
 static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
 {
 	return btrfs_extent_root(fs_info, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8e5dcb9928f5..c17caa954dd5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1851,8 +1851,11 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 	if (head->must_insert_reserved) {
 		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
 		if (head->is_data) {
-			ret = btrfs_del_csums(trans, fs_info->csum_root,
-					      head->bytenr, head->num_bytes);
+			struct btrfs_root *csum_root;
+
+			csum_root = btrfs_csum_root(fs_info, head->bytenr);
+			ret = btrfs_del_csums(trans, csum_root, head->bytenr,
+					      head->num_bytes);
 		}
 	}
 
@@ -3188,7 +3191,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		btrfs_release_path(path);
 
 		if (is_data) {
-			ret = btrfs_del_csums(trans, info->csum_root, bytenr,
+			struct btrfs_root *csum_root;
+			csum_root = btrfs_csum_root(info, bytenr);
+			ret = btrfs_del_csums(trans, csum_root, bytenr,
 					      num_bytes);
 			if (ret) {
 				btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 2517b0050b99..90c5c38836ab 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -257,6 +257,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
 			    struct btrfs_path *path, u64 disk_bytenr,
 			    u64 len, u8 *dst)
 {
+	struct btrfs_root *csum_root;
 	struct btrfs_csum_item *item = NULL;
 	struct btrfs_key key;
 	const u32 sectorsize = fs_info->sectorsize;
@@ -285,7 +286,8 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
 
 	/* Current item doesn't contain the desired range, search again */
 	btrfs_release_path(path);
-	item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0);
+	csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+	item = btrfs_lookup_csum(NULL, csum_root, path, disk_bytenr, 0);
 	if (IS_ERR(item)) {
 		ret = PTR_ERR(item);
 		goto out;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2ea28f8a4a43..504cf090fc88 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1530,11 +1530,12 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
 					u64 bytenr, u64 num_bytes)
 {
-	int ret;
+	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
 	struct btrfs_ordered_sum *sums;
+	int ret;
 	LIST_HEAD(list);
 
-	ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
+	ret = btrfs_lookup_csums_range(csum_root, bytenr,
 				       bytenr + num_bytes - 1, &list, 0);
 	if (ret == 0 && list_empty(&list))
 		return 0;
@@ -2584,11 +2585,15 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,
 			     struct list_head *list)
 {
 	struct btrfs_ordered_sum *sum;
+	struct btrfs_root *csum_root = NULL;
 	int ret;
 
 	list_for_each_entry(sum, list, list) {
 		trans->adding_csums = true;
-		ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
+		if (!csum_root)
+			csum_root = btrfs_csum_root(trans->fs_info,
+						    sum->bytenr);
+		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
 		trans->adding_csums = false;
 		if (ret)
 			return ret;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 681fb2544a05..831c55708c88 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4306,6 +4306,7 @@ out:
 int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_root *csum_root;
 	struct btrfs_ordered_sum *sums;
 	struct btrfs_ordered_extent *ordered;
 	int ret;
@@ -4317,7 +4318,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
 	BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
 
 	disk_bytenr = file_pos + inode->index_cnt;
-	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
+	csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+	ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
 				       disk_bytenr + len - 1, &list, 0);
 	if (ret)
 		goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a0017351e558..15a123e67108 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2898,7 +2898,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 {
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start);
-	struct btrfs_root *csum_root = fs_info->csum_root;
+	struct btrfs_root *csum_root;
 	struct btrfs_extent_item *extent;
 	struct btrfs_io_context *bioc = NULL;
 	u64 flags;
@@ -3060,6 +3060,7 @@ again:
 			extent_dev = bioc->stripes[0].dev;
 			btrfs_put_bioc(bioc);
 
+			csum_root = btrfs_csum_root(fs_info, extent_logical);
 			ret = btrfs_lookup_csums_range(csum_root,
 						extent_logical,
 						extent_logical + extent_len - 1,
@@ -3169,7 +3170,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	struct btrfs_path *path, *ppath;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	struct btrfs_root *root;
-	struct btrfs_root *csum_root = fs_info->csum_root;
+	struct btrfs_root *csum_root;
 	struct btrfs_extent_item *extent;
 	struct blk_plug plug;
 	u64 flags;
@@ -3273,6 +3274,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	key_end.offset = (u64)-1;
 	reada1 = btrfs_reada_add(root, &key, &key_end);
 
+	csum_root = btrfs_csum_root(fs_info, logical);
+
 	if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
 		key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
 		key.type = BTRFS_EXTENT_CSUM_KEY;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9165486b554e..7c4634ac511e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -872,17 +872,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 			 */
 			while (!list_empty(&ordered_sums)) {
 				struct btrfs_ordered_sum *sums;
+				struct btrfs_root *csum_root;
+
 				sums = list_entry(ordered_sums.next,
 						struct btrfs_ordered_sum,
 						list);
+				csum_root = btrfs_csum_root(fs_info,
+							    sums->bytenr);
 				if (!ret)
-					ret = btrfs_del_csums(trans,
-							      fs_info->csum_root,
+					ret = btrfs_del_csums(trans, csum_root,
 							      sums->bytenr,
 							      sums->len);
 				if (!ret)
 					ret = btrfs_csum_file_blocks(trans,
-						fs_info->csum_root, sums);
+								     csum_root,
+								     sums);
 				list_del(&sums->list);
 				kfree(sums);
 			}
@@ -4340,6 +4344,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 
 			found_type = btrfs_file_extent_type(src, extent);
 			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				struct btrfs_root *csum_root;
 				u64 ds, dl, cs, cl;
 				ds = btrfs_file_extent_disk_bytenr(src,
 								extent);
@@ -4358,8 +4363,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 					cl = dl;
 				}
 
-				ret = btrfs_lookup_csums_range(
-						fs_info->csum_root,
+				csum_root = btrfs_csum_root(fs_info, ds);
+				ret = btrfs_lookup_csums_range(csum_root,
 						ds + cs, ds + cs + cl - 1,
 						&ordered_sums, 0);
 				if (ret)
@@ -4411,6 +4416,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
 			    struct btrfs_log_ctx *ctx)
 {
 	struct btrfs_ordered_extent *ordered;
+	struct btrfs_root *csum_root;
 	u64 csum_offset;
 	u64 csum_len;
 	u64 mod_start = em->mod_start;
@@ -4491,7 +4497,8 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
 	}
 
 	/* block start is already adjusted for the file extent offset. */
-	ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
+	csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
+	ret = btrfs_lookup_csums_range(csum_root,
 				       em->block_start + csum_offset,
 				       em->block_start + csum_offset +
 				       csum_len - 1, &ordered_sums, 0);
-- 
cgit v1.2.3-59-g8ed1b


From 7939dd9f35f6f76f5ee199851bcb4d89cd7be061 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:49 -0400
Subject: btrfs: stop accessing ->free_space_root directly

We're going to have multiple free space roots in the future, so adjust
all the users of the free space root to use a helper to access the root.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h                       |  2 +-
 fs/btrfs/disk-io.c                     | 10 +++++-----
 fs/btrfs/free-space-tree.c             | 32 +++++++++++++++++++-------------
 fs/btrfs/tests/free-space-tree-tests.c |  2 +-
 4 files changed, 26 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5a6d53649eee..e1511fbaf851 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -631,7 +631,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *_csum_root;
 	struct btrfs_root *quota_root;
 	struct btrfs_root *uuid_root;
-	struct btrfs_root *free_space_root;
+	struct btrfs_root *_free_space_root;
 	struct btrfs_root *data_reloc_root;
 
 	/* the log root tree is a directory of all the other log roots */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b4a8dae0b1c..066df9869bb1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1571,8 +1571,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
 		return btrfs_grab_root(fs_info->uuid_root) ?
 			fs_info->uuid_root : ERR_PTR(-ENOENT);
 	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->free_space_root) ?
-			fs_info->free_space_root : ERR_PTR(-ENOENT);
+		return btrfs_grab_root(fs_info->_free_space_root) ?
+			fs_info->_free_space_root : ERR_PTR(-ENOENT);
 	return NULL;
 }
 
@@ -1637,7 +1637,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_put_root(fs_info->_csum_root);
 	btrfs_put_root(fs_info->quota_root);
 	btrfs_put_root(fs_info->uuid_root);
-	btrfs_put_root(fs_info->free_space_root);
+	btrfs_put_root(fs_info->_free_space_root);
 	btrfs_put_root(fs_info->fs_root);
 	btrfs_put_root(fs_info->data_reloc_root);
 	btrfs_check_leaked_roots(fs_info);
@@ -2176,7 +2176,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
 	free_root_extent_buffers(info->data_reloc_root);
 	if (free_chunk_root)
 		free_root_extent_buffers(info->chunk_root);
-	free_root_extent_buffers(info->free_space_root);
+	free_root_extent_buffers(info->_free_space_root);
 }
 
 void btrfs_put_root(struct btrfs_root *root)
@@ -2542,7 +2542,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 			}
 		}  else {
 			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-			fs_info->free_space_root = root;
+			fs_info->_free_space_root = root;
 		}
 	}
 
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 4e9bf940a566..51b7cb165e9b 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -16,6 +16,12 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
 					struct btrfs_block_group *block_group,
 					struct btrfs_path *path);
 
+static struct btrfs_root *btrfs_free_space_root(
+				struct btrfs_block_group *block_group)
+{
+	return block_group->fs_info->_free_space_root;
+}
+
 void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
 {
 	u32 bitmap_range;
@@ -51,7 +57,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
 				   struct btrfs_block_group *block_group,
 				   struct btrfs_path *path)
 {
-	struct btrfs_root *root = trans->fs_info->free_space_root;
+	struct btrfs_root *root = btrfs_free_space_root(block_group);
 	struct btrfs_free_space_info *info;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
@@ -85,7 +91,7 @@ struct btrfs_free_space_info *search_free_space_info(
 		struct btrfs_path *path, int cow)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_root *root = btrfs_free_space_root(block_group);
 	struct btrfs_key key;
 	int ret;
 
@@ -188,7 +194,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
 				  struct btrfs_path *path)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_root *root = btrfs_free_space_root(block_group);
 	struct btrfs_free_space_info *info;
 	struct btrfs_key key, found_key;
 	struct extent_buffer *leaf;
@@ -326,7 +332,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
 				  struct btrfs_path *path)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_root *root = btrfs_free_space_root(block_group);
 	struct btrfs_free_space_info *info;
 	struct btrfs_key key, found_key;
 	struct extent_buffer *leaf;
@@ -586,7 +592,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
 				    struct btrfs_path *path,
 				    u64 start, u64 size, int remove)
 {
-	struct btrfs_root *root = block_group->fs_info->free_space_root;
+	struct btrfs_root *root = btrfs_free_space_root(block_group);
 	struct btrfs_key key;
 	u64 end = start + size;
 	u64 cur_start, cur_size;
@@ -699,7 +705,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
 				    struct btrfs_path *path,
 				    u64 start, u64 size)
 {
-	struct btrfs_root *root = trans->fs_info->free_space_root;
+	struct btrfs_root *root = btrfs_free_space_root(block_group);
 	struct btrfs_key key;
 	u64 found_start, found_end;
 	u64 end = start + size;
@@ -851,7 +857,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
 				 struct btrfs_path *path,
 				 u64 start, u64 size)
 {
-	struct btrfs_root *root = trans->fs_info->free_space_root;
+	struct btrfs_root *root = btrfs_free_space_root(block_group);
 	struct btrfs_key key, new_key;
 	u64 found_start, found_end;
 	u64 end = start + size;
@@ -1158,7 +1164,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
 		ret = PTR_ERR(free_space_root);
 		goto abort;
 	}
-	fs_info->free_space_root = free_space_root;
+	fs_info->_free_space_root = free_space_root;
 
 	node = rb_first(&fs_info->block_group_cache_tree);
 	while (node) {
@@ -1233,7 +1239,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *tree_root = fs_info->tree_root;
-	struct btrfs_root *free_space_root = fs_info->free_space_root;
+	struct btrfs_root *free_space_root = fs_info->_free_space_root;
 	int ret;
 
 	trans = btrfs_start_transaction(tree_root, 0);
@@ -1242,7 +1248,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
 
 	btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
 	btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
-	fs_info->free_space_root = NULL;
+	fs_info->_free_space_root = NULL;
 
 	ret = clear_free_space_tree(trans, free_space_root);
 	if (ret)
@@ -1320,7 +1326,7 @@ out:
 int remove_block_group_free_space(struct btrfs_trans_handle *trans,
 				  struct btrfs_block_group *block_group)
 {
-	struct btrfs_root *root = trans->fs_info->free_space_root;
+	struct btrfs_root *root = btrfs_free_space_root(block_group);
 	struct btrfs_path *path;
 	struct btrfs_key key, found_key;
 	struct extent_buffer *leaf;
@@ -1411,7 +1417,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
 
 	block_group = caching_ctl->block_group;
 	fs_info = block_group->fs_info;
-	root = fs_info->free_space_root;
+	root = btrfs_free_space_root(block_group);
 
 	end = block_group->start + block_group->length;
 
@@ -1489,7 +1495,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
 
 	block_group = caching_ctl->block_group;
 	fs_info = block_group->fs_info;
-	root = fs_info->free_space_root;
+	root = btrfs_free_space_root(block_group);
 
 	end = block_group->start + block_group->length;
 
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 2c783d2f5228..7d6de8b53038 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -446,7 +446,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
 
 	btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
 					BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
-	root->fs_info->free_space_root = root;
+	root->fs_info->_free_space_root = root;
 	root->fs_info->tree_root = root;
 
 	root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
-- 
cgit v1.2.3-59-g8ed1b


From 7fcf8a0050df003776d10602c9c52b57212f3345 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:50 -0400
Subject: btrfs: remove useless WARN_ON in record_root_in_trans

We don't set SHAREABLE on the extent root, so we don't need this safety
check here.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cc83ae624b4b..5eadb17b971b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -419,7 +419,6 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
 
 	if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 	    root->last_trans < trans->transid) || force) {
-		WARN_ON(root == fs_info->_extent_root);
 		WARN_ON(!force && root->commit_root != root->node);
 
 		/*
-- 
cgit v1.2.3-59-g8ed1b


From abed4aaae4f71a7bcdbe90a65319b6e772a2689d Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 Nov 2021 16:45:51 -0400
Subject: btrfs: track the csum, extent, and free space trees in a rb tree

In the future we are going to have multiple copies of these trees.  To
facilitate this we need a way to look up the different roots we are
looking for.  Handle this by adding a global root rb tree that is
indexed on the root->root_key.  Then instead of loading the roots at
mount time with individually targeted keys, simply search the tree_root
for anything with the specific objectid we want.  This will make it
straightforward to support both old style and new style file systems.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h                       |   9 +-
 fs/btrfs/disk-io.c                     | 276 ++++++++++++++++++++++++++-------
 fs/btrfs/disk-io.h                     |  18 +--
 fs/btrfs/extent-tree.c                 |   1 +
 fs/btrfs/free-space-tree.c             |  23 ++-
 fs/btrfs/tests/btrfs-tests.c           |   1 +
 fs/btrfs/tests/free-space-tests.c      |   5 +-
 fs/btrfs/tests/free-space-tree-tests.c |   5 +-
 fs/btrfs/tests/qgroup-tests.c          |   5 +-
 9 files changed, 263 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e1511fbaf851..aa303669615d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -623,20 +623,21 @@ enum btrfs_exclusive_operation {
 struct btrfs_fs_info {
 	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
 	unsigned long flags;
-	struct btrfs_root *_extent_root;
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
 	struct btrfs_root *fs_root;
-	struct btrfs_root *_csum_root;
 	struct btrfs_root *quota_root;
 	struct btrfs_root *uuid_root;
-	struct btrfs_root *_free_space_root;
 	struct btrfs_root *data_reloc_root;
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
 
+	/* The tree that holds the global roots (csum, extent, etc) */
+	rwlock_t global_root_lock;
+	struct rb_root global_root_tree;
+
 	spinlock_t fs_roots_radix_lock;
 	struct radix_tree_root fs_roots_radix;
 
@@ -1129,6 +1130,8 @@ struct btrfs_qgroup_swapped_blocks {
  * and for the extent tree extent_root root.
  */
 struct btrfs_root {
+	struct rb_node rb_node;
+
 	struct extent_buffer *node;
 
 	struct extent_buffer *commit_root;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 066df9869bb1..7bf01f4fc92e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1149,6 +1149,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->node = NULL;
 	root->commit_root = NULL;
 	root->state = 0;
+	RB_CLEAR_NODE(&root->rb_node);
 
 	root->last_trans = 0;
 	root->free_objectid = 0;
@@ -1242,6 +1243,81 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
 }
 #endif
 
+static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
+{
+	const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
+	const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
+
+	return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
+}
+
+static int global_root_key_cmp(const void *k, const struct rb_node *node)
+{
+	const struct btrfs_key *key = k;
+	const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
+
+	return btrfs_comp_cpu_keys(key, &root->root_key);
+}
+
+int btrfs_global_root_insert(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct rb_node *tmp;
+
+	write_lock(&fs_info->global_root_lock);
+	tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
+	write_unlock(&fs_info->global_root_lock);
+	ASSERT(!tmp);
+
+	return tmp ? -EEXIST : 0;
+}
+
+void btrfs_global_root_delete(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	write_lock(&fs_info->global_root_lock);
+	rb_erase(&root->rb_node, &fs_info->global_root_tree);
+	write_unlock(&fs_info->global_root_lock);
+}
+
+struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
+				     struct btrfs_key *key)
+{
+	struct rb_node *node;
+	struct btrfs_root *root = NULL;
+
+	read_lock(&fs_info->global_root_lock);
+	node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
+	if (node)
+		root = container_of(node, struct btrfs_root, rb_node);
+	read_unlock(&fs_info->global_root_lock);
+
+	return root;
+}
+
+struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_key key = {
+		.objectid = BTRFS_CSUM_TREE_OBJECTID,
+		.type = BTRFS_ROOT_ITEM_KEY,
+		.offset = 0,
+	};
+
+	return btrfs_global_root(fs_info, &key);
+}
+
+struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_key key = {
+		.objectid = BTRFS_EXTENT_TREE_OBJECTID,
+		.type = BTRFS_ROOT_ITEM_KEY,
+		.offset = 0,
+	};
+
+	return btrfs_global_root(fs_info, &key);
+}
+
 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 				     u64 objectid)
 {
@@ -1554,25 +1630,33 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
 static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
 						u64 objectid)
 {
+	struct btrfs_key key = {
+		.objectid = objectid,
+		.type = BTRFS_ROOT_ITEM_KEY,
+		.offset = 0,
+	};
+
 	if (objectid == BTRFS_ROOT_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->tree_root);
 	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->_extent_root);
+		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
 	if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->chunk_root);
 	if (objectid == BTRFS_DEV_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->dev_root);
 	if (objectid == BTRFS_CSUM_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->_csum_root);
+		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
 	if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->quota_root) ?
 			fs_info->quota_root : ERR_PTR(-ENOENT);
 	if (objectid == BTRFS_UUID_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->uuid_root) ?
 			fs_info->uuid_root : ERR_PTR(-ENOENT);
-	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
-		return btrfs_grab_root(fs_info->_free_space_root) ?
-			fs_info->_free_space_root : ERR_PTR(-ENOENT);
+	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+		struct btrfs_root *root = btrfs_global_root(fs_info, &key);
+
+		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
+	}
 	return NULL;
 }
 
@@ -1619,6 +1703,18 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
 #endif
 }
 
+static void free_global_roots(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root;
+	struct rb_node *node;
+
+	while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
+		root = rb_entry(node, struct btrfs_root, rb_node);
+		rb_erase(&root->rb_node, &fs_info->global_root_tree);
+		btrfs_put_root(root);
+	}
+}
+
 void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 {
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
@@ -1630,14 +1726,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_free_ref_cache(fs_info);
 	kfree(fs_info->balance_ctl);
 	kfree(fs_info->delayed_root);
-	btrfs_put_root(fs_info->_extent_root);
+	free_global_roots(fs_info);
 	btrfs_put_root(fs_info->tree_root);
 	btrfs_put_root(fs_info->chunk_root);
 	btrfs_put_root(fs_info->dev_root);
-	btrfs_put_root(fs_info->_csum_root);
 	btrfs_put_root(fs_info->quota_root);
 	btrfs_put_root(fs_info->uuid_root);
-	btrfs_put_root(fs_info->_free_space_root);
 	btrfs_put_root(fs_info->fs_root);
 	btrfs_put_root(fs_info->data_reloc_root);
 	btrfs_check_leaked_roots(fs_info);
@@ -2162,21 +2256,29 @@ static void free_root_extent_buffers(struct btrfs_root *root)
 	}
 }
 
+static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root, *tmp;
+
+	rbtree_postorder_for_each_entry_safe(root, tmp,
+					     &fs_info->global_root_tree,
+					     rb_node)
+		free_root_extent_buffers(root);
+}
+
 /* helper to cleanup tree roots */
 static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
 {
 	free_root_extent_buffers(info->tree_root);
 
+	free_global_root_pointers(info);
 	free_root_extent_buffers(info->dev_root);
-	free_root_extent_buffers(info->_extent_root);
-	free_root_extent_buffers(info->_csum_root);
 	free_root_extent_buffers(info->quota_root);
 	free_root_extent_buffers(info->uuid_root);
 	free_root_extent_buffers(info->fs_root);
 	free_root_extent_buffers(info->data_reloc_root);
 	if (free_chunk_root)
 		free_root_extent_buffers(info->chunk_root);
-	free_root_extent_buffers(info->_free_space_root);
 }
 
 void btrfs_put_root(struct btrfs_root *root)
@@ -2437,6 +2539,104 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+static int load_global_roots_objectid(struct btrfs_root *tree_root,
+				      struct btrfs_path *path, u64 objectid,
+				      const char *name)
+{
+	struct btrfs_fs_info *fs_info = tree_root->fs_info;
+	struct btrfs_root *root;
+	int ret;
+	struct btrfs_key key = {
+		.objectid = objectid,
+		.type = BTRFS_ROOT_ITEM_KEY,
+		.offset = 0,
+	};
+	bool found = false;
+
+	/* If we have IGNOREDATACSUMS skip loading these roots. */
+	if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
+	    btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
+		set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+		return 0;
+	}
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+		if (ret < 0)
+			break;
+
+		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(tree_root, path);
+			if (ret) {
+				if (ret > 0)
+					ret = 0;
+				break;
+			}
+		}
+		ret = 0;
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid != objectid)
+			break;
+		btrfs_release_path(path);
+
+		found = true;
+		root = read_tree_root_path(tree_root, path, &key);
+		if (IS_ERR(root)) {
+			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+				ret = PTR_ERR(root);
+			break;
+		}
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		ret = btrfs_global_root_insert(root);
+		if (ret) {
+			btrfs_put_root(root);
+			break;
+		}
+		key.offset++;
+	}
+	btrfs_release_path(path);
+
+	if (!found || ret) {
+		if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+			set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+
+		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+			ret = ret ? ret : -ENOENT;
+		else
+			ret = 0;
+		btrfs_err(fs_info, "failed to load root %s", name);
+	}
+	return ret;
+}
+
+static int load_global_roots(struct btrfs_root *tree_root)
+{
+	struct btrfs_path *path;
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = load_global_roots_objectid(tree_root, path,
+					 BTRFS_EXTENT_TREE_OBJECTID, "extent");
+	if (ret)
+		goto out;
+	ret = load_global_roots_objectid(tree_root, path,
+					 BTRFS_CSUM_TREE_OBJECTID, "csum");
+	if (ret)
+		goto out;
+	if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
+		goto out;
+	ret = load_global_roots_objectid(tree_root, path,
+					 BTRFS_FREE_SPACE_TREE_OBJECTID,
+					 "free space");
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *tree_root = fs_info->tree_root;
@@ -2446,22 +2646,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 
 	BUG_ON(!fs_info->tree_root);
 
-	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+	ret = load_global_roots(tree_root);
+	if (ret)
+		return ret;
+
+	location.objectid = BTRFS_DEV_TREE_OBJECTID;
 	location.type = BTRFS_ROOT_ITEM_KEY;
 	location.offset = 0;
 
-	root = btrfs_read_tree_root(tree_root, &location);
-	if (IS_ERR(root)) {
-		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
-			ret = PTR_ERR(root);
-			goto out;
-		}
-	} else {
-		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-		fs_info->_extent_root = root;
-	}
-
-	location.objectid = BTRFS_DEV_TREE_OBJECTID;
 	root = btrfs_read_tree_root(tree_root, &location);
 	if (IS_ERR(root)) {
 		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
@@ -2475,26 +2667,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 	/* Initialize fs_info for all devices in any case */
 	btrfs_init_devices_late(fs_info);
 
-	/* If IGNOREDATACSUMS is set don't bother reading the csum root. */
-	if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
-		location.objectid = BTRFS_CSUM_TREE_OBJECTID;
-		root = btrfs_read_tree_root(tree_root, &location);
-		if (IS_ERR(root)) {
-			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
-				ret = PTR_ERR(root);
-				goto out;
-			} else {
-				set_bit(BTRFS_FS_STATE_NO_CSUMS,
-					&fs_info->fs_state);
-			}
-		} else {
-			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-			fs_info->_csum_root = root;
-		}
-	} else {
-		set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
-	}
-
 	/*
 	 * This tree can share blocks with some other fs tree during relocation
 	 * and we need a proper setup by btrfs_get_fs_root
@@ -2532,20 +2704,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 		fs_info->uuid_root = root;
 	}
 
-	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
-		location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
-		root = btrfs_read_tree_root(tree_root, &location);
-		if (IS_ERR(root)) {
-			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
-				ret = PTR_ERR(root);
-				goto out;
-			}
-		}  else {
-			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-			fs_info->_free_space_root = root;
-		}
-	}
-
 	return 0;
 out:
 	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
@@ -2900,6 +3058,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	spin_lock_init(&fs_info->zone_active_bgs_lock);
 	spin_lock_init(&fs_info->relocation_bg_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
+	rwlock_init(&fs_info->global_root_lock);
 	mutex_init(&fs_info->unused_bg_unpin_mutex);
 	mutex_init(&fs_info->reclaim_bgs_lock);
 	mutex_init(&fs_info->reloc_mutex);
@@ -2934,6 +3093,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	atomic_set(&fs_info->reada_works_cnt, 0);
 	atomic_set(&fs_info->nr_delayed_iputs, 0);
 	atomic64_set(&fs_info->tree_mod_seq, 0);
+	fs_info->global_root_tree = RB_ROOT;
 	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a4d1788acd24..5e8bef4b7563 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,12 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
 struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
 						 struct btrfs_path *path,
 						 u64 objectid);
+int btrfs_global_root_insert(struct btrfs_root *root);
+void btrfs_global_root_delete(struct btrfs_root *root);
+struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
+				     struct btrfs_key *key);
+struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
 
 void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
@@ -103,18 +109,6 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
 	return NULL;
 }
 
-static inline struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info,
-						   u64 bytenr)
-{
-	return fs_info->_extent_root;
-}
-
-static inline struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info,
-						 u64 bytenr)
-{
-	return fs_info->_csum_root;
-}
-
 static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
 {
 	return btrfs_extent_root(fs_info, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c17caa954dd5..7f08c9e8eb28 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2947,6 +2947,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
 
 	extent_root = btrfs_extent_root(info, bytenr);
+	ASSERT(extent_root);
 
 	path = btrfs_alloc_path();
 	if (!path)
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 51b7cb165e9b..655aad0f9e1c 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -19,7 +19,13 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
 static struct btrfs_root *btrfs_free_space_root(
 				struct btrfs_block_group *block_group)
 {
-	return block_group->fs_info->_free_space_root;
+	struct btrfs_key key = {
+		.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+		.type = BTRFS_ROOT_ITEM_KEY,
+		.offset = 0,
+	};
+
+	return btrfs_global_root(block_group->fs_info, &key);
 }
 
 void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
@@ -1164,7 +1170,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
 		ret = PTR_ERR(free_space_root);
 		goto abort;
 	}
-	fs_info->_free_space_root = free_space_root;
+	ret = btrfs_global_root_insert(free_space_root);
+	if (ret) {
+		btrfs_put_root(free_space_root);
+		goto abort;
+	}
 
 	node = rb_first(&fs_info->block_group_cache_tree);
 	while (node) {
@@ -1239,7 +1249,12 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *tree_root = fs_info->tree_root;
-	struct btrfs_root *free_space_root = fs_info->_free_space_root;
+	struct btrfs_key key = {
+		.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+		.type = BTRFS_ROOT_ITEM_KEY,
+		.offset = 0,
+	};
+	struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key);
 	int ret;
 
 	trans = btrfs_start_transaction(tree_root, 0);
@@ -1248,7 +1263,6 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
 
 	btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
 	btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
-	fs_info->_free_space_root = NULL;
 
 	ret = clear_free_space_tree(trans, free_space_root);
 	if (ret)
@@ -1258,6 +1272,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
 	if (ret)
 		goto abort;
 
+	btrfs_global_root_delete(free_space_root);
 	list_del(&free_space_root->dirty_list);
 
 	btrfs_tree_lock(free_space_root->node);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 3a4099a2bf05..d8e56edd6991 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -204,6 +204,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
 	/* Will be freed by btrfs_free_fs_roots */
 	if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
 		return;
+	btrfs_global_root_delete(root);
 	btrfs_put_root(root);
 }
 
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 13428dec04d3..5930cdcae5cb 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -1036,7 +1036,10 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
 		goto out;
 	}
 
-	root->fs_info->_extent_root = root;
+	root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root->root_key.offset = 0;
+	btrfs_global_root_insert(root);
 
 	ret = test_extents(cache);
 	if (ret)
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 7d6de8b53038..13734ed43bfc 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -446,7 +446,10 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
 
 	btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
 					BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
-	root->fs_info->_free_space_root = root;
+	root->root_key.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root->root_key.offset = 0;
+	btrfs_global_root_insert(root);
 	root->fs_info->tree_root = root;
 
 	root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 88e19781e83f..eee1e4459541 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -455,7 +455,10 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
 	}
 
 	/* We are using this root as our extent root */
-	root->fs_info->_extent_root = root;
+	root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root->root_key.offset = 0;
+	btrfs_global_root_insert(root);
 
 	/*
 	 * Some of the paths we test assume we have a filled out fs_info, so we
-- 
cgit v1.2.3-59-g8ed1b


From f594f13c194e50186d52e7995c9b7fbdd315a8dc Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 23 Nov 2021 14:44:19 +0200
Subject: btrfs: consolidate bitmap_clear_bits/__bitmap_clear_bits

The only difference is that the former also adjusts the
btrfs_free_space_ctl::free_space member. Consolidate the two functions
into one and add a bool parameter which controls whether the adjustment
is made or not. No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 37 +++++++++++++------------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a45017b12185..3e66417bc6c4 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -45,7 +45,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl,
 			struct btrfs_free_space *bitmap_info);
 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
 			      struct btrfs_free_space *info, u64 offset,
-			      u64 bytes);
+			      u64 bytes, bool update_stats);
 
 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
 					       struct btrfs_path *path,
@@ -886,7 +886,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
 							   bytes);
 				if (ret)
 					break;
-				bitmap_clear_bits(ctl, info, offset, bytes);
+				bitmap_clear_bits(ctl, info, offset, bytes, true);
 				offset = info->offset;
 				bytes = ctl->unit;
 			}
@@ -1803,9 +1803,9 @@ static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
 	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
 }
 
-static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
-				       struct btrfs_free_space *info,
-				       u64 offset, u64 bytes)
+static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+				     struct btrfs_free_space *info,
+				     u64 offset, u64 bytes, bool update_stat)
 {
 	unsigned long start, count, end;
 	int extent_delta = -1;
@@ -1834,14 +1834,9 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
 		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
 		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
 	}
-}
 
-static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
-			      struct btrfs_free_space *info, u64 offset,
-			      u64 bytes)
-{
-	__bitmap_clear_bits(ctl, info, offset, bytes);
-	ctl->free_space -= bytes;
+	if (update_stat)
+		ctl->free_space -= bytes;
 }
 
 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
@@ -2112,7 +2107,7 @@ again:
 	/* Cannot clear past the end of the bitmap */
 	search_bytes = min(search_bytes, end - search_start + 1);
 
-	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
+	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
 	*offset += search_bytes;
 	*bytes -= search_bytes;
 
@@ -2457,10 +2452,7 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
 	if (!btrfs_free_space_trimmed(bitmap))
 		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
 
-	if (update_stat)
-		bitmap_clear_bits(ctl, bitmap, end, bytes);
-	else
-		__bitmap_clear_bits(ctl, bitmap, end, bytes);
+	bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
 
 	if (!bitmap->bytes)
 		free_bitmap(ctl, bitmap);
@@ -2514,10 +2506,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
 	if (!btrfs_free_space_trimmed(bitmap))
 		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
 
-	if (update_stat)
-		bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
-	else
-		__bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+	bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
 
 	if (!bitmap->bytes)
 		free_bitmap(ctl, bitmap);
@@ -3077,7 +3066,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
 
 	ret = offset;
 	if (entry->bitmap) {
-		bitmap_clear_bits(ctl, entry, offset, bytes);
+		bitmap_clear_bits(ctl, entry, offset, bytes, true);
 
 		if (!btrfs_free_space_trimmed(entry))
 			atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
@@ -3179,7 +3168,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
 	}
 
 	ret = search_start;
-	__bitmap_clear_bits(ctl, entry, ret, bytes);
+	bitmap_clear_bits(ctl, entry, ret, bytes, false);
 
 	return ret;
 }
@@ -3941,7 +3930,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group,
 		    bytes > (max_discard_size + minlen))
 			bytes = max_discard_size;
 
-		bitmap_clear_bits(ctl, entry, start, bytes);
+		bitmap_clear_bits(ctl, entry, start, bytes, true);
 		if (entry->bytes == 0)
 			free_bitmap(ctl, entry);
 
-- 
cgit v1.2.3-59-g8ed1b


From 32e1649b5356d805320b58b407a9bdb164d270f5 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 23 Nov 2021 14:44:20 +0200
Subject: btrfs: consolidate unlink_free_space/__unlink_free_space functions

The only difference between the two is whether
btrfs_free_space_ctl::free_space is adjusted. Instead of having two
separate functions, control this behavior via an additional parameter
and merge them into a single function. No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 40 +++++++++++++++-------------------------
 1 file changed, 15 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 3e66417bc6c4..9acc592ba49f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -37,7 +37,7 @@ struct btrfs_trim_range {
 static int link_free_space(struct btrfs_free_space_ctl *ctl,
 			   struct btrfs_free_space *info);
 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
-			      struct btrfs_free_space *info);
+			      struct btrfs_free_space *info, bool update_stat);
 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 			 struct btrfs_free_space *bitmap_info, u64 *offset,
 			 u64 *bytes, bool for_alloc);
@@ -872,7 +872,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
 	while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (!info->bitmap) {
-			unlink_free_space(ctl, info);
+			unlink_free_space(ctl, info, true);
 			ret = btrfs_add_free_space(block_group, info->offset,
 						   info->bytes);
 			kmem_cache_free(btrfs_free_space_cachep, info);
@@ -1743,9 +1743,9 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 	return entry;
 }
 
-static inline void
-__unlink_free_space(struct btrfs_free_space_ctl *ctl,
-		    struct btrfs_free_space *info)
+static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+				     struct btrfs_free_space *info,
+				     bool update_stat)
 {
 	rb_erase(&info->offset_index, &ctl->free_space_offset);
 	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
@@ -1755,13 +1755,9 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl,
 		ctl->discardable_extents[BTRFS_STAT_CURR]--;
 		ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
 	}
-}
 
-static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
-			      struct btrfs_free_space *info)
-{
-	__unlink_free_space(ctl, info);
-	ctl->free_space -= info->bytes;
+	if (update_stat)
+		ctl->free_space -= info->bytes;
 }
 
 static int link_free_space(struct btrfs_free_space_ctl *ctl,
@@ -2069,7 +2065,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl,
 		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
 
 	}
-	unlink_free_space(ctl, bitmap_info);
+	unlink_free_space(ctl, bitmap_info, true);
 	kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
 	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
 	ctl->total_bitmaps--;
@@ -2400,10 +2396,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
 	/* See try_merge_free_space() comment. */
 	if (right_info && !right_info->bitmap &&
 	    (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
-		if (update_stat)
-			unlink_free_space(ctl, right_info);
-		else
-			__unlink_free_space(ctl, right_info);
+		unlink_free_space(ctl, right_info, update_stat);
 		info->bytes += right_info->bytes;
 		kmem_cache_free(btrfs_free_space_cachep, right_info);
 		merged = true;
@@ -2413,10 +2406,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
 	if (left_info && !left_info->bitmap &&
 	    left_info->offset + left_info->bytes == offset &&
 	    (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
-		if (update_stat)
-			unlink_free_space(ctl, left_info);
-		else
-			__unlink_free_space(ctl, left_info);
+		unlink_free_space(ctl, left_info, update_stat);
 		info->offset = left_info->offset;
 		info->bytes += left_info->bytes;
 		kmem_cache_free(btrfs_free_space_cachep, left_info);
@@ -2781,7 +2771,7 @@ again:
 
 	re_search = false;
 	if (!info->bitmap) {
-		unlink_free_space(ctl, info);
+		unlink_free_space(ctl, info, true);
 		if (offset == info->offset) {
 			u64 to_free = min(bytes, info->bytes);
 
@@ -2967,7 +2957,7 @@ static void __btrfs_remove_free_space_cache_locked(
 	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
 		info = rb_entry(node, struct btrfs_free_space, offset_index);
 		if (!info->bitmap) {
-			unlink_free_space(ctl, info);
+			unlink_free_space(ctl, info, true);
 			kmem_cache_free(btrfs_free_space_cachep, info);
 		} else {
 			free_bitmap(ctl, info);
@@ -3074,7 +3064,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
 		if (!entry->bytes)
 			free_bitmap(ctl, entry);
 	} else {
-		unlink_free_space(ctl, entry);
+		unlink_free_space(ctl, entry, true);
 		align_gap_len = offset - entry->offset;
 		align_gap = entry->offset;
 		align_gap_trim_state = entry->trim_state;
@@ -3717,7 +3707,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
 				mutex_unlock(&ctl->cache_writeout_mutex);
 				goto next;
 			}
-			unlink_free_space(ctl, entry);
+			unlink_free_space(ctl, entry, true);
 			/*
 			 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
 			 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
@@ -3743,7 +3733,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
 				goto next;
 			}
 
-			unlink_free_space(ctl, entry);
+			unlink_free_space(ctl, entry, true);
 			kmem_cache_free(btrfs_free_space_cachep, entry);
 		}
 
-- 
cgit v1.2.3-59-g8ed1b


From 290ef19add7649e2e75653b92ffcfa6417c451d1 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 23 Nov 2021 14:44:21 +0200
Subject: btrfs: make __btrfs_add_free_space take just block group reference

There is no point in the function taking an fs_info and a
btrfs_free_space_ctl because the ctl passed always belongs to the block
group. Furthermore fs_info can be referenced from the block group. No
functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 25 ++++++++++---------------
 fs/btrfs/free-space-cache.h |  6 ++----
 2 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9acc592ba49f..053103be73c7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2540,12 +2540,12 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
 	}
 }
 
-int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
-			   struct btrfs_free_space_ctl *ctl,
+int __btrfs_add_free_space(struct btrfs_block_group *block_group,
 			   u64 offset, u64 bytes,
 			   enum btrfs_trim_state trim_state)
 {
-	struct btrfs_block_group *block_group = ctl->private;
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *info;
 	int ret = 0;
 	u64 filter_bytes = bytes;
@@ -2677,9 +2677,7 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group,
 	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
 		trim_state = BTRFS_TRIM_STATE_TRIMMED;
 
-	return __btrfs_add_free_space(block_group->fs_info,
-				      block_group->free_space_ctl,
-				      bytenr, size, trim_state);
+	return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
 }
 
 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
@@ -2710,9 +2708,7 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
 	    btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
 		trim_state = BTRFS_TRIM_STATE_TRIMMED;
 
-	return __btrfs_add_free_space(block_group->fs_info,
-				      block_group->free_space_ctl,
-				      bytenr, size, trim_state);
+	return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
 }
 
 int btrfs_remove_free_space(struct btrfs_block_group *block_group,
@@ -2807,7 +2803,7 @@ again:
 			}
 			spin_unlock(&ctl->tree_lock);
 
-			ret = __btrfs_add_free_space(block_group->fs_info, ctl,
+			ret = __btrfs_add_free_space(block_group,
 						     offset + bytes,
 						     old_end - (offset + bytes),
 						     info->trim_state);
@@ -3086,8 +3082,7 @@ out:
 	spin_unlock(&ctl->tree_lock);
 
 	if (align_gap_len)
-		__btrfs_add_free_space(block_group->fs_info, ctl,
-				       align_gap, align_gap_len,
+		__btrfs_add_free_space(block_group, align_gap, align_gap_len,
 				       align_gap_trim_state);
 	return ret;
 }
@@ -3627,13 +3622,13 @@ static int do_trimming(struct btrfs_block_group *block_group,
 
 	mutex_lock(&ctl->cache_writeout_mutex);
 	if (reserved_start < start)
-		__btrfs_add_free_space(fs_info, ctl, reserved_start,
+		__btrfs_add_free_space(block_group, reserved_start,
 				       start - reserved_start,
 				       reserved_trim_state);
 	if (start + bytes < reserved_start + reserved_bytes)
-		__btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
+		__btrfs_add_free_space(block_group, end, reserved_end - end,
 				       reserved_trim_state);
-	__btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
+	__btrfs_add_free_space(block_group, start, bytes, trim_state);
 	list_del(&trim_entry->list);
 	mutex_unlock(&ctl->cache_writeout_mutex);
 
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index dd982d204d2d..b8a755ff523a 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -103,10 +103,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
 
 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
 			       struct btrfs_free_space_ctl *ctl);
-int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
-			   struct btrfs_free_space_ctl *ctl,
-			   u64 bytenr, u64 size,
-			   enum btrfs_trim_state trim_state);
+int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr,
+			   u64 size, enum btrfs_trim_state trim_state);
 int btrfs_add_free_space(struct btrfs_block_group *block_group,
 			 u64 bytenr, u64 size);
 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
-- 
cgit v1.2.3-59-g8ed1b


From 364be8421192d11ff2aa2973f00efc7cf3b01f52 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 23 Nov 2021 14:44:22 +0200
Subject: btrfs: change name and type of private member of btrfs_free_space_ctl

btrfs_free_space_ctl::private is either unset or it always points to
struct btrfs_block_group when it is set. So there's no point in keeping
the unhelpful 'private' name and keeping it an untyped pointer. Change
both the type and name to be self-describing. No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 12 ++++++------
 fs/btrfs/free-space-cache.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 053103be73c7..132209ff2262 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -666,7 +666,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
 
 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 {
-	struct btrfs_block_group *block_group = ctl->private;
+	struct btrfs_block_group *block_group = ctl->block_group;
 	u64 max_bytes;
 	u64 bitmap_bytes;
 	u64 extent_bytes;
@@ -2182,7 +2182,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
 		      struct btrfs_free_space *info)
 {
-	struct btrfs_block_group *block_group = ctl->private;
+	struct btrfs_block_group *block_group = ctl->block_group;
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	bool forced = false;
 
@@ -2251,7 +2251,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
 		return 0;
 
 	if (ctl->op == &free_space_op)
-		block_group = ctl->private;
+		block_group = ctl->block_group;
 again:
 	/*
 	 * Since we link bitmaps right into the cluster we need to see if we
@@ -2868,7 +2868,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
 	spin_lock_init(&ctl->tree_lock);
 	ctl->unit = fs_info->sectorsize;
 	ctl->start = block_group->start;
-	ctl->private = block_group;
+	ctl->block_group = block_group;
 	ctl->op = &free_space_op;
 	ctl->free_space_bytes = RB_ROOT_CACHED;
 	INIT_LIST_HEAD(&ctl->trimming_ranges);
@@ -2967,8 +2967,8 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
 {
 	spin_lock(&ctl->tree_lock);
 	__btrfs_remove_free_space_cache_locked(ctl);
-	if (ctl->private)
-		btrfs_discard_update_discardable(ctl->private);
+	if (ctl->block_group)
+		btrfs_discard_update_discardable(ctl->block_group);
 	spin_unlock(&ctl->tree_lock);
 }
 
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index b8a755ff523a..15591b299895 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -56,7 +56,7 @@ struct btrfs_free_space_ctl {
 	s32 discardable_extents[BTRFS_STAT_NR_ENTRIES];
 	s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES];
 	const struct btrfs_free_space_op *op;
-	void *private;
+	struct btrfs_block_group *block_group;
 	struct mutex cache_writeout_mutex;
 	struct list_head trimming_ranges;
 };
-- 
cgit v1.2.3-59-g8ed1b


From d96b34248c2f4ea8cd09286090f2f6f77102eaab Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 22 Nov 2021 12:03:38 +0000
Subject: btrfs: make send work with concurrent block group relocation

We don't allow send and balance/relocation to run in parallel in order
to prevent send failing or silently producing some bad stream. This is
because while send is using an extent (especially metadata) or about to
read a metadata extent and expecting it belongs to a specific parent
node, relocation can run, the transaction used for the relocation is
committed and the extent gets reallocated while send is still using the
extent, so it ends up with a different content than expected. This can
result in just failing to read a metadata extent due to failure of the
validation checks (parent transid, level, etc), failure to find a
backreference for a data extent, and other unexpected failures. Besides
reallocation, there's also a similar problem of an extent getting
discarded when it's unpinned after the transaction used for block group
relocation is committed.

The restriction between balance and send was added in commit 9e967495e0e0
("Btrfs: prevent send failures and crashes due to concurrent relocation"),
kernel 5.3, while the more general restriction between send and relocation
was added in commit 1cea5cf0e664 ("btrfs: ensure relocation never runs
while we have send operations running"), kernel 5.14.

Both send and relocation can be very long running operations. Relocation
because it has to do a lot of IO and expensive backreference lookups in
case there are many snapshots, and send due to read IO when operating on
very large trees. This makes it inconvenient for users and tools to deal
with scheduling both operations.

For zoned filesystems we also have automatic block group relocation, so
send can fail with -EAGAIN when users least expect it or send can end up
delaying the block group relocation for too long. In the future we might
also get the automatic block group relocation for non zoned filesystems.

This change makes it possible for send and relocation to run in parallel.
This is achieved in the following way:

1) For all tree searches, send acquires a read lock on the commit root
   semaphore;

2) After each tree search, and before releasing the commit root semaphore,
   the leaf is cloned and placed in the search path (struct btrfs_path);

3) After releasing the commit root semaphore, the changed_cb() callback
   is invoked, which operates on the leaf and writes commands to the pipe
   (or file in case send/receive is not used with a pipe). It's important
   here to not hold a lock on the commit root semaphore, because if we did
   we could deadlock when sending and receiving to the same filesystem
   using a pipe - the send task blocks on the pipe because it's full, the
   receive task, which is the only consumer of the pipe, triggers a
   transaction commit when attempting to create a subvolume or reserve
   space for a write operation for example, but the transaction commit
   blocks trying to write lock the commit root semaphore, resulting in a
   deadlock;

4) Before moving to the next key, or advancing to the next change in case
   of an incremental send, check if a transaction used for relocation was
   committed (or is about to finish its commit). If so, release the search
   path(s) and restart the search, to where we were before, so that we
   don't operate on stale extent buffers. The search restarts are always
   possible because both the send and parent roots are RO, and no one can
   add, remove or update keys (change their offset) in RO trees - the
   only exception is deduplication, but that is still not allowed to run
   in parallel with send;

5) Periodically check if there is contention on the commit root semaphore,
   which means there is a transaction commit trying to write lock it, and
   release the semaphore and reschedule if there is contention, so as to
   avoid causing any significant delays to transaction commits.

This leaves some room for optimizations for send to have fewer path
releases and tree re-searches when there's relocation running, but
for now it's kept simple as it performs quite well (on very large trees
with resulting send streams in the order of a few hundred gigabytes).

Test case btrfs/187, from fstests, stresses relocation, send and
deduplication attempting to run in parallel, but without verifying if send
succeeds and if it produces correct streams. A new test case will be added
that exercises relocation happening in parallel with send and then checks
that send succeeds and the resulting streams are correct.

A final note is that for now this still leaves the mutual exclusion
between send operations and deduplication on files belonging to a root
used by send operations. A solution for that will be slightly more complex
but it will eventually be built on top of this change.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c |   9 +-
 fs/btrfs/ctree.c       |  98 ++++++++++----
 fs/btrfs/ctree.h       |  14 +-
 fs/btrfs/disk-io.c     |   4 +-
 fs/btrfs/relocation.c  |  13 --
 fs/btrfs/send.c        | 357 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/transaction.c |   4 +
 7 files changed, 395 insertions(+), 104 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 7d6372103aaf..1db24e6d6d90 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1509,7 +1509,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
 	struct btrfs_block_group *bg;
 	struct btrfs_space_info *space_info;
-	LIST_HEAD(again_list);
 
 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
 		return;
@@ -1586,18 +1585,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 				div64_u64(zone_unusable * 100, bg->length));
 		trace_btrfs_reclaim_block_group(bg);
 		ret = btrfs_relocate_chunk(fs_info, bg->start);
-		if (ret && ret != -EAGAIN)
+		if (ret)
 			btrfs_err(fs_info, "error relocating chunk %llu",
 				  bg->start);
 
 next:
+		btrfs_put_block_group(bg);
 		spin_lock(&fs_info->unused_bgs_lock);
-		if (ret == -EAGAIN && list_empty(&bg->bg_list))
-			list_add_tail(&bg->bg_list, &again_list);
-		else
-			btrfs_put_block_group(bg);
 	}
-	list_splice_tail(&again_list, &fs_info->reclaim_bgs);
 	spin_unlock(&fs_info->unused_bgs_lock);
 	mutex_unlock(&fs_info->reclaim_bgs_lock);
 	btrfs_exclop_finish(fs_info);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 94b2be118b50..9e02ac51c664 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1568,7 +1568,6 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
 							struct btrfs_path *p,
 							int write_lock_level)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_buffer *b;
 	int root_lock;
 	int level = 0;
@@ -1577,26 +1576,8 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
 	root_lock = BTRFS_READ_LOCK;
 
 	if (p->search_commit_root) {
-		/*
-		 * The commit roots are read only so we always do read locks,
-		 * and we always must hold the commit_root_sem when doing
-		 * searches on them, the only exception is send where we don't
-		 * want to block transaction commits for a long time, so
-		 * we need to clone the commit root in order to avoid races
-		 * with transaction commits that create a snapshot of one of
-		 * the roots used by a send operation.
-		 */
-		if (p->need_commit_sem) {
-			down_read(&fs_info->commit_root_sem);
-			b = btrfs_clone_extent_buffer(root->commit_root);
-			up_read(&fs_info->commit_root_sem);
-			if (!b)
-				return ERR_PTR(-ENOMEM);
-
-		} else {
-			b = root->commit_root;
-			atomic_inc(&b->refs);
-		}
+		b = root->commit_root;
+		atomic_inc(&b->refs);
 		level = btrfs_header_level(b);
 		/*
 		 * Ensure that all callers have set skip_locking when
@@ -1648,6 +1629,42 @@ out:
 	return b;
 }
 
+/*
+ * Replace the extent buffer at the lowest level of the path with a cloned
+ * version. The purpose is to be able to use it safely, after releasing the
+ * commit root semaphore, even if relocation is happening in parallel, the
+ * transaction used for relocation is committed and the extent buffer is
+ * reallocated in the next transaction.
+ *
+ * This is used in a context where the caller does not prevent transaction
+ * commits from happening, either by holding a transaction handle or holding
+ * some lock, while it's doing searches through a commit root.
+ * At the moment it's only used for send operations.
+ */
+static int finish_need_commit_sem_search(struct btrfs_path *path)
+{
+	const int i = path->lowest_level;
+	const int slot = path->slots[i];
+	struct extent_buffer *lowest = path->nodes[i];
+	struct extent_buffer *clone;
+
+	ASSERT(path->need_commit_sem);
+
+	if (!lowest)
+		return 0;
+
+	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
+
+	clone = btrfs_clone_extent_buffer(lowest);
+	if (!clone)
+		return -ENOMEM;
+
+	btrfs_release_path(path);
+	path->nodes[i] = clone;
+	path->slots[i] = slot;
+
+	return 0;
+}
 
 /*
  * btrfs_search_slot - look for a key in a tree and perform necessary
@@ -1684,6 +1701,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      const struct btrfs_key *key, struct btrfs_path *p,
 		      int ins_len, int cow)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_buffer *b;
 	int slot;
 	int ret;
@@ -1725,6 +1743,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 	min_write_lock_level = write_lock_level;
 
+	if (p->need_commit_sem) {
+		ASSERT(p->search_commit_root);
+		down_read(&fs_info->commit_root_sem);
+	}
+
 again:
 	prev_cmp = -1;
 	b = btrfs_search_slot_get_root(root, p, write_lock_level);
@@ -1919,6 +1942,16 @@ cow_done:
 done:
 	if (ret < 0 && !p->skip_release_on_error)
 		btrfs_release_path(p);
+
+	if (p->need_commit_sem) {
+		int ret2;
+
+		ret2 = finish_need_commit_sem_search(p);
+		up_read(&fs_info->commit_root_sem);
+		if (ret2)
+			ret = ret2;
+	}
+
 	return ret;
 }
 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
@@ -4373,7 +4406,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 	int level;
 	struct extent_buffer *c;
 	struct extent_buffer *next;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_key key;
+	bool need_commit_sem = false;
 	u32 nritems;
 	int ret;
 	int i;
@@ -4390,14 +4425,20 @@ again:
 
 	path->keep_locks = 1;
 
-	if (time_seq)
+	if (time_seq) {
 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
-	else
+	} else {
+		if (path->need_commit_sem) {
+			path->need_commit_sem = 0;
+			need_commit_sem = true;
+			down_read(&fs_info->commit_root_sem);
+		}
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	}
 	path->keep_locks = 0;
 
 	if (ret < 0)
-		return ret;
+		goto done;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	/*
@@ -4520,6 +4561,15 @@ again:
 	ret = 0;
 done:
 	unlock_up(path, 0, 1, 0, NULL);
+	if (need_commit_sem) {
+		int ret2;
+
+		path->need_commit_sem = 1;
+		ret2 = finish_need_commit_sem_search(path);
+		up_read(&fs_info->commit_root_sem);
+		if (ret2)
+			ret = ret2;
+	}
 
 	return ret;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aa303669615d..0c92158cb50c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -572,7 +572,6 @@ enum {
 	/*
 	 * Indicate that relocation of a chunk has started, it's set per chunk
 	 * and is toggled between chunks.
-	 * Set, tested and cleared while holding fs_info::send_reloc_lock.
 	 */
 	BTRFS_FS_RELOC_RUNNING,
 
@@ -673,6 +672,12 @@ struct btrfs_fs_info {
 
 	u64 generation;
 	u64 last_trans_committed;
+	/*
+	 * Generation of the last transaction used for block group relocation
+	 * since the filesystem was last mounted (or 0 if none happened yet).
+	 * Must be written and read while holding btrfs_fs_info::commit_root_sem.
+	 */
+	u64 last_reloc_trans;
 	u64 avg_delayed_ref_runtime;
 
 	/*
@@ -1003,13 +1008,6 @@ struct btrfs_fs_info {
 
 	struct crypto_shash *csum_shash;
 
-	spinlock_t send_reloc_lock;
-	/*
-	 * Number of send operations in progress.
-	 * Updated while holding fs_info::send_reloc_lock.
-	 */
-	int send_in_progress;
-
 	/* Type of exclusive operation running, protected by super_lock */
 	enum btrfs_exclusive_operation exclusive_operation;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7bf01f4fc92e..fc7dd5109806 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3023,6 +3023,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
 		/* All successful */
 		fs_info->generation = generation;
 		fs_info->last_trans_committed = generation;
+		fs_info->last_reloc_trans = 0;
 
 		/* Always begin writing backup roots after the one being used */
 		if (backup_index < 0) {
@@ -3159,9 +3160,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	spin_lock_init(&fs_info->swapfile_pins_lock);
 	fs_info->swapfile_pins = RB_ROOT;
 
-	spin_lock_init(&fs_info->send_reloc_lock);
-	fs_info->send_in_progress = 0;
-
 	fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
 	INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
 }
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 831c55708c88..775aff5a2c26 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3859,25 +3859,14 @@ out:
  *   0             success
  *   -EINPROGRESS  operation is already in progress, that's probably a bug
  *   -ECANCELED    cancellation request was set before the operation started
- *   -EAGAIN       can not start because there are ongoing send operations
  */
 static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
 {
-	spin_lock(&fs_info->send_reloc_lock);
-	if (fs_info->send_in_progress) {
-		btrfs_warn_rl(fs_info,
-"cannot run relocation while send operations are in progress (%d in progress)",
-			      fs_info->send_in_progress);
-		spin_unlock(&fs_info->send_reloc_lock);
-		return -EAGAIN;
-	}
 	if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
 		/* This should not happen */
-		spin_unlock(&fs_info->send_reloc_lock);
 		btrfs_err(fs_info, "reloc already running, cannot start");
 		return -EINPROGRESS;
 	}
-	spin_unlock(&fs_info->send_reloc_lock);
 
 	if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
 		btrfs_info(fs_info, "chunk relocation canceled on start");
@@ -3899,9 +3888,7 @@ static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
 	/* Requested after start, clear bit first so any waiters can continue */
 	if (atomic_read(&fs_info->reloc_cancel_req) > 0)
 		btrfs_info(fs_info, "chunk relocation canceled during operation");
-	spin_lock(&fs_info->send_reloc_lock);
 	clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
-	spin_unlock(&fs_info->send_reloc_lock);
 	atomic_set(&fs_info->reloc_cancel_req, 0);
 }
 
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6bdcb9d481d5..d8ccb62aa7d2 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -24,6 +24,7 @@
 #include "transaction.h"
 #include "compression.h"
 #include "xattr.h"
+#include "print-tree.h"
 
 /*
  * Maximum number of references an extent can have in order for us to attempt to
@@ -97,6 +98,15 @@ struct send_ctx {
 	struct btrfs_path *right_path;
 	struct btrfs_key *cmp_key;
 
+	/*
+	 * Keep track of the generation of the last transaction that was used
+	 * for relocating a block group. This is periodically checked in order
+	 * to detect if a relocation happened since the last check, so that we
+	 * don't operate on stale extent buffers for nodes (level >= 1) or on
+	 * stale disk_bytenr values of file extent items.
+	 */
+	u64 last_reloc_trans;
+
 	/*
 	 * infos of the currently processed inode. In case of deleted inodes,
 	 * these are the values from the deleted inode.
@@ -1421,6 +1431,26 @@ static int find_extent_clone(struct send_ctx *sctx,
 	if (ret < 0)
 		goto out;
 
+	down_read(&fs_info->commit_root_sem);
+	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+		/*
+		 * A transaction commit for a transaction in which block group
+		 * relocation was done just happened.
+		 * The disk_bytenr of the file extent item we processed is
+		 * possibly stale, referring to the extent's location before
+		 * relocation. So act as if we haven't found any clone sources
+		 * and fallback to write commands, which will read the correct
+		 * data from the new extent location. Otherwise we will fail
+		 * below because we haven't found our own back reference or we
+		 * could be getting incorrect sources in case the old extent
+		 * was already reallocated after the relocation.
+		 */
+		up_read(&fs_info->commit_root_sem);
+		ret = -ENOENT;
+		goto out;
+	}
+	up_read(&fs_info->commit_root_sem);
+
 	if (!backref_ctx.found_itself) {
 		/* found a bug in backref code? */
 		ret = -EIO;
@@ -6583,6 +6613,50 @@ static int changed_cb(struct btrfs_path *left_path,
 {
 	int ret = 0;
 
+	/*
+	 * We can not hold the commit root semaphore here. This is because
+	 * sending and receiving to the same filesystem, using a pipe, could
+	 * otherwise result in a deadlock:
+	 *
+	 * 1) The task running send blocks on the pipe because it's full;
+	 *
+	 * 2) The task running receive, which is the only consumer of the pipe,
+	 *    is waiting for a transaction commit (for example due to a space
+	 *    reservation when doing a write or triggering a transaction commit
+	 *    when creating a subvolume);
+	 *
+	 * 3) The transaction is waiting to write lock the commit root semaphore,
+	 *    but can not acquire it since it's being held at 1).
+	 *
+	 * Down this call chain we write to the pipe through kernel_write().
+	 * The same type of problem can also happen when sending to a file that
+	 * is stored in the same filesystem - when reserving space for a write
+	 * into the file, we can trigger a transaction commit.
+	 *
+	 * Our caller has supplied us with clones of leaves from the send and
+	 * parent roots, so we're safe here from a concurrent relocation and
+	 * further reallocation of metadata extents while we are here. Below we
+	 * also assert that the leaves are clones.
+	 */
+	lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
+
+	/*
+	 * We always have a send root, so left_path is never NULL. We will not
+	 * have a leaf when we have reached the end of the send root but have
+	 * not yet reached the end of the parent root.
+	 */
+	if (left_path->nodes[0])
+		ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+				&left_path->nodes[0]->bflags));
+	/*
+	 * When doing a full send we don't have a parent root, so right_path is
+	 * NULL. When doing an incremental send, we may have reached the end of
+	 * the parent root already, so we don't have a leaf at right_path.
+	 */
+	if (right_path && right_path->nodes[0])
+		ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+				&right_path->nodes[0]->bflags));
+
 	if (result == BTRFS_COMPARE_TREE_SAME) {
 		if (key->type == BTRFS_INODE_REF_KEY ||
 		    key->type == BTRFS_INODE_EXTREF_KEY) {
@@ -6629,14 +6703,46 @@ out:
 	return ret;
 }
 
+static int search_key_again(const struct send_ctx *sctx,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path,
+			    const struct btrfs_key *key)
+{
+	int ret;
+
+	if (!path->need_commit_sem)
+		lockdep_assert_held_read(&root->fs_info->commit_root_sem);
+
+	/*
+	 * Roots used for send operations are readonly and no one can add,
+	 * update or remove keys from them, so we should be able to find our
+	 * key again. The only exception is deduplication, which can operate on
+	 * readonly roots and add, update or remove keys to/from them - but at
+	 * the moment we don't allow it to run in parallel with send.
+	 */
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	ASSERT(ret <= 0);
+	if (ret > 0) {
+		btrfs_print_tree(path->nodes[path->lowest_level], false);
+		btrfs_err(root->fs_info,
+"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
+			  key->objectid, key->type, key->offset,
+			  (root == sctx->parent_root ? "parent" : "send"),
+			  root->root_key.objectid, path->lowest_level,
+			  path->slots[path->lowest_level]);
+		return -EUCLEAN;
+	}
+
+	return ret;
+}
+
 static int full_send_tree(struct send_ctx *sctx)
 {
 	int ret;
 	struct btrfs_root *send_root = sctx->send_root;
 	struct btrfs_key key;
+	struct btrfs_fs_info *fs_info = send_root->fs_info;
 	struct btrfs_path *path;
-	struct extent_buffer *eb;
-	int slot;
 
 	path = alloc_path_for_send();
 	if (!path)
@@ -6647,6 +6753,10 @@ static int full_send_tree(struct send_ctx *sctx)
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
 
+	down_read(&fs_info->commit_root_sem);
+	sctx->last_reloc_trans = fs_info->last_reloc_trans;
+	up_read(&fs_info->commit_root_sem);
+
 	ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
 	if (ret < 0)
 		goto out;
@@ -6654,15 +6764,35 @@ static int full_send_tree(struct send_ctx *sctx)
 		goto out_finish;
 
 	while (1) {
-		eb = path->nodes[0];
-		slot = path->slots[0];
-		btrfs_item_key_to_cpu(eb, &key, slot);
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 
 		ret = changed_cb(path, NULL, &key,
 				 BTRFS_COMPARE_TREE_NEW, sctx);
 		if (ret < 0)
 			goto out;
 
+		down_read(&fs_info->commit_root_sem);
+		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+			sctx->last_reloc_trans = fs_info->last_reloc_trans;
+			up_read(&fs_info->commit_root_sem);
+			/*
+			 * A transaction used for relocating a block group was
+			 * committed or is about to finish its commit. Release
+			 * our path (leaf) and restart the search, so that we
+			 * avoid operating on any file extent items that are
+			 * stale, with a disk_bytenr that reflects a pre
+			 * relocation value. This way we avoid as much as
+			 * possible to fallback to regular writes when checking
+			 * if we can clone file ranges.
+			 */
+			btrfs_release_path(path);
+			ret = search_key_again(sctx, send_root, path, &key);
+			if (ret < 0)
+				goto out;
+		} else {
+			up_read(&fs_info->commit_root_sem);
+		}
+
 		ret = btrfs_next_item(send_root, path);
 		if (ret < 0)
 			goto out;
@@ -6680,6 +6810,20 @@ out:
 	return ret;
 }
 
+static int replace_node_with_clone(struct btrfs_path *path, int level)
+{
+	struct extent_buffer *clone;
+
+	clone = btrfs_clone_extent_buffer(path->nodes[level]);
+	if (!clone)
+		return -ENOMEM;
+
+	free_extent_buffer(path->nodes[level]);
+	path->nodes[level] = clone;
+
+	return 0;
+}
+
 static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
 {
 	struct extent_buffer *eb;
@@ -6689,6 +6833,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen
 	u64 reada_max;
 	u64 reada_done = 0;
 
+	lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
+
 	BUG_ON(*level == 0);
 	eb = btrfs_read_node_slot(parent, slot);
 	if (IS_ERR(eb))
@@ -6712,6 +6858,10 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen
 	path->nodes[*level - 1] = eb;
 	path->slots[*level - 1] = 0;
 	(*level)--;
+
+	if (*level == 0)
+		return replace_node_with_clone(path, 0);
+
 	return 0;
 }
 
@@ -6725,8 +6875,10 @@ static int tree_move_next_or_upnext(struct btrfs_path *path,
 	path->slots[*level]++;
 
 	while (path->slots[*level] >= nritems) {
-		if (*level == root_level)
+		if (*level == root_level) {
+			path->slots[*level] = nritems - 1;
 			return -1;
+		}
 
 		/* move upnext */
 		path->slots[*level] = 0;
@@ -6758,14 +6910,20 @@ static int tree_advance(struct btrfs_path *path,
 	} else {
 		ret = tree_move_down(path, level, reada_min_gen);
 	}
-	if (ret >= 0) {
-		if (*level == 0)
-			btrfs_item_key_to_cpu(path->nodes[*level], key,
-					path->slots[*level]);
-		else
-			btrfs_node_key_to_cpu(path->nodes[*level], key,
-					path->slots[*level]);
-	}
+
+	/*
+	 * Even if we have reached the end of a tree, ret is -1, update the key
+	 * anyway, so that in case we need to restart due to a block group
+	 * relocation, we can assert that the last key of the root node still
+	 * exists in the tree.
+	 */
+	if (*level == 0)
+		btrfs_item_key_to_cpu(path->nodes[*level], key,
+				      path->slots[*level]);
+	else
+		btrfs_node_key_to_cpu(path->nodes[*level], key,
+				      path->slots[*level]);
+
 	return ret;
 }
 
@@ -6794,6 +6952,97 @@ static int tree_compare_item(struct btrfs_path *left_path,
 	return 0;
 }
 
+/*
+ * A transaction used for relocating a block group was committed or is about to
+ * finish its commit. Release our paths and restart the search, so that we are
+ * not using stale extent buffers:
+ *
+ * 1) For levels > 0, we are only holding references of extent buffers, without
+ *    any locks on them, which does not prevent them from having been relocated
+ *    and reallocated after the last time we released the commit root semaphore.
+ *    The exception are the root nodes, for which we always have a clone, see
+ *    the comment at btrfs_compare_trees();
+ *
+ * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
+ *    we are safe from the concurrent relocation and reallocation. However they
+ *    can have file extent items with a pre relocation disk_bytenr value, so we
+ *    restart the search from the current commit roots and clone the new leaves so
+ *    that we get the post relocation disk_bytenr values. Not doing so, could
+ *    make us clone the wrong data in case there are new extents using the old
+ *    disk_bytenr that happen to be shared.
+ */
+static int restart_after_relocation(struct btrfs_path *left_path,
+				    struct btrfs_path *right_path,
+				    const struct btrfs_key *left_key,
+				    const struct btrfs_key *right_key,
+				    int left_level,
+				    int right_level,
+				    const struct send_ctx *sctx)
+{
+	int root_level;
+	int ret;
+
+	lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
+
+	btrfs_release_path(left_path);
+	btrfs_release_path(right_path);
+
+	/*
+	 * Since keys can not be added or removed to/from our roots because they
+	 * are readonly and we do not allow deduplication to run in parallel
+	 * (which can add, remove or change keys), the layout of the trees should
+	 * not change.
+	 */
+	left_path->lowest_level = left_level;
+	ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
+	if (ret < 0)
+		return ret;
+
+	right_path->lowest_level = right_level;
+	ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * If the lowest level nodes are leaves, clone them so that they can be
+	 * safely used by changed_cb() while not under the protection of the
+	 * commit root semaphore, even if relocation and reallocation happens in
+	 * parallel.
+	 */
+	if (left_level == 0) {
+		ret = replace_node_with_clone(left_path, 0);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (right_level == 0) {
+		ret = replace_node_with_clone(right_path, 0);
+		if (ret < 0)
+			return ret;
+	}
+
+	/*
+	 * Now clone the root nodes (unless they happen to be the leaves we have
+	 * already cloned). This is to protect against concurrent snapshotting of
+	 * the send and parent roots (see the comment at btrfs_compare_trees()).
+	 */
+	root_level = btrfs_header_level(sctx->send_root->commit_root);
+	if (root_level > 0) {
+		ret = replace_node_with_clone(left_path, root_level);
+		if (ret < 0)
+			return ret;
+	}
+
+	root_level = btrfs_header_level(sctx->parent_root->commit_root);
+	if (root_level > 0) {
+		ret = replace_node_with_clone(right_path, root_level);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
 /*
  * This function compares two trees and calls the provided callback for
  * every changed/new/deleted item it finds.
@@ -6822,10 +7071,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
 	int right_root_level;
 	int left_level;
 	int right_level;
-	int left_end_reached;
-	int right_end_reached;
-	int advance_left;
-	int advance_right;
+	int left_end_reached = 0;
+	int right_end_reached = 0;
+	int advance_left = 0;
+	int advance_right = 0;
 	u64 left_blockptr;
 	u64 right_blockptr;
 	u64 left_gen;
@@ -6893,12 +7142,18 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
 	down_read(&fs_info->commit_root_sem);
 	left_level = btrfs_header_level(left_root->commit_root);
 	left_root_level = left_level;
+	/*
+	 * We clone the root node of the send and parent roots to prevent races
+	 * with snapshot creation of these roots. Snapshot creation COWs the
+	 * root node of a tree, so after the transaction is committed the old
+	 * extent can be reallocated while this send operation is still ongoing.
+	 * So we clone them, under the commit root semaphore, to be race free.
+	 */
 	left_path->nodes[left_level] =
 			btrfs_clone_extent_buffer(left_root->commit_root);
 	if (!left_path->nodes[left_level]) {
-		up_read(&fs_info->commit_root_sem);
 		ret = -ENOMEM;
-		goto out;
+		goto out_unlock;
 	}
 
 	right_level = btrfs_header_level(right_root->commit_root);
@@ -6906,9 +7161,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
 	right_path->nodes[right_level] =
 			btrfs_clone_extent_buffer(right_root->commit_root);
 	if (!right_path->nodes[right_level]) {
-		up_read(&fs_info->commit_root_sem);
 		ret = -ENOMEM;
-		goto out;
+		goto out_unlock;
 	}
 	/*
 	 * Our right root is the parent root, while the left root is the "send"
@@ -6918,7 +7172,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
 	 * will need to read them at some point.
 	 */
 	reada_min_gen = btrfs_header_generation(right_root->commit_root);
-	up_read(&fs_info->commit_root_sem);
 
 	if (left_level == 0)
 		btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -6933,11 +7186,26 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
 		btrfs_node_key_to_cpu(right_path->nodes[right_level],
 				&right_key, right_path->slots[right_level]);
 
-	left_end_reached = right_end_reached = 0;
-	advance_left = advance_right = 0;
+	sctx->last_reloc_trans = fs_info->last_reloc_trans;
 
 	while (1) {
-		cond_resched();
+		if (need_resched() ||
+		    rwsem_is_contended(&fs_info->commit_root_sem)) {
+			up_read(&fs_info->commit_root_sem);
+			cond_resched();
+			down_read(&fs_info->commit_root_sem);
+		}
+
+		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+			ret = restart_after_relocation(left_path, right_path,
+						       &left_key, &right_key,
+						       left_level, right_level,
+						       sctx);
+			if (ret < 0)
+				goto out_unlock;
+			sctx->last_reloc_trans = fs_info->last_reloc_trans;
+		}
+
 		if (advance_left && !left_end_reached) {
 			ret = tree_advance(left_path, &left_level,
 					left_root_level,
@@ -6946,7 +7214,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
 			if (ret == -1)
 				left_end_reached = ADVANCE;
 			else if (ret < 0)
-				goto out;
+				goto out_unlock;
 			advance_left = 0;
 		}
 		if (advance_right && !right_end_reached) {
@@ -6957,54 +7225,55 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
 			if (ret == -1)
 				right_end_reached = ADVANCE;
 			else if (ret < 0)
-				goto out;
+				goto out_unlock;
 			advance_right = 0;
 		}
 
 		if (left_end_reached && right_end_reached) {
 			ret = 0;
-			goto out;
+			goto out_unlock;
 		} else if (left_end_reached) {
 			if (right_level == 0) {
+				up_read(&fs_info->commit_root_sem);
 				ret = changed_cb(left_path, right_path,
 						&right_key,
 						BTRFS_COMPARE_TREE_DELETED,
 						sctx);
 				if (ret < 0)
 					goto out;
+				down_read(&fs_info->commit_root_sem);
 			}
 			advance_right = ADVANCE;
 			continue;
 		} else if (right_end_reached) {
 			if (left_level == 0) {
+				up_read(&fs_info->commit_root_sem);
 				ret = changed_cb(left_path, right_path,
 						&left_key,
 						BTRFS_COMPARE_TREE_NEW,
 						sctx);
 				if (ret < 0)
 					goto out;
+				down_read(&fs_info->commit_root_sem);
 			}
 			advance_left = ADVANCE;
 			continue;
 		}
 
 		if (left_level == 0 && right_level == 0) {
+			up_read(&fs_info->commit_root_sem);
 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
 			if (cmp < 0) {
 				ret = changed_cb(left_path, right_path,
 						&left_key,
 						BTRFS_COMPARE_TREE_NEW,
 						sctx);
-				if (ret < 0)
-					goto out;
 				advance_left = ADVANCE;
 			} else if (cmp > 0) {
 				ret = changed_cb(left_path, right_path,
 						&right_key,
 						BTRFS_COMPARE_TREE_DELETED,
 						sctx);
-				if (ret < 0)
-					goto out;
 				advance_right = ADVANCE;
 			} else {
 				enum btrfs_compare_tree_result result;
@@ -7018,11 +7287,13 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
 					result = BTRFS_COMPARE_TREE_SAME;
 				ret = changed_cb(left_path, right_path,
 						 &left_key, result, sctx);
-				if (ret < 0)
-					goto out;
 				advance_left = ADVANCE;
 				advance_right = ADVANCE;
 			}
+
+			if (ret < 0)
+				goto out;
+			down_read(&fs_info->commit_root_sem);
 		} else if (left_level == right_level) {
 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
 			if (cmp < 0) {
@@ -7062,6 +7333,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
 		}
 	}
 
+out_unlock:
+	up_read(&fs_info->commit_root_sem);
 out:
 	btrfs_free_path(left_path);
 	btrfs_free_path(right_path);
@@ -7411,21 +7684,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
 	if (ret)
 		goto out;
 
-	spin_lock(&fs_info->send_reloc_lock);
-	if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
-		spin_unlock(&fs_info->send_reloc_lock);
-		btrfs_warn_rl(fs_info,
-		"cannot run send because a relocation operation is in progress");
-		ret = -EAGAIN;
-		goto out;
-	}
-	fs_info->send_in_progress++;
-	spin_unlock(&fs_info->send_reloc_lock);
-
 	ret = send_subvol(sctx);
-	spin_lock(&fs_info->send_reloc_lock);
-	fs_info->send_in_progress--;
-	spin_unlock(&fs_info->send_reloc_lock);
 	if (ret < 0)
 		goto out;
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5eadb17b971b..03de89b45f27 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -169,6 +169,10 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
 	ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
 
 	down_write(&fs_info->commit_root_sem);
+
+	if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+		fs_info->last_reloc_trans = trans->transid;
+
 	list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
 				 dirty_list) {
 		list_del_init(&root->dirty_list);
-- 
cgit v1.2.3-59-g8ed1b


From efc0e69c2feab8efcdbb9efdb9aae892d11b1545 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 25 Nov 2021 11:14:41 +0200
Subject: btrfs: introduce exclusive operation BALANCE_PAUSED state

The current set of exclusive operation states is not sufficient to handle
all practical use cases. In particular there is a need to be able to add
a device to a filesystem that has a paused balance. Currently there is no
way to distinguish between a running and a paused balance. Fix this by
introducing BTRFS_EXCLOP_BALANCE_PAUSED which is going to be set on 2
occasions:

1. When a filesystem is mounted with skip_balance and there is an
   unfinished balance it will now be in the BALANCE_PAUSED state instead
   of simply the BALANCE state.

2. When a running balance is paused.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h   |  4 ++++
 fs/btrfs/ioctl.c   | 24 ++++++++++++++++++++++++
 fs/btrfs/volumes.c | 10 ++++++++--
 3 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0c92158cb50c..4ea7157ca03c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -611,6 +611,7 @@ enum {
  */
 enum btrfs_exclusive_operation {
 	BTRFS_EXCLOP_NONE,
+	BTRFS_EXCLOP_BALANCE_PAUSED,
 	BTRFS_EXCLOP_BALANCE,
 	BTRFS_EXCLOP_DEV_ADD,
 	BTRFS_EXCLOP_DEV_REMOVE,
@@ -3321,6 +3322,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
 				 enum btrfs_exclusive_operation type);
 void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
 void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+			  enum btrfs_exclusive_operation op);
+
 
 /* file.c */
 int __init btrfs_auto_defrag_init(void);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index af77098fdf35..6752e4b8b351 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -414,6 +414,29 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
 	sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
 }
 
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+			  enum btrfs_exclusive_operation op)
+{
+	switch (op) {
+	case BTRFS_EXCLOP_BALANCE_PAUSED:
+		spin_lock(&fs_info->super_lock);
+		ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
+		       fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD);
+		fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
+		spin_unlock(&fs_info->super_lock);
+		break;
+	case BTRFS_EXCLOP_BALANCE:
+		spin_lock(&fs_info->super_lock);
+		ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+		fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+		spin_unlock(&fs_info->super_lock);
+		break;
+	default:
+		btrfs_warn(fs_info,
+			"invalid exclop balance operation %d requested", op);
+	}
+}
+
 static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
 {
 	struct inode *inode = file_inode(file);
@@ -4056,6 +4079,7 @@ locked:
 			spin_lock(&fs_info->balance_lock);
 			bctl->flags |= BTRFS_BALANCE_RESUME;
 			spin_unlock(&fs_info->balance_lock);
+			btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
 
 			goto do_balance;
 		}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4f77268619f3..9db9d66b5470 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4394,8 +4394,10 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
 	ret = __btrfs_balance(fs_info);
 
 	mutex_lock(&fs_info->balance_mutex);
-	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
+	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
 		btrfs_info(fs_info, "balance: paused");
+		btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+	}
 	/*
 	 * Balance can be canceled by:
 	 *
@@ -4471,6 +4473,10 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
 		return 0;
 	}
 
+	spin_lock(&fs_info->super_lock);
+	ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+	fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+	spin_unlock(&fs_info->super_lock);
 	/*
 	 * A ro->rw remount sequence should continue with the paused balance
 	 * regardless of who pauses it, system or the user as of now, so set
@@ -4539,7 +4545,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
 	 * is in a paused state and must have fs_info::balance_ctl properly
 	 * set up.
 	 */
-	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
 		btrfs_warn(fs_info,
 	"balance: cannot set exclusive op status, resume manually");
 
-- 
cgit v1.2.3-59-g8ed1b


From 621a1ee1d399c77dd4604aeab3e0f6319f649dbc Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 25 Nov 2021 11:14:42 +0200
Subject: btrfs: make device add compatible with paused balance in
 btrfs_exclop_start_try_lock

This is needed to enable device add to work in cases when a file system
has been mounted with 'skip_balance' mount option.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6752e4b8b351..f706287942c5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -387,6 +387,7 @@ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
  *
  * Compatibility:
  * - the same type is already running
+ * - when trying to add a device and balance has been paused
  * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
  *   must check the condition first that would allow none -> @type
  */
@@ -394,7 +395,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
 				 enum btrfs_exclusive_operation type)
 {
 	spin_lock(&fs_info->super_lock);
-	if (fs_info->exclusive_operation == type)
+	if (fs_info->exclusive_operation == type ||
+	    (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
+	     type == BTRFS_EXCLOP_DEV_ADD))
 		return true;
 
 	spin_unlock(&fs_info->super_lock);
-- 
cgit v1.2.3-59-g8ed1b


From a174c0a2e857081195db6888323802f0fae793ef Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 25 Nov 2021 11:14:43 +0200
Subject: btrfs: allow device add if balance is paused

Currently a paused balance precludes adding a device since they are both
considered exclusive ops and we can have at most one running at a time.
This is problematic in case a filesystem encounters an ENOSPC situation
while balance is running. In this case the only thing the user can do
is mount the fs with "skip_balance", which pauses balance, and delete some
data to free up space for balance. However, it should be possible to add
a new device when balance is paused.

Fix this by allowing device add to proceed when balance is paused.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f706287942c5..7565b667f4fc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3174,13 +3174,25 @@ out:
 static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
 {
 	struct btrfs_ioctl_vol_args *vol_args;
+	bool restore_op = false;
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
-		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
+		if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
+			return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+
+		/*
+		 * We can do the device add because we have a paused balanced,
+		 * change the exclusive op type and remember we should bring
+		 * back the paused balance
+		 */
+		fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD;
+		btrfs_exclop_start_unlock(fs_info);
+		restore_op = true;
+	}
 
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args)) {
@@ -3196,7 +3208,10 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
 
 	kfree(vol_args);
 out:
-	btrfs_exclop_finish(fs_info);
+	if (restore_op)
+		btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+	else
+		btrfs_exclop_finish(fs_info);
 	return ret;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 120de408e4b97504a2d9b5ca534b383de2c73d49 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 24 Nov 2021 14:14:24 -0500
Subject: btrfs: check the root node for uptodate before returning it

Now that we clear the extent buffer uptodate if we fail to write it out
we need to check to see if our root node is uptodate before we search
down it.  Otherwise we could return stale data (or potentially corrupt
data that was caught by the write verification step) and think that the
path is OK to search down.

CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9e02ac51c664..b54ea94a7df4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1569,12 +1569,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
 							int write_lock_level)
 {
 	struct extent_buffer *b;
-	int root_lock;
+	int root_lock = 0;
 	int level = 0;
 
-	/* We try very hard to do read locks on the root */
-	root_lock = BTRFS_READ_LOCK;
-
 	if (p->search_commit_root) {
 		b = root->commit_root;
 		atomic_inc(&b->refs);
@@ -1594,6 +1591,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
 		goto out;
 	}
 
+	/* We try very hard to do read locks on the root */
+	root_lock = BTRFS_READ_LOCK;
+
 	/*
 	 * If the level is set to maximum, we can skip trying to get the read
 	 * lock.
@@ -1620,6 +1620,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
 	level = btrfs_header_level(b);
 
 out:
+	/*
+	 * The root may have failed to write out at some point, and thus is no
+	 * longer valid, return an error in this case.
+	 */
+	if (!extent_buffer_uptodate(b)) {
+		if (root_lock)
+			btrfs_tree_unlock_rw(b, root_lock);
+		free_extent_buffer(b);
+		return ERR_PTR(-EIO);
+	}
+
 	p->nodes[level] = b;
 	if (!p->skip_locking)
 		p->locks[level] = root_lock;
-- 
cgit v1.2.3-59-g8ed1b


From fb81212c07b1d1870fefdf883d2d4e8a9386308b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 2 Dec 2021 10:30:35 +0000
Subject: btrfs: allow generic_bin_search() to take low boundary as an argument

Right now generic_bin_search() always uses a low boundary slot of 0, but
in the next patch we'll want to often skip slot 0 when searching for a
key. So make generic_bin_search() have the low boundary slot specified
as an argument, and move the check for the extent buffer level from
btrfs_bin_search() to generic_bin_search() to avoid adding another
wrapper around generic_bin_search().

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 43 +++++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b54ea94a7df4..0af2429469f1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -726,21 +726,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 }
 
 /*
- * search for key in the extent_buffer.  The items start at offset p,
- * and they are item_size apart.
+ * Search for a key in the given extent_buffer.
  *
- * the slot in the array is returned via slot, and it points to
- * the place where you would insert key if it is not found in
- * the array.
+ * The lower boundary for the search is specified by the slot number @low. Use a
+ * value of 0 to search over the whole extent buffer.
  *
- * Slot may point to total number of items if the key is bigger than
- * all of the keys
+ * The slot in the extent buffer is returned via @slot. If the key exists in the
+ * extent buffer, then @slot will point to the slot where the key is, otherwise
+ * it points to the slot where you would insert the key.
+ *
+ * Slot may point to the total number of items (i.e. one position beyond the last
+ * key) if the key is bigger than the last key in the extent buffer.
  */
-static noinline int generic_bin_search(struct extent_buffer *eb,
-				       unsigned long p, int item_size,
+static noinline int generic_bin_search(struct extent_buffer *eb, int low,
 				       const struct btrfs_key *key, int *slot)
 {
-	int low = 0;
+	unsigned long p;
+	int item_size;
 	int high = btrfs_header_nritems(eb);
 	int ret;
 	const int key_size = sizeof(struct btrfs_disk_key);
@@ -753,6 +755,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 		return -EINVAL;
 	}
 
+	if (btrfs_header_level(eb) == 0) {
+		p = offsetof(struct btrfs_leaf, items);
+		item_size = sizeof(struct btrfs_item);
+	} else {
+		p = offsetof(struct btrfs_node, ptrs);
+		item_size = sizeof(struct btrfs_key_ptr);
+	}
+
 	while (low < high) {
 		unsigned long oip;
 		unsigned long offset;
@@ -791,20 +801,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 }
 
 /*
- * simple bin_search frontend that does the right thing for
- * leaves vs nodes
+ * Simple binary search on an extent buffer. Works for both leaves and nodes, and
+ * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
  */
 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
 		     int *slot)
 {
-	if (btrfs_header_level(eb) == 0)
-		return generic_bin_search(eb,
-					  offsetof(struct btrfs_leaf, items),
-					  sizeof(struct btrfs_item), key, slot);
-	else
-		return generic_bin_search(eb,
-					  offsetof(struct btrfs_node, ptrs),
-					  sizeof(struct btrfs_key_ptr), key, slot);
+	return generic_bin_search(eb, 0, key, slot);
 }
 
 static void root_add_used(struct btrfs_root *root, u32 size)
-- 
cgit v1.2.3-59-g8ed1b


From e2e58d0f8dc55533c24fc7b3e101092f571b4a43 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 2 Dec 2021 10:30:36 +0000
Subject: btrfs: try to unlock parent nodes earlier when inserting a key

When inserting a new key, we release the write lock on the leaf's parent
only after doing the binary search on the leaf. This is because if the
key ends up at slot 0, we will have to update the key at slot 0 of the
parent node. The same reasoning applies to any other upper level nodes
when their slot is 0. We also need to keep the parent locked in case the
leaf does not have enough free space to insert the new key/item, because
in that case we will split the leaf and we will need to add a new key to
the parent due to a new leaf resulting from the split operation.

However if the leaf has enough space for the new key and the key does not
end up at slot 0 of the leaf we could release our write lock on the parent
before doing the binary search on the leaf to figure out the destination
slot. That leads to reducing the amount of time other tasks are blocked
waiting to lock the parent, therefore increasing parallelism when there
are other tasks that are trying to access other leaves accessible through
the same parent. This also applies to other upper nodes besides the
immediate parent, when their slot is 0, since we keep locks on them until
we figure out if the leaf slot is slot 0 or not.

In fact, having the key end up at slot 0 is rare. Typically it
only happens when the key is less than or equal to the smallest, the
"left most", key of the entire btree, during a split attempt when we try
to push to the right sibling leaf or when the caller just wants to update
the item of an existing key. It's also very common that a leaf has enough
space to insert a new key, since after a split we move about half of the
keys from one into the new leaf.

So unlock the parent, and any other upper level nodes, when during a key
insertion we notice the key is greater than the first key in the leaf and
the leaf has enough free space. After unlocking the upper level nodes, do
the binary search using a low boundary of slot 1 and not slot 0, to figure
out the slot where the key will be inserted (or where the key already is
in case it exists and the caller wants to modify its item data).
This extra comparison, with the first key, is cheap and the key is very
likely already in a cache line because it immediately follows the header
of the extent buffer and we have recently read the level field of the
header (which in fact is the last field of the header).

The following fs_mark test was run on a non-debug kernel (debian's default
kernel config), with a 12 cores intel CPU, and using a NVMe device:

  $ cat run-fsmark.sh
  #!/bin/bash

  DEV=/dev/nvme0n1
  MNT=/mnt/nvme0n1
  MOUNT_OPTIONS="-o ssd"
  MKFS_OPTIONS="-O no-holes -R free-space-tree"
  FILES=100000
  THREADS=$(nproc --all)
  FILE_SIZE=0

  echo "performance" | \
	tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

  mkfs.btrfs -f $MKFS_OPTIONS $DEV
  mount $MOUNT_OPTIONS $DEV $MNT

  OPTS="-S 0 -L 10 -n $FILES -s $FILE_SIZE -t $THREADS -k"
  for ((i = 1; i <= $THREADS; i++)); do
      OPTS="$OPTS -d $MNT/d$i"
  done

  fs_mark $OPTS

  umount $MNT

Before this change:

FSUse%        Count         Size    Files/sec     App Overhead
     0      1200000            0     165273.6          5958381
     0      2400000            0     190938.3          6284477
     0      3600000            0     181429.1          6044059
     0      4800000            0     173979.2          6223418
     0      6000000            0     139288.0          6384560
     0      7200000            0     163000.4          6520083
     1      8400000            0      57799.2          5388544
     1      9600000            0      66461.6          5552969
     2     10800000            0      49593.5          5163675
     2     12000000            0      57672.1          4889398

After this change:

FSUse%        Count         Size    Files/sec            App Overhead
     0      1200000            0     167987.3 (+1.6%)         6272730
     0      2400000            0     198563.9 (+4.0%)         6048847
     0      3600000            0     197436.6 (+8.8%)         6163637
     0      4800000            0     202880.7 (+16.6%)        6371771
     1      6000000            0     167275.9 (+20.1%)        6556733
     1      7200000            0     204051.2 (+25.2%)        6817091
     1      8400000            0      69622.8 (+20.5%)        5525675
     1      9600000            0      69384.5 (+4.4%)         5700723
     1     10800000            0      61454.1 (+23.9%)        5363754
     3     12000000            0      61908.7 (+7.3%)         5370196

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 137 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 118 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0af2429469f1..f12172cb6c35 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1680,6 +1680,27 @@ static int finish_need_commit_sem_search(struct btrfs_path *path)
 	return 0;
 }
 
+static inline int search_for_key_slot(struct extent_buffer *eb,
+				      int search_low_slot,
+				      const struct btrfs_key *key,
+				      int prev_cmp,
+				      int *slot)
+{
+	/*
+	 * If a previous call to btrfs_bin_search() on a parent node returned an
+	 * exact match (prev_cmp == 0), we can safely assume the target key will
+	 * always be at slot 0 on lower levels, since each key pointer
+	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
+	 * subtree it points to. Thus we can skip searching lower levels.
+	 */
+	if (prev_cmp == 0) {
+		*slot = 0;
+		return 0;
+	}
+
+	return generic_bin_search(eb, search_low_slot, key, slot);
+}
+
 /*
  * btrfs_search_slot - look for a key in a tree and perform necessary
  * modifications to preserve tree invariants.
@@ -1840,25 +1861,98 @@ cow_done:
 			}
 		}
 
-		/*
-		 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
-		 * we can safely assume the target key will always be in slot 0
-		 * on lower levels due to the invariants BTRFS' btree provides,
-		 * namely that a btrfs_key_ptr entry always points to the
-		 * lowest key in the child node, thus we can skip searching
-		 * lower levels
-		 */
-		if (prev_cmp == 0) {
-			slot = 0;
-			ret = 0;
-		} else {
-			ret = btrfs_bin_search(b, key, &slot);
-			prev_cmp = ret;
+		if (level == 0) {
+			int leaf_free_space = 0;
+			int search_low_slot = 0;
+
+			/*
+			 * If we are doing an insertion, the leaf has enough free
+			 * space and the destination slot for the key is not slot
+			 * 0, then we can unlock our write lock on the parent, and
+			 * any other upper nodes, before doing the binary search
+			 * on the leaf (with search_for_key_slot()), allowing other
+			 * tasks to lock the parent and any other upper nodes.
+			 */
+			if (ins_len > 0) {
+				struct btrfs_disk_key first_key;
+
+				/*
+				 * Cache the leaf free space, since we will need it
+				 * later and it will not change until then.
+				 */
+				leaf_free_space = btrfs_leaf_free_space(b);
+
+				/*
+				 * !p->locks[1] means we have a single node tree,
+				 * the leaf is the root of the tree.
+				 */
+				if (!p->locks[1] || leaf_free_space < ins_len)
+					goto leaf_search;
+
+				ASSERT(btrfs_header_nritems(b) > 0);
+				btrfs_item_key(b, &first_key, 0);
+
+				/*
+				 * Doing the extra comparison with the first key
+				 * is cheap, taking into account that the first
+				 * key is very likely already in a cache line
+				 * because it immediately follows the extent
+				 * buffer's header and we have recently accessed
+				 * the header's level field.
+				 */
+				ret = comp_keys(&first_key, key);
+				if (ret < 0) {
+					/*
+					 * The first key is smaller than the key
+					 * we want to insert, so we are safe to
+					 * unlock all upper nodes and we have to
+					 * do the binary search.
+					 *
+					 * We do use btrfs_unlock_up_safe() and
+					 * not unlock_up() because the later does
+					 * not unlock nodes with a slot of 0.
+					 * We can safely unlock any node even if
+					 * its slot is 0 since in this case the
+					 * key does not end up at slot 0 of the
+					 * leaf and there's also no need to split
+					 * the leaf.
+					 */
+					btrfs_unlock_up_safe(p, 1);
+					search_low_slot = 1;
+				} else {
+					/*
+					 * The first key is >= then the key we
+					 * want to insert, so we can skip the
+					 * binary search as the target key will
+					 * be at slot 0.
+					 *
+					 * We can not unlock upper nodes when
+					 * the key is less than the first key,
+					 * because we will need to update the key
+					 * at slot 0 of the parent node and
+					 * possibly of other upper nodes too.
+					 * If the key matches the first key, then
+					 * we can unlock all the upper nodes,
+					 * using btrfs_unlock_up_safe() instead
+					 * of unlock_up() as stated above.
+					 */
+					if (ret == 0)
+						btrfs_unlock_up_safe(p, 1);
+					slot = 0;
+					/*
+					 * ret is already 0 or 1, matching the
+					 * result of a btrfs_bin_search() call,
+					 * so there is no need to adjust it.
+					 */
+					goto skip_leaf_search;
+				}
+			}
+leaf_search:
+			ret = search_for_key_slot(b, search_low_slot, key,
+						  prev_cmp, &slot);
 			if (ret < 0)
 				goto done;
-		}
-
-		if (level == 0) {
+skip_leaf_search:
 			p->slots[level] = slot;
 			/*
 			 * Item key already exists. In this case, if we are
@@ -1874,8 +1968,7 @@ cow_done:
 				ASSERT(ins_len >= sizeof(struct btrfs_item));
 				ins_len -= sizeof(struct btrfs_item);
 			}
-			if (ins_len > 0 &&
-			    btrfs_leaf_free_space(b) < ins_len) {
+			if (ins_len > 0 && leaf_free_space < ins_len) {
 				if (write_lock_level < 1) {
 					write_lock_level = 1;
 					btrfs_release_path(p);
@@ -1896,6 +1989,12 @@ cow_done:
 					  min_write_lock_level, NULL);
 			goto done;
 		}
+
+		ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
+		if (ret < 0)
+			goto done;
+		prev_cmp = ret;
+
 		if (ret && slot > 0) {
 			dec = 1;
 			slot--;
-- 
cgit v1.2.3-59-g8ed1b


From e5e1c1741b3de3f8d06fe4b700d83709a7da0610 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 2 Dec 2021 10:30:37 +0000
Subject: btrfs: remove useless condition check before splitting leaf

When inserting a key, we check if the write_lock_level is less than 1,
and if so we set it to 1, release the path and retry the tree traversal.

However that is unnecessary, because when ins_len is greater than 0, we
know that write_lock_level can never be less than 1.

The logic to retry is also buggy, because in case ins_len was decremented,
due to an exact key match and the search is not meant for item extension
(path->search_for_extension is 0), we retry without incrementing ins_len,
which would make the next retry decrement it again by the same amount.

So remove the check for write_lock_level being less than 1 and add an
assertion to assert it's always >= 1.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f12172cb6c35..0e81f1847941 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1969,11 +1969,7 @@ skip_leaf_search:
 				ins_len -= sizeof(struct btrfs_item);
 			}
 			if (ins_len > 0 && leaf_free_space < ins_len) {
-				if (write_lock_level < 1) {
-					write_lock_level = 1;
-					btrfs_release_path(p);
-					goto again;
-				}
+				ASSERT(write_lock_level >= 1);
 
 				err = split_leaf(trans, root, key,
 						 p, ins_len, ret == 0);
-- 
cgit v1.2.3-59-g8ed1b


From 109324cfda067b84b948002584849a02dd0a6641 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 2 Dec 2021 10:30:38 +0000
Subject: btrfs: move leaf search logic out of btrfs_search_slot()

There's quite a significant amount of code for doing the key search for a
leaf at btrfs_search_slot(), with a couple labels and gotos in it, plus
btrfs_search_slot() is already big enough.

So move the logic that does the key search on a leaf into a new helper
function. This makes it better organized, removing the need for the labels
and the gotos, as well as reducing the indentation level and the size of
btrfs_search_slot().

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 244 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 128 insertions(+), 116 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0e81f1847941..ae83f491a9e7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1701,6 +1701,132 @@ static inline int search_for_key_slot(struct extent_buffer *eb,
 	return generic_bin_search(eb, search_low_slot, key, slot);
 }
 
+static int search_leaf(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       const struct btrfs_key *key,
+		       struct btrfs_path *path,
+		       int ins_len,
+		       int prev_cmp)
+{
+	struct extent_buffer *leaf = path->nodes[0];
+	int leaf_free_space = -1;
+	int search_low_slot = 0;
+	int ret;
+	bool do_bin_search = true;
+
+	/*
+	 * If we are doing an insertion, the leaf has enough free space and the
+	 * destination slot for the key is not slot 0, then we can unlock our
+	 * write lock on the parent, and any other upper nodes, before doing the
+	 * binary search on the leaf (with search_for_key_slot()), allowing other
+	 * tasks to lock the parent and any other upper nodes.
+	 */
+	if (ins_len > 0) {
+		/*
+		 * Cache the leaf free space, since we will need it later and it
+		 * will not change until then.
+		 */
+		leaf_free_space = btrfs_leaf_free_space(leaf);
+
+		/*
+		 * !path->locks[1] means we have a single node tree, the leaf is
+		 * the root of the tree.
+		 */
+		if (path->locks[1] && leaf_free_space >= ins_len) {
+			struct btrfs_disk_key first_key;
+
+			ASSERT(btrfs_header_nritems(leaf) > 0);
+			btrfs_item_key(leaf, &first_key, 0);
+
+			/*
+			 * Doing the extra comparison with the first key is cheap,
+			 * taking into account that the first key is very likely
+			 * already in a cache line because it immediately follows
+			 * the extent buffer's header and we have recently accessed
+			 * the header's level field.
+			 */
+			ret = comp_keys(&first_key, key);
+			if (ret < 0) {
+				/*
+				 * The first key is smaller than the key we want
+				 * to insert, so we are safe to unlock all upper
+				 * nodes and we have to do the binary search.
+				 *
+				 * We do use btrfs_unlock_up_safe() and not
+				 * unlock_up() because the latter does not unlock
+				 * nodes with a slot of 0 - we can safely unlock
+				 * any node even if its slot is 0 since in this
+				 * case the key does not end up at slot 0 of the
+				 * leaf and there's no need to split the leaf.
+				 */
+				btrfs_unlock_up_safe(path, 1);
+				search_low_slot = 1;
+			} else {
+				/*
+				 * The first key is >= than the key we want to
+				 * insert, so we can skip the binary search as
+				 * the target key will be at slot 0.
+				 *
+				 * We can not unlock upper nodes when the key is
+				 * less than the first key, because we will need
+				 * to update the key at slot 0 of the parent node
+				 * and possibly of other upper nodes too.
+				 * If the key matches the first key, then we can
+				 * unlock all the upper nodes, using
+				 * btrfs_unlock_up_safe() instead of unlock_up()
+				 * as stated above.
+				 */
+				if (ret == 0)
+					btrfs_unlock_up_safe(path, 1);
+				/*
+				 * ret is already 0 or 1, matching the result of
+				 * a btrfs_bin_search() call, so there is no need
+				 * to adjust it.
+				 */
+				do_bin_search = false;
+				path->slots[0] = 0;
+			}
+		}
+	}
+
+	if (do_bin_search) {
+		ret = search_for_key_slot(leaf, search_low_slot, key,
+					  prev_cmp, &path->slots[0]);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (ins_len > 0) {
+		/*
+		 * Item key already exists. In this case, if we are allowed to
+		 * insert the item (for example, in dir_item case, item key
+		 * collision is allowed), it will be merged with the original
+		 * item. Only the item size grows, no new btrfs item will be
+		 * added. If search_for_extension is not set, ins_len already
+		 * accounts the size btrfs_item, deduct it here so leaf space
+		 * check will be correct.
+		 */
+		if (ret == 0 && !path->search_for_extension) {
+			ASSERT(ins_len >= sizeof(struct btrfs_item));
+			ins_len -= sizeof(struct btrfs_item);
+		}
+
+		ASSERT(leaf_free_space >= 0);
+
+		if (leaf_free_space < ins_len) {
+			int err;
+
+			err = split_leaf(trans, root, key, path, ins_len,
+					 (ret == 0));
+			BUG_ON(err > 0);
+			if (err)
+				ret = err;
+		}
+	}
+
+	return ret;
+}
+
 /*
  * btrfs_search_slot - look for a key in a tree and perform necessary
  * modifications to preserve tree invariants.
@@ -1862,124 +1988,10 @@ cow_done:
 		}
 
 		if (level == 0) {
-			int leaf_free_space = 0;
-			int search_low_slot = 0;
-
-			/*
-			 * If we are doing an insertion, the leaf has enough free
-			 * space and the destination slot for the key is not slot
-			 * 0, then we can unlock our write lock on the parent, and
-			 * any other upper nodes, before doing the binary search
-			 * on the leaf (with search_for_key_slot()), allowing other
-			 * tasks to lock the parent and any other upper nodes.
-			 */
-			if (ins_len > 0) {
-				struct btrfs_disk_key first_key;
-
-				/*
-				 * Cache the leaf free space, since we will need it
-				 * later and it will not change until then.
-				 */
-				leaf_free_space = btrfs_leaf_free_space(b);
-
-				/*
-				 * !p->locks[1] means we have a single node tree,
-				 * the leaf is the root of the tree.
-				 */
-				if (!p->locks[1] || leaf_free_space < ins_len)
-					goto leaf_search;
-
-				ASSERT(btrfs_header_nritems(b) > 0);
-				btrfs_item_key(b, &first_key, 0);
-
-				/*
-				 * Doing the extra comparison with the first key
-				 * is cheap, taking into account that the first
-				 * key is very likely already in a cache line
-				 * because it immediately follows the extent
-				 * buffer's header and we have recently accessed
-				 * the header's level field.
-				 */
-				ret = comp_keys(&first_key, key);
-				if (ret < 0) {
-					/*
-					 * The first key is smaller than the key
-					 * we want to insert, so we are safe to
-					 * unlock all upper nodes and we have to
-					 * do the binary search.
-					 *
-					 * We do use btrfs_unlock_up_safe() and
-					 * not unlock_up() because the later does
-					 * not unlock nodes with a slot of 0.
-					 * We can safely unlock any node even if
-					 * its slot is 0 since in this case the
-					 * key does not end up at slot 0 of the
-					 * leaf and there's also no need to split
-					 * the leaf.
-					 */
-					btrfs_unlock_up_safe(p, 1);
-					search_low_slot = 1;
-				} else {
-					/*
-					 * The first key is >= then the key we
-					 * want to insert, so we can skip the
-					 * binary search as the target key will
-					 * be at slot 0.
-					 *
-					 * We can not unlock upper nodes when
-					 * the key is less than the first key,
-					 * because we will need to update the key
-					 * at slot 0 of the parent node and
-					 * possibly of other upper nodes too.
-					 * If the key matches the first key, then
-					 * we can unlock all the upper nodes,
-					 * using btrfs_unlock_up_safe() instead
-					 * of unlock_up() as stated above.
-					 */
-					if (ret == 0)
-						btrfs_unlock_up_safe(p, 1);
-					slot = 0;
-					/*
-					 * ret is already 0 or 1, matching the
-					 * result of a btrfs_bin_search() call,
-					 * so there is no need to adjust it.
-					 */
-					goto skip_leaf_search;
-				}
-			}
-leaf_search:
-			ret = search_for_key_slot(b, search_low_slot, key,
-						  prev_cmp, &slot);
-			if (ret < 0)
-				goto done;
-skip_leaf_search:
-			p->slots[level] = slot;
-			/*
-			 * Item key already exists. In this case, if we are
-			 * allowed to insert the item (for example, in dir_item
-			 * case, item key collision is allowed), it will be
-			 * merged with the original item. Only the item size
-			 * grows, no new btrfs item will be added. If
-			 * search_for_extension is not set, ins_len already
-			 * accounts the size btrfs_item, deduct it here so leaf
-			 * space check will be correct.
-			 */
-			if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
-				ASSERT(ins_len >= sizeof(struct btrfs_item));
-				ins_len -= sizeof(struct btrfs_item);
-			}
-			if (ins_len > 0 && leaf_free_space < ins_len) {
+			if (ins_len > 0)
 				ASSERT(write_lock_level >= 1);
 
-				err = split_leaf(trans, root, key,
-						 p, ins_len, ret == 0);
-
-				BUG_ON(err > 0);
-				if (err) {
-					ret = err;
-					goto done;
-				}
-			}
+			ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
 			if (!p->search_for_split)
 				unlock_up(p, level, lowest_unlock,
 					  min_write_lock_level, NULL);
-- 
cgit v1.2.3-59-g8ed1b


From bb8e9a608055e016aace6db269432ba52a57dcc1 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 2 Dec 2021 10:30:39 +0000
Subject: btrfs: remove BUG_ON() after splitting leaf

After calling split_leaf() we BUG_ON() if the returned value is greater
than zero. However split_leaf() only returns 0, in case of success, or a
negative value in case of an error.

The reason for the BUG_ON() is that if we ever get a positive return
value from split_leaf(), we can not simply propagate it to the callers
of btrfs_search_slot(), as that would be interpreted as "key not found"
and not as an error. Callers would then carry on as if no error had
happened, which could result in silent corruption.

So change the BUG_ON() to an ASSERT() and, in case assertions are
disabled, produce a warning and set the return value to an error
(-EUCLEAN), so that we can never end up with a silent corruption or with
the error going unnoticed.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ae83f491a9e7..36fc76678e30 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1818,7 +1818,9 @@ static int search_leaf(struct btrfs_trans_handle *trans,
 
 			err = split_leaf(trans, root, key, path, ins_len,
 					 (ret == 0));
-			BUG_ON(err > 0);
+			ASSERT(err <= 0);
+			if (WARN_ON(err > 0))
+				err = -EUCLEAN;
 			if (err)
 				ret = err;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 727e60604f6a61b8e4330e8fe63a8cf0a067d29d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 2 Dec 2021 10:30:40 +0000
Subject: btrfs: remove stale comment about locking at btrfs_search_slot()

The comment refers to the old extent buffer locking code, where we used to
have custom locks that had blocking and spinning behaviour modes. That is
not the case anymore, since we have transitioned to rw semaphores, so the
comment does not offer any value anymore. Remove it.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 36fc76678e30..defc494a2d08 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1964,10 +1964,6 @@ again:
 		}
 cow_done:
 		p->nodes[level] = b;
-		/*
-		 * Leave path with blocking locks to avoid massive
-		 * lock context switch, this is made on purpose.
-		 */
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
-- 
cgit v1.2.3-59-g8ed1b


From 26c2c4540d6d5c85a22a857ccda304361f1afeaf Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:03 -0500
Subject: btrfs: add an inode-item.h

We have a few helpers in inode-item.c, and I'm going to make a few
changes to how we do truncate in the future, so break out these
definitions into their own header file to trim down ctree.h some and
make it easier to do the work on truncate in the future.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h            | 30 ------------------------------
 fs/btrfs/delayed-inode.c    |  1 +
 fs/btrfs/free-space-cache.c |  1 +
 fs/btrfs/inode-item.c       |  1 +
 fs/btrfs/inode-item.h       | 45 +++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/inode.c            |  1 +
 fs/btrfs/relocation.c       |  1 +
 fs/btrfs/tree-log.c         |  1 +
 8 files changed, 51 insertions(+), 30 deletions(-)
 create mode 100644 fs/btrfs/inode-item.h

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4ea7157ca03c..04e480740438 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3131,36 +3131,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, u64 offset);
 int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
 
-/* inode-item.c */
-int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root,
-			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid, u64 index);
-int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root,
-			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid, u64 *index);
-int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root,
-			     struct btrfs_path *path, u64 objectid);
-int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
-		       *root, struct btrfs_path *path,
-		       struct btrfs_key *location, int mod);
-
-struct btrfs_inode_extref *
-btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct btrfs_path *path,
-			  const char *name, int name_len,
-			  u64 inode_objectid, u64 ref_objectid, int ins_len,
-			  int cow);
-
-struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
-						   int slot, const char *name,
-						   int name_len);
-struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
-		struct extent_buffer *leaf, int slot, u64 ref_objectid,
-		const char *name, int name_len);
 /* file-item.c */
 struct btrfs_dio_private;
 int btrfs_del_csums(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 6f134f2c5e68..748bf6b0d860 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -13,6 +13,7 @@
 #include "ctree.h"
 #include "qgroup.h"
 #include "locking.h"
+#include "inode-item.h"
 
 #define BTRFS_DELAYED_WRITEBACK		512
 #define BTRFS_DELAYED_BACKGROUND	128
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 132209ff2262..55e1be703a39 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -23,6 +23,7 @@
 #include "block-group.h"
 #include "discard.h"
 #include "subpage.h"
+#include "inode-item.h"
 
 #define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
 #define MAX_CACHE_BYTES_PER_GIG	SZ_64K
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 56755ce9a907..72593a93c43c 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -4,6 +4,7 @@
  */
 
 #include "ctree.h"
+#include "inode-item.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "print-tree.h"
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
new file mode 100644
index 000000000000..30476bcdf413
--- /dev/null
+++ b/fs/btrfs/inode-item.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_INODE_ITEM_H
+#define BTRFS_INODE_ITEM_H
+
+#include <linux/types.h>
+
+struct btrfs_trans_handle;
+struct btrfs_root;
+struct btrfs_path;
+struct btrfs_key;
+struct btrfs_inode_extref;
+struct extent_buffer;
+
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   const char *name, int name_len,
+			   u64 inode_objectid, u64 ref_objectid, u64 index);
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   const char *name, int name_len,
+			   u64 inode_objectid, u64 ref_objectid, u64 *index);
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid);
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+		       *root, struct btrfs_path *path,
+		       struct btrfs_key *location, int mod);
+
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(
+			  struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path,
+			  const char *name, int name_len,
+			  u64 inode_objectid, u64 ref_objectid, int ins_len,
+			  int cow);
+
+struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+						   int slot, const char *name,
+						   int name_len);
+struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
+		struct extent_buffer *leaf, int slot, u64 ref_objectid,
+		const char *name, int name_len);
+
+#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 504cf090fc88..6ccdcf76b02f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -54,6 +54,7 @@
 #include "space-info.h"
 #include "zoned.h"
 #include "subpage.h"
+#include "inode-item.h"
 
 struct btrfs_iget_args {
 	u64 ino;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 775aff5a2c26..f5465197996d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -26,6 +26,7 @@
 #include "misc.h"
 #include "subpage.h"
 #include "zoned.h"
+#include "inode-item.h"
 
 /*
  * Relocation overview
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7c4634ac511e..64b42f4b11f2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -20,6 +20,7 @@
 #include "block-group.h"
 #include "space-info.h"
 #include "zoned.h"
+#include "inode-item.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
-- 
cgit v1.2.3-59-g8ed1b


From 54f03ab1e19b04dea546f83ae70b3285bc61b9f8 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:04 -0500
Subject: btrfs: move btrfs_truncate_inode_items to inode-item.c

This is an inode item related manipulation with a few vfs related
adjustments.  I'm going to remove the vfs related code from this helper
and simplify it a lot, but I want those changes to be easily seen via
git blame, so move this function now and then the simplification work
can be done.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h      |   4 -
 fs/btrfs/inode-item.c | 372 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/inode-item.h |  11 ++
 fs/btrfs/inode.c      | 391 +-------------------------------------------------
 4 files changed, 387 insertions(+), 391 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 04e480740438..459d00211181 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3190,10 +3190,6 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
 			 int front);
-int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       struct btrfs_inode *inode, u64 new_size,
-			       u32 min_type, u64 *extents_found);
 
 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72593a93c43c..c43a3610f443 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -418,3 +418,375 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	return ret;
 }
+
+/*
+ * Remove inode items from a given root.
+ *
+ * @trans:		A transaction handle.
+ * @root:		The root from which to remove items.
+ * @inode:		The inode whose items we want to remove.
+ * @new_size:		The new i_size for the inode. This is only applicable when
+ *			@min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
+ * @min_type:		The minimum key type to remove. All keys with a type
+ *			greater than this value are removed and all keys with
+ *			this type are removed only if their offset is >= @new_size.
+ * @extents_found:	Output parameter that will contain the number of file
+ *			extent items that were removed or adjusted to the new
+ *			inode i_size. The caller is responsible for initializing
+ *			the counter. Also, it can be NULL if the caller does not
+ *			need this counter.
+ *
+ * Remove all keys associated with the inode from the given root that have a key
+ * with a type greater than or equals to @min_type. When @min_type has a value of
+ * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
+ * greater than or equals to @new_size. If a file extent item that starts before
+ * @new_size and ends after it is found, its length is adjusted.
+ *
+ * Returns: 0 on success, < 0 on error and BTRFS_NEED_TRUNCATE_BLOCK when
+ * @min_type is BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
+ */
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_inode *inode,
+			       u64 new_size, u32 min_type,
+			       u64 *extents_found)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	u64 extent_start = 0;
+	u64 extent_num_bytes = 0;
+	u64 extent_offset = 0;
+	u64 item_end = 0;
+	u64 last_size = new_size;
+	u32 found_type = (u8)-1;
+	int found_extent;
+	int del_item;
+	int pending_del_nr = 0;
+	int pending_del_slot = 0;
+	int extent_type = -1;
+	int ret;
+	u64 ino = btrfs_ino(inode);
+	u64 bytes_deleted = 0;
+	bool be_nice = false;
+	bool should_throttle = false;
+	const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
+	struct extent_state *cached_state = NULL;
+
+	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
+
+	/*
+	 * For non-free space inodes and non-shareable roots, we want to back
+	 * off from time to time.  This means all inodes in subvolume roots,
+	 * reloc roots, and data reloc roots.
+	 */
+	if (!btrfs_is_free_space_inode(inode) &&
+	    test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+		be_nice = true;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = READA_BACK;
+
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+		lock_extent_bits(&inode->io_tree, lock_start, (u64)-1,
+				 &cached_state);
+
+		/*
+		 * We want to drop from the next block forward in case this
+		 * new size is not block aligned since we will be keeping the
+		 * last block of the extent just the way it is.
+		 */
+		btrfs_drop_extent_cache(inode, ALIGN(new_size,
+					fs_info->sectorsize),
+					(u64)-1, 0);
+	}
+
+	/*
+	 * This function is also used to drop the items in the log tree before
+	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
+	 * it is used to drop the logged items. So we shouldn't kill the delayed
+	 * items.
+	 */
+	if (min_type == 0 && root == inode->root)
+		btrfs_kill_delayed_inode_items(inode);
+
+	key.objectid = ino;
+	key.offset = (u64)-1;
+	key.type = (u8)-1;
+
+search_again:
+	/*
+	 * With a 16K leaf size and 128MiB extents, you can actually queue up a
+	 * huge file in a single leaf.  Most of the time that bytes_deleted is
+	 * > 0, it will be huge by the time we get here
+	 */
+	if (be_nice && bytes_deleted > SZ_32M &&
+	    btrfs_should_end_transaction(trans)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = 0;
+		/* There are no items in the tree for us to truncate, we're done */
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+	}
+
+	while (1) {
+		u64 clear_start = 0, clear_len = 0;
+
+		fi = NULL;
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		found_type = found_key.type;
+
+		if (found_key.objectid != ino)
+			break;
+
+		if (found_type < min_type)
+			break;
+
+		item_end = found_key.offset;
+		if (found_type == BTRFS_EXTENT_DATA_KEY) {
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+			extent_type = btrfs_file_extent_type(leaf, fi);
+			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+				item_end +=
+				    btrfs_file_extent_num_bytes(leaf, fi);
+
+				trace_btrfs_truncate_show_fi_regular(
+					inode, leaf, fi, found_key.offset);
+			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				item_end += btrfs_file_extent_ram_bytes(leaf, fi);
+
+				trace_btrfs_truncate_show_fi_inline(
+					inode, leaf, fi, path->slots[0],
+					found_key.offset);
+			}
+			item_end--;
+		}
+		if (found_type > min_type) {
+			del_item = 1;
+		} else {
+			if (item_end < new_size)
+				break;
+			if (found_key.offset >= new_size)
+				del_item = 1;
+			else
+				del_item = 0;
+		}
+		found_extent = 0;
+		/* FIXME, shrink the extent if the ref count is only 1 */
+		if (found_type != BTRFS_EXTENT_DATA_KEY)
+			goto delete;
+
+		if (extents_found != NULL)
+			(*extents_found)++;
+
+		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+			u64 num_dec;
+
+			clear_start = found_key.offset;
+			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
+			if (!del_item) {
+				u64 orig_num_bytes =
+					btrfs_file_extent_num_bytes(leaf, fi);
+				extent_num_bytes = ALIGN(new_size -
+						found_key.offset,
+						fs_info->sectorsize);
+				clear_start = ALIGN(new_size, fs_info->sectorsize);
+
+				btrfs_set_file_extent_num_bytes(leaf, fi,
+							 extent_num_bytes);
+				num_dec = (orig_num_bytes - extent_num_bytes);
+				if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
+				    extent_start != 0)
+					inode_sub_bytes(&inode->vfs_inode,
+							num_dec);
+				btrfs_mark_buffer_dirty(leaf);
+			} else {
+				extent_num_bytes =
+					btrfs_file_extent_disk_num_bytes(leaf, fi);
+				extent_offset = found_key.offset -
+					btrfs_file_extent_offset(leaf, fi);
+
+				/* FIXME blocksize != 4096 */
+				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
+				if (extent_start != 0) {
+					found_extent = 1;
+					if (test_bit(BTRFS_ROOT_SHAREABLE,
+						     &root->state))
+						inode_sub_bytes(&inode->vfs_inode,
+								num_dec);
+				}
+			}
+			clear_len = num_dec;
+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			/*
+			 * We can't truncate inline items that have had
+			 * special encodings
+			 */
+			if (!del_item &&
+			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
+			    btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
+			    btrfs_file_extent_compression(leaf, fi) == 0) {
+				u32 size = (u32)(new_size - found_key.offset);
+
+				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+				size = btrfs_file_extent_calc_inline_size(size);
+				btrfs_truncate_item(path, size, 1);
+			} else if (!del_item) {
+				/*
+				 * We have to bail so the last_size is set to
+				 * just before this extent.
+				 */
+				ret = BTRFS_NEED_TRUNCATE_BLOCK;
+				break;
+			} else {
+				/*
+				 * Inline extents are special, we just treat
+				 * them as a full sector worth in the file
+				 * extent tree just for simplicity sake.
+				 */
+				clear_len = fs_info->sectorsize;
+			}
+
+			if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+				inode_sub_bytes(&inode->vfs_inode,
+						item_end + 1 - new_size);
+		}
+delete:
+		/*
+		 * We use btrfs_truncate_inode_items() to clean up log trees for
+		 * multiple fsyncs, and in this case we don't want to clear the
+		 * file extent range because it's just the log.
+		 */
+		if (root == inode->root) {
+			ret = btrfs_inode_clear_file_extent_range(inode,
+						  clear_start, clear_len);
+			if (ret) {
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
+		}
+
+		if (del_item)
+			last_size = found_key.offset;
+		else
+			last_size = new_size;
+		if (del_item) {
+			if (!pending_del_nr) {
+				/* No pending yet, add ourselves */
+				pending_del_slot = path->slots[0];
+				pending_del_nr = 1;
+			} else if (pending_del_nr &&
+				   path->slots[0] + 1 == pending_del_slot) {
+				/* Hop on the pending chunk */
+				pending_del_nr++;
+				pending_del_slot = path->slots[0];
+			} else {
+				BUG();
+			}
+		} else {
+			break;
+		}
+		should_throttle = false;
+
+		if (found_extent &&
+		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+			struct btrfs_ref ref = { 0 };
+
+			bytes_deleted += extent_num_bytes;
+
+			btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
+					extent_start, extent_num_bytes, 0);
+			btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+					ino, extent_offset,
+					root->root_key.objectid, false);
+			ret = btrfs_free_extent(trans, &ref);
+			if (ret) {
+				btrfs_abort_transaction(trans, ret);
+				break;
+			}
+			if (be_nice) {
+				if (btrfs_should_throttle_delayed_refs(trans))
+					should_throttle = true;
+			}
+		}
+
+		if (found_type == BTRFS_INODE_ITEM_KEY)
+			break;
+
+		if (path->slots[0] == 0 ||
+		    path->slots[0] != pending_del_slot ||
+		    should_throttle) {
+			if (pending_del_nr) {
+				ret = btrfs_del_items(trans, root, path,
+						pending_del_slot,
+						pending_del_nr);
+				if (ret) {
+					btrfs_abort_transaction(trans, ret);
+					break;
+				}
+				pending_del_nr = 0;
+			}
+			btrfs_release_path(path);
+
+			/*
+			 * We can generate a lot of delayed refs, so we need to
+			 * throttle every once in a while and make sure we're
+			 * adding enough space to keep up with the work we are
+			 * generating.  Since we hold a transaction here we
+			 * can't flush, and we don't want to FLUSH_LIMIT because
+			 * we could have generated too many delayed refs to
+			 * actually allocate, so just bail if we're short and
+			 * let the normal reservation dance happen higher up.
+			 */
+			if (should_throttle) {
+				ret = btrfs_delayed_refs_rsv_refill(fs_info,
+							BTRFS_RESERVE_NO_FLUSH);
+				if (ret) {
+					ret = -EAGAIN;
+					break;
+				}
+			}
+			goto search_again;
+		} else {
+			path->slots[0]--;
+		}
+	}
+out:
+	if (ret >= 0 && pending_del_nr) {
+		int err;
+
+		err = btrfs_del_items(trans, root, path, pending_del_slot,
+				      pending_del_nr);
+		if (err) {
+			btrfs_abort_transaction(trans, err);
+			ret = err;
+		}
+	}
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+		ASSERT(last_size >= new_size);
+		if (!ret && last_size > new_size)
+			last_size = new_size;
+		btrfs_inode_safe_disk_i_size_write(inode, last_size);
+		unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1,
+				     &cached_state);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index 30476bcdf413..4464d7026180 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -10,8 +10,19 @@ struct btrfs_root;
 struct btrfs_path;
 struct btrfs_key;
 struct btrfs_inode_extref;
+struct btrfs_inode;
 struct extent_buffer;
 
+/*
+ * Return this if we need to call truncate_block for the last bit of the
+ * truncate.
+ */
+#define BTRFS_NEED_TRUNCATE_BLOCK		1
+
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_inode *inode, u64 new_size,
+			       u32 min_type, u64 *extents_found);
 int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6ccdcf76b02f..c29e7c87ff27 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4616,389 +4616,6 @@ out:
 	return err;
 }
 
-/*
- * Return this if we need to call truncate_block for the last bit of the
- * truncate.
- */
-#define NEED_TRUNCATE_BLOCK 1
-
-/*
- * Remove inode items from a given root.
- *
- * @trans:		A transaction handle.
- * @root:		The root from which to remove items.
- * @inode:		The inode whose items we want to remove.
- * @new_size:		The new i_size for the inode. This is only applicable when
- *			@min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
- * @min_type:		The minimum key type to remove. All keys with a type
- *			greater than this value are removed and all keys with
- *			this type are removed only if their offset is >= @new_size.
- * @extents_found:	Output parameter that will contain the number of file
- *			extent items that were removed or adjusted to the new
- *			inode i_size. The caller is responsible for initializing
- *			the counter. Also, it can be NULL if the caller does not
- *			need this counter.
- *
- * Remove all keys associated with the inode from the given root that have a key
- * with a type greater than or equals to @min_type. When @min_type has a value of
- * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
- * greater than or equals to @new_size. If a file extent item that starts before
- * @new_size and ends after it is found, its length is adjusted.
- *
- * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
- * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
- */
-int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       struct btrfs_inode *inode,
-			       u64 new_size, u32 min_type,
-			       u64 *extents_found)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_path *path;
-	struct extent_buffer *leaf;
-	struct btrfs_file_extent_item *fi;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	u64 extent_start = 0;
-	u64 extent_num_bytes = 0;
-	u64 extent_offset = 0;
-	u64 item_end = 0;
-	u64 last_size = new_size;
-	u32 found_type = (u8)-1;
-	int found_extent;
-	int del_item;
-	int pending_del_nr = 0;
-	int pending_del_slot = 0;
-	int extent_type = -1;
-	int ret;
-	u64 ino = btrfs_ino(inode);
-	u64 bytes_deleted = 0;
-	bool be_nice = false;
-	bool should_throttle = false;
-	const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
-	struct extent_state *cached_state = NULL;
-
-	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
-
-	/*
-	 * For non-free space inodes and non-shareable roots, we want to back
-	 * off from time to time.  This means all inodes in subvolume roots,
-	 * reloc roots, and data reloc roots.
-	 */
-	if (!btrfs_is_free_space_inode(inode) &&
-	    test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
-		be_nice = true;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->reada = READA_BACK;
-
-	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-		lock_extent_bits(&inode->io_tree, lock_start, (u64)-1,
-				 &cached_state);
-
-		/*
-		 * We want to drop from the next block forward in case this
-		 * new size is not block aligned since we will be keeping the
-		 * last block of the extent just the way it is.
-		 */
-		btrfs_drop_extent_cache(inode, ALIGN(new_size,
-					fs_info->sectorsize),
-					(u64)-1, 0);
-	}
-
-	/*
-	 * This function is also used to drop the items in the log tree before
-	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
-	 * it is used to drop the logged items. So we shouldn't kill the delayed
-	 * items.
-	 */
-	if (min_type == 0 && root == inode->root)
-		btrfs_kill_delayed_inode_items(inode);
-
-	key.objectid = ino;
-	key.offset = (u64)-1;
-	key.type = (u8)-1;
-
-search_again:
-	/*
-	 * with a 16K leaf size and 128MB extents, you can actually queue
-	 * up a huge file in a single leaf.  Most of the time that
-	 * bytes_deleted is > 0, it will be huge by the time we get here
-	 */
-	if (be_nice && bytes_deleted > SZ_32M &&
-	    btrfs_should_end_transaction(trans)) {
-		ret = -EAGAIN;
-		goto out;
-	}
-
-	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret < 0)
-		goto out;
-
-	if (ret > 0) {
-		ret = 0;
-		/* there are no items in the tree for us to truncate, we're
-		 * done
-		 */
-		if (path->slots[0] == 0)
-			goto out;
-		path->slots[0]--;
-	}
-
-	while (1) {
-		u64 clear_start = 0, clear_len = 0;
-
-		fi = NULL;
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		found_type = found_key.type;
-
-		if (found_key.objectid != ino)
-			break;
-
-		if (found_type < min_type)
-			break;
-
-		item_end = found_key.offset;
-		if (found_type == BTRFS_EXTENT_DATA_KEY) {
-			fi = btrfs_item_ptr(leaf, path->slots[0],
-					    struct btrfs_file_extent_item);
-			extent_type = btrfs_file_extent_type(leaf, fi);
-			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
-				item_end +=
-				    btrfs_file_extent_num_bytes(leaf, fi);
-
-				trace_btrfs_truncate_show_fi_regular(
-					inode, leaf, fi, found_key.offset);
-			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-				item_end += btrfs_file_extent_ram_bytes(leaf,
-									fi);
-
-				trace_btrfs_truncate_show_fi_inline(
-					inode, leaf, fi, path->slots[0],
-					found_key.offset);
-			}
-			item_end--;
-		}
-		if (found_type > min_type) {
-			del_item = 1;
-		} else {
-			if (item_end < new_size)
-				break;
-			if (found_key.offset >= new_size)
-				del_item = 1;
-			else
-				del_item = 0;
-		}
-		found_extent = 0;
-		/* FIXME, shrink the extent if the ref count is only 1 */
-		if (found_type != BTRFS_EXTENT_DATA_KEY)
-			goto delete;
-
-		if (extents_found != NULL)
-			(*extents_found)++;
-
-		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
-			u64 num_dec;
-
-			clear_start = found_key.offset;
-			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
-			if (!del_item) {
-				u64 orig_num_bytes =
-					btrfs_file_extent_num_bytes(leaf, fi);
-				extent_num_bytes = ALIGN(new_size -
-						found_key.offset,
-						fs_info->sectorsize);
-				clear_start = ALIGN(new_size, fs_info->sectorsize);
-				btrfs_set_file_extent_num_bytes(leaf, fi,
-							 extent_num_bytes);
-				num_dec = (orig_num_bytes -
-					   extent_num_bytes);
-				if (test_bit(BTRFS_ROOT_SHAREABLE,
-					     &root->state) &&
-				    extent_start != 0)
-					inode_sub_bytes(&inode->vfs_inode,
-							num_dec);
-				btrfs_mark_buffer_dirty(leaf);
-			} else {
-				extent_num_bytes =
-					btrfs_file_extent_disk_num_bytes(leaf,
-									 fi);
-				extent_offset = found_key.offset -
-					btrfs_file_extent_offset(leaf, fi);
-
-				/* FIXME blocksize != 4096 */
-				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
-				if (extent_start != 0) {
-					found_extent = 1;
-					if (test_bit(BTRFS_ROOT_SHAREABLE,
-						     &root->state))
-						inode_sub_bytes(&inode->vfs_inode,
-								num_dec);
-				}
-			}
-			clear_len = num_dec;
-		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-			/*
-			 * we can't truncate inline items that have had
-			 * special encodings
-			 */
-			if (!del_item &&
-			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
-			    btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
-			    btrfs_file_extent_compression(leaf, fi) == 0) {
-				u32 size = (u32)(new_size - found_key.offset);
-
-				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
-				size = btrfs_file_extent_calc_inline_size(size);
-				btrfs_truncate_item(path, size, 1);
-			} else if (!del_item) {
-				/*
-				 * We have to bail so the last_size is set to
-				 * just before this extent.
-				 */
-				ret = NEED_TRUNCATE_BLOCK;
-				break;
-			} else {
-				/*
-				 * Inline extents are special, we just treat
-				 * them as a full sector worth in the file
-				 * extent tree just for simplicity sake.
-				 */
-				clear_len = fs_info->sectorsize;
-			}
-
-			if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
-				inode_sub_bytes(&inode->vfs_inode,
-						item_end + 1 - new_size);
-		}
-delete:
-		/*
-		 * We use btrfs_truncate_inode_items() to clean up log trees for
-		 * multiple fsyncs, and in this case we don't want to clear the
-		 * file extent range because it's just the log.
-		 */
-		if (root == inode->root) {
-			ret = btrfs_inode_clear_file_extent_range(inode,
-						  clear_start, clear_len);
-			if (ret) {
-				btrfs_abort_transaction(trans, ret);
-				break;
-			}
-		}
-
-		if (del_item)
-			last_size = found_key.offset;
-		else
-			last_size = new_size;
-		if (del_item) {
-			if (!pending_del_nr) {
-				/* no pending yet, add ourselves */
-				pending_del_slot = path->slots[0];
-				pending_del_nr = 1;
-			} else if (pending_del_nr &&
-				   path->slots[0] + 1 == pending_del_slot) {
-				/* hop on the pending chunk */
-				pending_del_nr++;
-				pending_del_slot = path->slots[0];
-			} else {
-				BUG();
-			}
-		} else {
-			break;
-		}
-		should_throttle = false;
-
-		if (found_extent &&
-		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-			struct btrfs_ref ref = { 0 };
-
-			bytes_deleted += extent_num_bytes;
-
-			btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
-					extent_start, extent_num_bytes, 0);
-			btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
-					ino, extent_offset,
-					root->root_key.objectid, false);
-			ret = btrfs_free_extent(trans, &ref);
-			if (ret) {
-				btrfs_abort_transaction(trans, ret);
-				break;
-			}
-			if (be_nice) {
-				if (btrfs_should_throttle_delayed_refs(trans))
-					should_throttle = true;
-			}
-		}
-
-		if (found_type == BTRFS_INODE_ITEM_KEY)
-			break;
-
-		if (path->slots[0] == 0 ||
-		    path->slots[0] != pending_del_slot ||
-		    should_throttle) {
-			if (pending_del_nr) {
-				ret = btrfs_del_items(trans, root, path,
-						pending_del_slot,
-						pending_del_nr);
-				if (ret) {
-					btrfs_abort_transaction(trans, ret);
-					break;
-				}
-				pending_del_nr = 0;
-			}
-			btrfs_release_path(path);
-
-			/*
-			 * We can generate a lot of delayed refs, so we need to
-			 * throttle every once and a while and make sure we're
-			 * adding enough space to keep up with the work we are
-			 * generating.  Since we hold a transaction here we
-			 * can't flush, and we don't want to FLUSH_LIMIT because
-			 * we could have generated too many delayed refs to
-			 * actually allocate, so just bail if we're short and
-			 * let the normal reservation dance happen higher up.
-			 */
-			if (should_throttle) {
-				ret = btrfs_delayed_refs_rsv_refill(fs_info,
-							BTRFS_RESERVE_NO_FLUSH);
-				if (ret) {
-					ret = -EAGAIN;
-					break;
-				}
-			}
-			goto search_again;
-		} else {
-			path->slots[0]--;
-		}
-	}
-out:
-	if (ret >= 0 && pending_del_nr) {
-		int err;
-
-		err = btrfs_del_items(trans, root, path, pending_del_slot,
-				      pending_del_nr);
-		if (err) {
-			btrfs_abort_transaction(trans, err);
-			ret = err;
-		}
-	}
-	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-		ASSERT(last_size >= new_size);
-		if (!ret && last_size > new_size)
-			last_size = new_size;
-		btrfs_inode_safe_disk_i_size_write(inode, last_size);
-		unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1,
-				     &cached_state);
-	}
-
-	btrfs_free_path(path);
-	return ret;
-}
-
 /*
  * btrfs_truncate_block - read, zero a chunk and write a block
  * @inode - inode that we're zeroing
@@ -8997,11 +8614,11 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 
 	/*
 	 * We can't call btrfs_truncate_block inside a trans handle as we could
-	 * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know
-	 * we've truncated everything except the last little bit, and can do
-	 * btrfs_truncate_block and then update the disk_i_size.
+	 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
+	 * know we've truncated everything except the last little bit, and can
+	 * do btrfs_truncate_block and then update the disk_i_size.
 	 */
-	if (ret == NEED_TRUNCATE_BLOCK) {
+	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
 		btrfs_end_transaction(trans);
 		btrfs_btree_balance_dirty(fs_info);
 
-- 
cgit v1.2.3-59-g8ed1b


From 9a4a1429acbe0508095dbb2c54e9e8b78dfe52f0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:05 -0500
Subject: btrfs: move extent locking outside of btrfs_truncate_inode_items

Currently we are locking the extent and dropping the extent cache for
any inodes we truncate, unless they're in the tree log.  We call this
helper from:

- truncate
- evict
- tree log
- free space cache truncation

For evict we've already dropped all of the extent cache for this inode
once we've gotten here, and we're the only one accessing this inode, so
this step is unnecessary.

For the tree log code we already skip this part.

Pull this work into the truncate path and the free space cache
truncation path.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 20 +++++++++++++-------
 fs/btrfs/inode-item.c       | 18 ------------------
 fs/btrfs/inode.c            | 18 ++++++++++++++++++
 3 files changed, 31 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 55e1be703a39..28b9c63ba536 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -289,9 +289,11 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
 
 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 				    struct btrfs_block_group *block_group,
-				    struct inode *inode)
+				    struct inode *vfs_inode)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+	struct btrfs_root *root = inode->root;
+	struct extent_state *cached_state = NULL;
 	int ret = 0;
 	bool locked = false;
 
@@ -321,19 +323,23 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 		btrfs_free_path(path);
 	}
 
-	btrfs_i_size_write(BTRFS_I(inode), 0);
-	truncate_pagecache(inode, 0);
+	btrfs_i_size_write(inode, 0);
+	truncate_pagecache(vfs_inode, 0);
+
+	lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
+	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 
 	/*
 	 * We skip the throttling logic for free space cache inodes, so we don't
 	 * need to check for -EAGAIN.
 	 */
-	ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
-					 0, BTRFS_EXTENT_DATA_KEY, NULL);
+	ret = btrfs_truncate_inode_items(trans, root, inode, 0,
+					 BTRFS_EXTENT_DATA_KEY, NULL);
+	unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
 	if (ret)
 		goto fail;
 
-	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+	ret = btrfs_update_inode(trans, root, inode);
 
 fail:
 	if (locked)
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index c43a3610f443..604ace69574b 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -473,8 +473,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	u64 bytes_deleted = 0;
 	bool be_nice = false;
 	bool should_throttle = false;
-	const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
-	struct extent_state *cached_state = NULL;
 
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
@@ -492,20 +490,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	path->reada = READA_BACK;
 
-	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-		lock_extent_bits(&inode->io_tree, lock_start, (u64)-1,
-				 &cached_state);
-
-		/*
-		 * We want to drop from the next block forward in case this
-		 * new size is not block aligned since we will be keeping the
-		 * last block of the extent just the way it is.
-		 */
-		btrfs_drop_extent_cache(inode, ALIGN(new_size,
-					fs_info->sectorsize),
-					(u64)-1, 0);
-	}
-
 	/*
 	 * This function is also used to drop the items in the log tree before
 	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
@@ -783,8 +767,6 @@ out:
 		if (!ret && last_size > new_size)
 			last_size = new_size;
 		btrfs_inode_safe_disk_i_size_write(inode, last_size);
-		unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1,
-				     &cached_state);
 	}
 
 	btrfs_free_path(path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c29e7c87ff27..9f9cd6c598fe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8583,10 +8583,28 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 	trans->block_rsv = rsv;
 
 	while (1) {
+		struct extent_state *cached_state = NULL;
+		const u64 new_size = inode->i_size;
+		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
+
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+				 &cached_state);
+		/*
+		 * We want to drop from the next block forward in case this new
+		 * size is not block aligned since we will be keeping the last
+		 * block of the extent just the way it is.
+		 */
+		btrfs_drop_extent_cache(BTRFS_I(inode),
+					ALIGN(new_size, fs_info->sectorsize),
+					(u64)-1, 0);
+
 		ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY,
 						 &extents_found);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
+				     (u64)-1, &cached_state);
+
 		trans->block_rsv = &fs_info->trans_block_rsv;
 		if (ret != -ENOSPC && ret != -EAGAIN)
 			break;
-- 
cgit v1.2.3-59-g8ed1b


From 275312a03c625a35c33440d3937804b17f894e4f Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:06 -0500
Subject: btrfs: remove free space cache inode check in
 btrfs_truncate_inode_items

We no longer have the inode cache feature, so this check is extraneous
as the only free space cache inodes are in the tree_root, which is not
marked as SHAREABLE.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode-item.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 604ace69574b..379baeccc7d6 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -477,12 +477,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
 	/*
-	 * For non-free space inodes and non-shareable roots, we want to back
-	 * off from time to time.  This means all inodes in subvolume roots,
-	 * reloc roots, and data reloc roots.
+	 * For shareable roots we want to back off from time to time, this turns
+	 * out to be subvolume roots, reloc roots, and data reloc roots.
 	 */
-	if (!btrfs_is_free_space_inode(inode) &&
-	    test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 		be_nice = true;
 
 	path = btrfs_alloc_path();
-- 
cgit v1.2.3-59-g8ed1b


From 2adc75d61203a7d7d0178e1c5402e1f792ac4636 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:07 -0500
Subject: btrfs: move btrfs_kill_delayed_inode_items into evict

We have a special case in btrfs_truncate_inode_items() to call
btrfs_kill_delayed_inode_items() if min_type == 0, which is only called
during evict.

Instead move this out into evict proper, and add some comments because I
erroneously attempted to remove this code altogether without
understanding what we were doing.

Evict only commits the delayed inode item update, because all we care
about is making sure the i_nlink count has hit disk.  If we had pending
deletions we don't want to process those via the delayed inode updates;
we simply want to drop all of them and reclaim the reserved metadata
space.  From there, btrfs_truncate_inode_items() will do the work to
remove all of the items as appropriate.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode-item.c |  9 ---------
 fs/btrfs/inode.c      | 12 ++++++++++++
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 379baeccc7d6..a7c1408cada8 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -488,15 +488,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	path->reada = READA_BACK;
 
-	/*
-	 * This function is also used to drop the items in the log tree before
-	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
-	 * it is used to drop the logged items. So we shouldn't kill the delayed
-	 * items.
-	 */
-	if (min_type == 0 && root == inode->root)
-		btrfs_kill_delayed_inode_items(inode);
-
 	key.objectid = ino;
 	key.offset = (u64)-1;
 	key.type = (u8)-1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9f9cd6c598fe..7aabf419e967 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5226,10 +5226,22 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 
+	/*
+	 * This makes sure the inode item in tree is uptodate and the space for
+	 * the inode update is released.
+	 */
 	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
 	if (ret)
 		goto no_delete;
 
+	/*
+	 * This drops any pending insert or delete operations we have for this
+	 * inode.  We could have a delayed dir index deletion queued up, but
+	 * we're removing the inode completely so that'll be taken care of in
+	 * the truncate.
+	 */
+	btrfs_kill_delayed_inode_items(BTRFS_I(inode));
+
 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv)
 		goto no_delete;
-- 
cgit v1.2.3-59-g8ed1b


From 7097a941bf75ed1f632342937984a24f0b1d76a4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:08 -0500
Subject: btrfs: remove found_extent from btrfs_truncate_inode_items

We only set this if we find a normal file extent, del_item == 1, and the
file extent points to a real extent and isn't a hole extent.  We can use
del_item == 1 && extent_start != 0 to get the same information that
found_extent provides, so remove this variable and use the other
variables instead.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode-item.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a7c1408cada8..b79578895d3b 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -457,13 +457,11 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	struct btrfs_file_extent_item *fi;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	u64 extent_start = 0;
 	u64 extent_num_bytes = 0;
 	u64 extent_offset = 0;
 	u64 item_end = 0;
 	u64 last_size = new_size;
 	u32 found_type = (u8)-1;
-	int found_extent;
 	int del_item;
 	int pending_del_nr = 0;
 	int pending_del_slot = 0;
@@ -517,7 +515,7 @@ search_again:
 	}
 
 	while (1) {
-		u64 clear_start = 0, clear_len = 0;
+		u64 clear_start = 0, clear_len = 0, extent_start = 0;
 
 		fi = NULL;
 		leaf = path->nodes[0];
@@ -560,7 +558,7 @@ search_again:
 			else
 				del_item = 0;
 		}
-		found_extent = 0;
+
 		/* FIXME, shrink the extent if the ref count is only 1 */
 		if (found_type != BTRFS_EXTENT_DATA_KEY)
 			goto delete;
@@ -598,7 +596,6 @@ search_again:
 				/* FIXME blocksize != 4096 */
 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
 				if (extent_start != 0) {
-					found_extent = 1;
 					if (test_bit(BTRFS_ROOT_SHAREABLE,
 						     &root->state))
 						inode_sub_bytes(&inode->vfs_inode,
@@ -677,7 +674,7 @@ delete:
 		}
 		should_throttle = false;
 
-		if (found_extent &&
+		if (del_item && extent_start != 0 &&
 		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 			struct btrfs_ref ref = { 0 };
 
-- 
cgit v1.2.3-59-g8ed1b


From d9ac19c3806434af2a92b4008dbf39c2a67c60cf Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:09 -0500
Subject: btrfs: add truncate control struct

I'm going to be adding more arguments and counters to
btrfs_truncate_inode_items, so add a control struct to handle all of the
extra arguments to make it easier to follow.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c |  7 +++++--
 fs/btrfs/inode-item.c       | 25 ++++++++-----------------
 fs/btrfs/inode-item.h       | 18 ++++++++++++++++--
 fs/btrfs/inode.c            | 18 ++++++++++++------
 fs/btrfs/tree-log.c         |  6 +++++-
 5 files changed, 46 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 28b9c63ba536..a05dd3d29695 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -291,6 +291,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 				    struct btrfs_block_group *block_group,
 				    struct inode *vfs_inode)
 {
+	struct btrfs_truncate_control control = {
+		.new_size = 0,
+		.min_type = BTRFS_EXTENT_DATA_KEY,
+	};
 	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
 	struct btrfs_root *root = inode->root;
 	struct extent_state *cached_state = NULL;
@@ -333,8 +337,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 	 * We skip the throttling logic for free space cache inodes, so we don't
 	 * need to check for -EAGAIN.
 	 */
-	ret = btrfs_truncate_inode_items(trans, root, inode, 0,
-					 BTRFS_EXTENT_DATA_KEY, NULL);
+	ret = btrfs_truncate_inode_items(trans, root, inode, &control);
 	unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
 	if (ret)
 		goto fail;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index b79578895d3b..0946a3912afd 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -425,16 +425,8 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
  * @trans:		A transaction handle.
  * @root:		The root from which to remove items.
  * @inode:		The inode whose items we want to remove.
- * @new_size:		The new i_size for the inode. This is only applicable when
- *			@min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
- * @min_type:		The minimum key type to remove. All keys with a type
- *			greater than this value are removed and all keys with
- *			this type are removed only if their offset is >= @new_size.
- * @extents_found:	Output parameter that will contain the number of file
- *			extent items that were removed or adjusted to the new
- *			inode i_size. The caller is responsible for initializing
- *			the counter. Also, it can be NULL if the caller does not
- *			need this counter.
+ * @control:		The btrfs_truncate_control to control how and what we
+ *			are truncating.
  *
  * Remove all keys associated with the inode from the given root that have a key
  * with a type greater than or equals to @min_type. When @min_type has a value of
@@ -448,8 +440,7 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct btrfs_inode *inode,
-			       u64 new_size, u32 min_type,
-			       u64 *extents_found)
+			       struct btrfs_truncate_control *control)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_path *path;
@@ -457,6 +448,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	struct btrfs_file_extent_item *fi;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
+	u64 new_size = control->new_size;
 	u64 extent_num_bytes = 0;
 	u64 extent_offset = 0;
 	u64 item_end = 0;
@@ -472,7 +464,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	bool be_nice = false;
 	bool should_throttle = false;
 
-	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
+	BUG_ON(new_size > 0 && control->min_type != BTRFS_EXTENT_DATA_KEY);
 
 	/*
 	 * For shareable roots we want to back off from time to time, this turns
@@ -525,7 +517,7 @@ search_again:
 		if (found_key.objectid != ino)
 			break;
 
-		if (found_type < min_type)
+		if (found_type < control->min_type)
 			break;
 
 		item_end = found_key.offset;
@@ -548,7 +540,7 @@ search_again:
 			}
 			item_end--;
 		}
-		if (found_type > min_type) {
+		if (found_type > control->min_type) {
 			del_item = 1;
 		} else {
 			if (item_end < new_size)
@@ -563,8 +555,7 @@ search_again:
 		if (found_type != BTRFS_EXTENT_DATA_KEY)
 			goto delete;
 
-		if (extents_found != NULL)
-			(*extents_found)++;
+		control->extents_found++;
 
 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index 4464d7026180..beefba1f26bc 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -19,10 +19,24 @@ struct extent_buffer;
  */
 #define BTRFS_NEED_TRUNCATE_BLOCK		1
 
+struct btrfs_truncate_control {
+	/* IN: the size we're truncating to. */
+	u64 new_size;
+
+	/* OUT: the number of extents truncated. */
+	u64 extents_found;
+
+	/*
+	 * IN: minimum key type to remove.  All key types with this type are
+	 * removed only if their offset >= new_size.
+	 */
+	u32 min_type;
+};
+
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       struct btrfs_inode *inode, u64 new_size,
-			       u32 min_type, u64 *extents_found);
+			       struct btrfs_inode *inode,
+			       struct btrfs_truncate_control *control);
 int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7aabf419e967..9eb57d90ff59 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5251,6 +5251,11 @@ void btrfs_evict_inode(struct inode *inode)
 	btrfs_i_size_write(BTRFS_I(inode), 0);
 
 	while (1) {
+		struct btrfs_truncate_control control = {
+			.new_size = 0,
+			.min_type = 0,
+		};
+
 		trans = evict_refill_and_join(root, rsv);
 		if (IS_ERR(trans))
 			goto free_rsv;
@@ -5258,7 +5263,7 @@ void btrfs_evict_inode(struct inode *inode)
 		trans->block_rsv = rsv;
 
 		ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
-						 0, 0, NULL);
+						 &control);
 		trans->block_rsv = &fs_info->trans_block_rsv;
 		btrfs_end_transaction(trans);
 		btrfs_btree_balance_dirty(fs_info);
@@ -8527,6 +8532,9 @@ out_noreserve:
 
 static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 {
+	struct btrfs_truncate_control control = {
+		.min_type = BTRFS_EXTENT_DATA_KEY,
+	};
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv;
@@ -8534,7 +8542,6 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 	struct btrfs_trans_handle *trans;
 	u64 mask = fs_info->sectorsize - 1;
 	u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
-	u64 extents_found = 0;
 
 	if (!skip_writeback) {
 		ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
@@ -8599,6 +8606,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 		const u64 new_size = inode->i_size;
 		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
 
+		control.new_size = new_size;
 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
 				 &cached_state);
 		/*
@@ -8611,9 +8619,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 					(u64)-1, 0);
 
 		ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
-						 inode->i_size,
-						 BTRFS_EXTENT_DATA_KEY,
-						 &extents_found);
+						 &control);
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
 				     (u64)-1, &cached_state);
 
@@ -8692,7 +8698,7 @@ out:
 	 * between the old i_size and the new i_size, and there were no prealloc
 	 * extents beyond i_size to drop.
 	 */
-	if (extents_found > 0)
+	if (control.extents_found > 0)
 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 
 	return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 64b42f4b11f2..c732a429a1c6 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4098,11 +4098,15 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans,
 				struct btrfs_inode *inode,
 				u64 new_size, u32 min_type)
 {
+	struct btrfs_truncate_control control = {
+		.new_size = new_size,
+		.min_type = min_type,
+	};
 	int ret;
 
 	do {
 		ret = btrfs_truncate_inode_items(trans, log_root, inode,
-						 new_size, min_type, NULL);
+						 &control);
 	} while (ret == -EAGAIN);
 
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From c2ddb612a8b320dde8641a74c35e107aa496d5f3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:10 -0500
Subject: btrfs: only update i_size in truncate paths that care

We currently update the i_size of the inode as we truncate it down, but
we skip this if we're calling btrfs_truncate_inode_items from the tree
log code.  We also don't care about this in the case of evict.  Instead
keep track of this value in the btrfs_truncate_control and then have
btrfs_truncate() and the free space cache truncate path both do the
i_size update themselves.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c |  3 +++
 fs/btrfs/inode-item.c       | 17 ++++++++---------
 fs/btrfs/inode-item.h       |  3 +++
 fs/btrfs/inode.c            |  3 +++
 4 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a05dd3d29695..fd469beb0985 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -338,6 +338,9 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 	 * need to check for -EAGAIN.
 	 */
 	ret = btrfs_truncate_inode_items(trans, root, inode, &control);
+
+	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
+
 	unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
 	if (ret)
 		goto fail;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 0946a3912afd..7bb9f557ee3d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -452,7 +452,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	u64 extent_num_bytes = 0;
 	u64 extent_offset = 0;
 	u64 item_end = 0;
-	u64 last_size = new_size;
 	u32 found_type = (u8)-1;
 	int del_item;
 	int pending_del_nr = 0;
@@ -466,6 +465,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 	BUG_ON(new_size > 0 && control->min_type != BTRFS_EXTENT_DATA_KEY);
 
+	control->last_size = new_size;
+
 	/*
 	 * For shareable roots we want to back off from time to time, this turns
 	 * out to be subvolume roots, reloc roots, and data reloc roots.
@@ -644,9 +645,9 @@ delete:
 		}
 
 		if (del_item)
-			last_size = found_key.offset;
+			control->last_size = found_key.offset;
 		else
-			last_size = new_size;
+			control->last_size = new_size;
 		if (del_item) {
 			if (!pending_del_nr) {
 				/* No pending yet, add ourselves */
@@ -739,12 +740,10 @@ out:
 			ret = err;
 		}
 	}
-	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-		ASSERT(last_size >= new_size);
-		if (!ret && last_size > new_size)
-			last_size = new_size;
-		btrfs_inode_safe_disk_i_size_write(inode, last_size);
-	}
+
+	ASSERT(control->last_size >= new_size);
+	if (!ret && control->last_size > new_size)
+		control->last_size = new_size;
 
 	btrfs_free_path(path);
 	return ret;
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index beefba1f26bc..883e150f011a 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -26,6 +26,9 @@ struct btrfs_truncate_control {
 	/* OUT: the number of extents truncated. */
 	u64 extents_found;
 
+	/* OUT: the last size we truncated this inode to. */
+	u64 last_size;
+
 	/*
 	 * IN: minimum key type to remove.  All key types with this type are
 	 * removed only if their offset >= new_size.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9eb57d90ff59..f653242e3c19 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8620,6 +8620,9 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 
 		ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
 						 &control);
+
+		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
+
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
 				     (u64)-1, &cached_state);
 
-- 
cgit v1.2.3-59-g8ed1b


From 462b728ea83fa85f1c0d2b79efb6187745444ce5 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:11 -0500
Subject: btrfs: only call inode_sub_bytes in truncate paths that care

We currently have a bunch of awkward checks to make sure we only update
the inode i_bytes if we're truncating the real inode.  Instead keep
track of the number of bytes we need to sub in the
btrfs_truncate_control, and then do the appropriate adjustment in the
truncate paths that care.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c |  1 +
 fs/btrfs/inode-item.c       | 19 ++++++-------------
 fs/btrfs/inode-item.h       |  3 +++
 fs/btrfs/inode.c            |  1 +
 4 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index fd469beb0985..d2f4716f8485 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -339,6 +339,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 	 */
 	ret = btrfs_truncate_inode_items(trans, root, inode, &control);
 
+	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
 	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
 
 	unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 7bb9f557ee3d..278d579f50bd 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -466,6 +466,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	BUG_ON(new_size > 0 && control->min_type != BTRFS_EXTENT_DATA_KEY);
 
 	control->last_size = new_size;
+	control->sub_bytes = 0;
 
 	/*
 	 * For shareable roots we want to back off from time to time, this turns
@@ -574,10 +575,8 @@ search_again:
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							 extent_num_bytes);
 				num_dec = (orig_num_bytes - extent_num_bytes);
-				if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
-				    extent_start != 0)
-					inode_sub_bytes(&inode->vfs_inode,
-							num_dec);
+				if (extent_start != 0)
+					control->sub_bytes += num_dec;
 				btrfs_mark_buffer_dirty(leaf);
 			} else {
 				extent_num_bytes =
@@ -587,12 +586,8 @@ search_again:
 
 				/* FIXME blocksize != 4096 */
 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
-				if (extent_start != 0) {
-					if (test_bit(BTRFS_ROOT_SHAREABLE,
-						     &root->state))
-						inode_sub_bytes(&inode->vfs_inode,
-								num_dec);
-				}
+				if (extent_start != 0)
+					control->sub_bytes += num_dec;
 			}
 			clear_len = num_dec;
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
@@ -625,9 +620,7 @@ search_again:
 				clear_len = fs_info->sectorsize;
 			}
 
-			if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
-				inode_sub_bytes(&inode->vfs_inode,
-						item_end + 1 - new_size);
+			control->sub_bytes += item_end + 1 - new_size;
 		}
 delete:
 		/*
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index 883e150f011a..7b5b455262cb 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -29,6 +29,9 @@ struct btrfs_truncate_control {
 	/* OUT: the last size we truncated this inode to. */
 	u64 last_size;
 
+	/* OUT: the number of bytes to sub from this inode. */
+	u64 sub_bytes;
+
 	/*
 	 * IN: minimum key type to remove.  All key types with this type are
 	 * removed only if their offset >= new_size.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f653242e3c19..38f974b16fc0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8621,6 +8621,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 		ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
 						 &control);
 
+		inode_sub_bytes(inode, control.sub_bytes);
 		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
 
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
-- 
cgit v1.2.3-59-g8ed1b


From 5caa490ed8f07488e47378999bd4ad451bf8858b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:12 -0500
Subject: btrfs: control extent reference updates with a control flag for
 truncate

We've had weird bugs in the past where we forgot to adjust the truncate
path to deal with the fact that we can be called by the tree log path.
Instead of checking if our root is a LOG_ROOT use a flag on the
btrfs_truncate_control to indicate that we don't want to do extent
reference updates during this truncate.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode-item.c | 3 +--
 fs/btrfs/inode-item.h | 6 ++++++
 fs/btrfs/tree-log.c   | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 278d579f50bd..4c753415ab06 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -659,8 +659,7 @@ delete:
 		}
 		should_throttle = false;
 
-		if (del_item && extent_start != 0 &&
-		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+		if (del_item && extent_start != 0 && !control->skip_ref_updates) {
 			struct btrfs_ref ref = { 0 };
 
 			bytes_deleted += extent_num_bytes;
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index 7b5b455262cb..1f31bb407f4a 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -37,6 +37,12 @@ struct btrfs_truncate_control {
 	 * removed only if their offset >= new_size.
 	 */
 	u32 min_type;
+
+	/*
+	 * IN: true if we don't want to do extent reference updates for any file
+	 * extents we drop.
+	 */
+	bool skip_ref_updates;
 };
 
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c732a429a1c6..2b5e24c24a6b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4101,6 +4101,7 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans,
 	struct btrfs_truncate_control control = {
 		.new_size = new_size,
 		.min_type = min_type,
+		.skip_ref_updates = true,
 	};
 	int ret;
 
-- 
cgit v1.2.3-59-g8ed1b


From 655807b8957ba84a583104c422a8f53725997d55 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:13 -0500
Subject: btrfs: use a flag to control when to clear the file extent range

We only care about updating the file extent range when we are doing a
normal truncation.  We skip this for tree logging currently, but we can
also skip this for eviction as well.  Using a flag makes it more
explicit when we want to do this work.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 1 +
 fs/btrfs/inode-item.c       | 8 ++++----
 fs/btrfs/inode-item.h       | 6 ++++++
 fs/btrfs/inode.c            | 1 +
 4 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d2f4716f8485..3a6bf361409b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -294,6 +294,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 	struct btrfs_truncate_control control = {
 		.new_size = 0,
 		.min_type = BTRFS_EXTENT_DATA_KEY,
+		.clear_extent_range = true,
 	};
 	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
 	struct btrfs_root *root = inode->root;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 4c753415ab06..5598090bd0aa 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -624,11 +624,11 @@ search_again:
 		}
 delete:
 		/*
-		 * We use btrfs_truncate_inode_items() to clean up log trees for
-		 * multiple fsyncs, and in this case we don't want to clear the
-		 * file extent range because it's just the log.
+		 * We only want to clear the file extent range if we're
+		 * modifying the actual inode's mapping, which is just the
+		 * normal truncate path.
 		 */
-		if (root == inode->root) {
+		if (control->clear_extent_range) {
 			ret = btrfs_inode_clear_file_extent_range(inode,
 						  clear_start, clear_len);
 			if (ret) {
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index 1f31bb407f4a..5817ba4ddd23 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -43,6 +43,12 @@ struct btrfs_truncate_control {
 	 * extents we drop.
 	 */
 	bool skip_ref_updates;
+
+	/*
+	 * IN: true if we need to clear the file extent range for the inode as
+	 * we drop the file extent items.
+	 */
+	bool clear_extent_range;
 };
 
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 38f974b16fc0..1a3f40be4ed8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8534,6 +8534,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 {
 	struct btrfs_truncate_control control = {
 		.min_type = BTRFS_EXTENT_DATA_KEY,
+		.clear_extent_range = true,
 	};
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-- 
cgit v1.2.3-59-g8ed1b


From 487e81d2a4009d17dcfe7c67b78b75cd96bcdde3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:14 -0500
Subject: btrfs: pass the ino via truncate control

In the future we are going to want to truncate inode items without
needing to have a btrfs_inode to pass in, so add ino to the
btrfs_truncate_control and use that to look up the inode items to
truncate.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c | 1 +
 fs/btrfs/inode-item.c       | 7 +++----
 fs/btrfs/inode-item.h       | 3 +++
 fs/btrfs/inode.c            | 2 ++
 fs/btrfs/tree-log.c         | 1 +
 5 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 3a6bf361409b..c2a34179bddc 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -293,6 +293,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_truncate_control control = {
 		.new_size = 0,
+		.ino = btrfs_ino(BTRFS_I(vfs_inode)),
 		.min_type = BTRFS_EXTENT_DATA_KEY,
 		.clear_extent_range = true,
 	};
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 5598090bd0aa..a24ed49daa4e 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -458,7 +458,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	int pending_del_slot = 0;
 	int extent_type = -1;
 	int ret;
-	u64 ino = btrfs_ino(inode);
 	u64 bytes_deleted = 0;
 	bool be_nice = false;
 	bool should_throttle = false;
@@ -480,7 +479,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	path->reada = READA_BACK;
 
-	key.objectid = ino;
+	key.objectid = control->ino;
 	key.offset = (u64)-1;
 	key.type = (u8)-1;
 
@@ -516,7 +515,7 @@ search_again:
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		found_type = found_key.type;
 
-		if (found_key.objectid != ino)
+		if (found_key.objectid != control->ino)
 			break;
 
 		if (found_type < control->min_type)
@@ -667,7 +666,7 @@ delete:
 			btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
 					extent_start, extent_num_bytes, 0);
 			btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
-					ino, extent_offset,
+					control->ino, extent_offset,
 					root->root_key.objectid, false);
 			ret = btrfs_free_extent(trans, &ref);
 			if (ret) {
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index 5817ba4ddd23..b0dc14efde30 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -32,6 +32,9 @@ struct btrfs_truncate_control {
 	/* OUT: the number of bytes to sub from this inode. */
 	u64 sub_bytes;
 
+	/* IN: the ino we are truncating. */
+	u64 ino;
+
 	/*
 	 * IN: minimum key type to remove.  All key types with this type are
 	 * removed only if their offset >= new_size.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1a3f40be4ed8..ea1223c53a10 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5252,6 +5252,7 @@ void btrfs_evict_inode(struct inode *inode)
 
 	while (1) {
 		struct btrfs_truncate_control control = {
+			.ino = btrfs_ino(BTRFS_I(inode)),
 			.new_size = 0,
 			.min_type = 0,
 		};
@@ -8533,6 +8534,7 @@ out_noreserve:
 static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 {
 	struct btrfs_truncate_control control = {
+		.ino = btrfs_ino(BTRFS_I(inode)),
 		.min_type = BTRFS_EXTENT_DATA_KEY,
 		.clear_extent_range = true,
 	};
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2b5e24c24a6b..8c099cd40956 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4100,6 +4100,7 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_truncate_control control = {
 		.new_size = new_size,
+		.ino = btrfs_ino(inode),
 		.min_type = min_type,
 		.skip_ref_updates = true,
 	};
-- 
cgit v1.2.3-59-g8ed1b


From 71d18b53540f106a394cb35ed93b487b76678b06 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:15 -0500
Subject: btrfs: add inode to truncate control

In the future we're going to want to use btrfs_truncate_inode_items
without looking up the associated inode.  In order to accommodate this
add the inode to btrfs_truncate_control and handle the case where
control->inode is NULL appropriately.  This is fairly straightforward,
we simply need to add a helper for the trace points, as the file extent
map update is controlled by a flag on btrfs_truncate_control.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/free-space-cache.c |  3 ++-
 fs/btrfs/inode-item.c       | 32 +++++++++++++++++++++-----------
 fs/btrfs/inode-item.h       |  7 ++++++-
 fs/btrfs/inode.c            |  8 ++++----
 fs/btrfs/tree-log.c         |  3 +--
 5 files changed, 34 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index c2a34179bddc..01a408db5683 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -292,6 +292,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 				    struct inode *vfs_inode)
 {
 	struct btrfs_truncate_control control = {
+		.inode = BTRFS_I(vfs_inode),
 		.new_size = 0,
 		.ino = btrfs_ino(BTRFS_I(vfs_inode)),
 		.min_type = BTRFS_EXTENT_DATA_KEY,
@@ -339,7 +340,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 	 * We skip the throttling logic for free space cache inodes, so we don't
 	 * need to check for -EAGAIN.
 	 */
-	ret = btrfs_truncate_inode_items(trans, root, inode, &control);
+	ret = btrfs_truncate_inode_items(trans, root, &control);
 
 	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
 	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a24ed49daa4e..44db08e46a49 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -419,6 +419,20 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret;
 }
 
+static inline void btrfs_trace_truncate(struct btrfs_inode *inode,
+					struct extent_buffer *leaf,
+					struct btrfs_file_extent_item *fi,
+					u64 offset, int extent_type, int slot)
+{
+	if (!inode)
+		return;
+	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+		trace_btrfs_truncate_show_fi_inline(inode, leaf, fi, slot,
+						    offset);
+	else
+		trace_btrfs_truncate_show_fi_regular(inode, leaf, fi, offset);
+}
+
 /*
  * Remove inode items from a given root.
  *
@@ -439,7 +453,6 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
  */
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       struct btrfs_inode *inode,
 			       struct btrfs_truncate_control *control)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -462,6 +475,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	bool be_nice = false;
 	bool should_throttle = false;
 
+	ASSERT(control->inode || !control->clear_extent_range);
 	BUG_ON(new_size > 0 && control->min_type != BTRFS_EXTENT_DATA_KEY);
 
 	control->last_size = new_size;
@@ -526,19 +540,15 @@ search_again:
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
 			extent_type = btrfs_file_extent_type(leaf, fi);
-			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+			if (extent_type != BTRFS_FILE_EXTENT_INLINE)
 				item_end +=
 				    btrfs_file_extent_num_bytes(leaf, fi);
-
-				trace_btrfs_truncate_show_fi_regular(
-					inode, leaf, fi, found_key.offset);
-			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
 				item_end += btrfs_file_extent_ram_bytes(leaf, fi);
 
-				trace_btrfs_truncate_show_fi_inline(
-					inode, leaf, fi, path->slots[0],
-					found_key.offset);
-			}
+			btrfs_trace_truncate(control->inode, leaf, fi,
+					     found_key.offset, extent_type,
+					     path->slots[0]);
 			item_end--;
 		}
 		if (found_type > control->min_type) {
@@ -628,7 +638,7 @@ delete:
 		 * normal truncate path.
 		 */
 		if (control->clear_extent_range) {
-			ret = btrfs_inode_clear_file_extent_range(inode,
+			ret = btrfs_inode_clear_file_extent_range(control->inode,
 						  clear_start, clear_len);
 			if (ret) {
 				btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index b0dc14efde30..a8fc16d0147f 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -20,6 +20,12 @@ struct extent_buffer;
 #define BTRFS_NEED_TRUNCATE_BLOCK		1
 
 struct btrfs_truncate_control {
+	/*
+	 * IN: the inode we're operating on, this can be NULL if
+	 * ->clear_extent_range is false.
+	 */
+	struct btrfs_inode *inode;
+
 	/* IN: the size we're truncating to. */
 	u64 new_size;
 
@@ -56,7 +62,6 @@ struct btrfs_truncate_control {
 
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       struct btrfs_inode *inode,
 			       struct btrfs_truncate_control *control);
 int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ea1223c53a10..a88130c7782e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5252,6 +5252,7 @@ void btrfs_evict_inode(struct inode *inode)
 
 	while (1) {
 		struct btrfs_truncate_control control = {
+			.inode = BTRFS_I(inode),
 			.ino = btrfs_ino(BTRFS_I(inode)),
 			.new_size = 0,
 			.min_type = 0,
@@ -5263,8 +5264,7 @@ void btrfs_evict_inode(struct inode *inode)
 
 		trans->block_rsv = rsv;
 
-		ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
-						 &control);
+		ret = btrfs_truncate_inode_items(trans, root, &control);
 		trans->block_rsv = &fs_info->trans_block_rsv;
 		btrfs_end_transaction(trans);
 		btrfs_btree_balance_dirty(fs_info);
@@ -8534,6 +8534,7 @@ out_noreserve:
 static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 {
 	struct btrfs_truncate_control control = {
+		.inode = BTRFS_I(inode),
 		.ino = btrfs_ino(BTRFS_I(inode)),
 		.min_type = BTRFS_EXTENT_DATA_KEY,
 		.clear_extent_range = true,
@@ -8621,8 +8622,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 					ALIGN(new_size, fs_info->sectorsize),
 					(u64)-1, 0);
 
-		ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
-						 &control);
+		ret = btrfs_truncate_inode_items(trans, root, &control);
 
 		inode_sub_bytes(inode, control.sub_bytes);
 		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8c099cd40956..2c87626561aa 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4107,8 +4107,7 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans,
 	int ret;
 
 	do {
-		ret = btrfs_truncate_inode_items(trans, log_root, inode,
-						 &control);
+		ret = btrfs_truncate_inode_items(trans, log_root, &control);
 	} while (ret == -EAGAIN);
 
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 56e1edb0e3334db479b8400abc5a9c03602e5ae8 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:16 -0500
Subject: btrfs: convert BUG_ON() in btrfs_truncate_inode_items to ASSERT

We have a correctness BUG_ON() in btrfs_truncate_inode_items to make
sure that we're always using min_type == BTRFS_EXTENT_DATA_KEY if
new_size is > 0.  Convert this to an ASSERT.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode-item.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 44db08e46a49..9356edb75ab4 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -476,7 +476,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	bool should_throttle = false;
 
 	ASSERT(control->inode || !control->clear_extent_range);
-	BUG_ON(new_size > 0 && control->min_type != BTRFS_EXTENT_DATA_KEY);
+	ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY);
 
 	control->last_size = new_size;
 	control->sub_bytes = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 376b91d5702f8db14beb55148039684f808e0693 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:17 -0500
Subject: btrfs: convert BUG() for pending_del_nr into an ASSERT

This is a logic correctness check, convert it into an ASSERT() instead
of a BUG().

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode-item.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 9356edb75ab4..f7b02abb681b 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -651,6 +651,9 @@ delete:
 		else
 			control->last_size = new_size;
 		if (del_item) {
+			ASSERT(!pending_del_nr ||
+			       ((path->slots[0] + 1) == pending_del_slot));
+
 			if (!pending_del_nr) {
 				/* No pending yet, add ourselves */
 				pending_del_slot = path->slots[0];
@@ -660,8 +663,6 @@ delete:
 				/* Hop on the pending chunk */
 				pending_del_nr++;
 				pending_del_slot = path->slots[0];
-			} else {
-				BUG();
 			}
 		} else {
 			break;
-- 
cgit v1.2.3-59-g8ed1b


From 0adbc6190c34b2e39f67b086cefc260e30045f48 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:18 -0500
Subject: btrfs: combine extra if statements in btrfs_truncate_inode_items

We have

    if (del_item)
	    // do something
    else
	    // something else
    if (del_item)
	    // do yet another thing
    else
	    // something else entirely

back to back in btrfs_truncate_inode_items, collapse these two sets of
if statements into one.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode-item.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index f7b02abb681b..ef1a9efe08c0 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -646,14 +646,11 @@ delete:
 			}
 		}
 
-		if (del_item)
-			control->last_size = found_key.offset;
-		else
-			control->last_size = new_size;
 		if (del_item) {
 			ASSERT(!pending_del_nr ||
 			       ((path->slots[0] + 1) == pending_del_slot));
 
+			control->last_size = found_key.offset;
 			if (!pending_del_nr) {
 				/* No pending yet, add ourselves */
 				pending_del_slot = path->slots[0];
@@ -665,6 +662,7 @@ delete:
 				pending_del_slot = path->slots[0];
 			}
 		} else {
+			control->last_size = new_size;
 			break;
 		}
 		should_throttle = false;
-- 
cgit v1.2.3-59-g8ed1b


From e48dac7f6f4c0410aa38dbd59b991f2b24b286ae Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:19 -0500
Subject: btrfs: make should_throttle loop local in btrfs_truncate_inode_items

We reset this bool on every iteration of the truncate loop, so make this
variable local to the loop.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode-item.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index ef1a9efe08c0..0eeb5ea87894 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -473,7 +473,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	int ret;
 	u64 bytes_deleted = 0;
 	bool be_nice = false;
-	bool should_throttle = false;
 
 	ASSERT(control->inode || !control->clear_extent_range);
 	ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY);
@@ -523,6 +522,7 @@ search_again:
 
 	while (1) {
 		u64 clear_start = 0, clear_len = 0, extent_start = 0;
+		bool should_throttle = false;
 
 		fi = NULL;
 		leaf = path->nodes[0];
@@ -665,7 +665,6 @@ delete:
 			control->last_size = new_size;
 			break;
 		}
-		should_throttle = false;
 
 		if (del_item && extent_start != 0 && !control->skip_ref_updates) {
 			struct btrfs_ref ref = { 0 };
-- 
cgit v1.2.3-59-g8ed1b


From 8697b8f88e2a750685b73c16c2a1bd438795ae90 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 Dec 2021 17:18:20 -0500
Subject: btrfs: do not check -EAGAIN when truncating inodes in the log root

We only throttle the btrfs_truncate_inode_items if the root is
SHAREABLE, which isn't set on the log root, which means this loop is
unnecessary.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2c87626561aa..c1ddbe800897 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4104,13 +4104,8 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans,
 		.min_type = min_type,
 		.skip_ref_updates = true,
 	};
-	int ret;
-
-	do {
-		ret = btrfs_truncate_inode_items(trans, log_root, &control);
-	} while (ret == -EAGAIN);
 
-	return ret;
+	return btrfs_truncate_inode_items(trans, log_root, &control);
 }
 
 static void fill_inode_item(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3-59-g8ed1b


From 0bb3acdc48243c0db9fd93fc904ce81bdb4049c9 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 6 Dec 2021 13:52:57 +0800
Subject: btrfs: update SCRUB_MAX_PAGES_PER_BLOCK

Use BTRFS_MAX_METADATA_BLOCKSIZE and SZ_4K (minimal sectorsize) to
calculate this value.

Also remove one stale comment on the value: in fact, with recent subpage
support, BTRFS_MAX_METADATA_BLOCKSIZE * PAGE_SIZE is already beyond
BTRFS_STRIPE_LEN, we just don't use the full page.

Also, since we're here, convert the BUG_ON()s related to
SCRUB_MAX_PAGES_PER_BLOCK to ASSERT()s.

Those ASSERT()s are really only for developers to catch obvious bugs
early, not to make end users suffer.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 15a123e67108..0870d8db92cd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -49,11 +49,10 @@ struct scrub_ctx;
 #define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
 
 /*
- * the following value times PAGE_SIZE needs to be large enough to match the
+ * The following value times PAGE_SIZE needs to be large enough to match the
  * largest node/leaf/sector size that shall be supported.
- * Values larger than BTRFS_STRIPE_LEN are not supported.
  */
-#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
+#define SCRUB_MAX_PAGES_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
 
 struct scrub_recover {
 	refcount_t		refs;
@@ -1313,7 +1312,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 		recover->bioc = bioc;
 		recover->map_length = mapped_length;
 
-		BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
+		ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK);
 
 		nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
 
@@ -2297,7 +2296,7 @@ leave_nomem:
 			scrub_block_put(sblock);
 			return -ENOMEM;
 		}
-		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+		ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
 		scrub_page_get(spage);
 		sblock->pagev[index] = spage;
 		spage->sblock = sblock;
@@ -2631,7 +2630,7 @@ leave_nomem:
 			scrub_block_put(sblock);
 			return -ENOMEM;
 		}
-		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+		ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
 		/* For scrub block */
 		scrub_page_get(spage);
 		sblock->pagev[index] = spage;
-- 
cgit v1.2.3-59-g8ed1b


From c9d328c0c4b0dc2e27c219d041855e74bb073ab0 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 6 Dec 2021 13:52:58 +0800
Subject: btrfs: scrub: merge SCRUB_PAGES_PER_RD_BIO and SCRUB_PAGES_PER_WR_BIO

These two values were introduced in commit ff023aac3119 ("Btrfs: add code
to scrub to copy read data to another disk") as an optimization.

But the truth is, the block layer scheduler can do whatever it wants to
merge/split bios to improve performance.

Doing such an "optimization" is not really going to affect much,
especially considering how well the current block layer optimizations
work. Remove this old and immature optimization from our code.

Since we're here, also change BUG_ON()s using these two macros to use
ASSERT()s.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 0870d8db92cd..4afa5ae17989 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -39,14 +39,14 @@ struct scrub_block;
 struct scrub_ctx;
 
 /*
- * the following three values only influence the performance.
+ * The following two values only influence the performance.
+ *
  * The last one configures the number of parallel and outstanding I/O
- * operations. The first two values configure an upper limit for the number
+ * operations. The first one configures an upper limit for the number
  * of (dynamically allocated) pages that are added to a bio.
  */
-#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
-#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
-#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
+#define SCRUB_PAGES_PER_BIO	32	/* 128KiB per bio for x86 */
+#define SCRUB_BIOS_PER_SCTX	64	/* 8MiB per device in flight for x86 */
 
 /*
  * The following value times PAGE_SIZE needs to be large enough to match the
@@ -87,11 +87,7 @@ struct scrub_bio {
 	blk_status_t		status;
 	u64			logical;
 	u64			physical;
-#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
-	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
-#else
-	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
-#endif
+	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO];
 	int			page_count;
 	int			next_free;
 	struct btrfs_work	work;
@@ -162,7 +158,7 @@ struct scrub_ctx {
 	struct list_head	csum_list;
 	atomic_t		cancel_req;
 	int			readonly;
-	int			pages_per_rd_bio;
+	int			pages_per_bio;
 
 	/* State of IO submission throttling affecting the associated device */
 	ktime_t			throttle_deadline;
@@ -173,7 +169,6 @@ struct scrub_ctx {
 
 	struct scrub_bio        *wr_curr_bio;
 	struct mutex            wr_lock;
-	int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
 	struct btrfs_device     *wr_tgtdev;
 	bool                    flush_all_writes;
 
@@ -577,7 +572,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 		goto nomem;
 	refcount_set(&sctx->refs, 1);
 	sctx->is_dev_replace = is_dev_replace;
-	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+	sctx->pages_per_bio = SCRUB_PAGES_PER_BIO;
 	sctx->curr = -1;
 	sctx->fs_info = fs_info;
 	INIT_LIST_HEAD(&sctx->csum_list);
@@ -615,7 +610,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 	sctx->wr_curr_bio = NULL;
 	if (is_dev_replace) {
 		WARN_ON(!fs_info->dev_replace.tgtdev);
-		sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
 		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
 		sctx->flush_all_writes = false;
 	}
@@ -1674,7 +1668,7 @@ again:
 		sbio->dev = sctx->wr_tgtdev;
 		bio = sbio->bio;
 		if (!bio) {
-			bio = btrfs_bio_alloc(sctx->pages_per_wr_bio);
+			bio = btrfs_bio_alloc(sctx->pages_per_bio);
 			sbio->bio = bio;
 		}
 
@@ -1707,7 +1701,7 @@ again:
 	sbio->pagev[sbio->page_count] = spage;
 	scrub_page_get(spage);
 	sbio->page_count++;
-	if (sbio->page_count == sctx->pages_per_wr_bio)
+	if (sbio->page_count == sctx->pages_per_bio)
 		scrub_wr_submit(sctx);
 	mutex_unlock(&sctx->wr_lock);
 
@@ -1754,7 +1748,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
 	struct scrub_ctx *sctx = sbio->sctx;
 	int i;
 
-	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+	ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
 	if (sbio->status) {
 		struct btrfs_dev_replace *dev_replace =
 			&sbio->sctx->fs_info->dev_replace;
@@ -2100,7 +2094,7 @@ again:
 		sbio->dev = spage->dev;
 		bio = sbio->bio;
 		if (!bio) {
-			bio = btrfs_bio_alloc(sctx->pages_per_rd_bio);
+			bio = btrfs_bio_alloc(sctx->pages_per_bio);
 			sbio->bio = bio;
 		}
 
@@ -2134,7 +2128,7 @@ again:
 	scrub_block_get(sblock); /* one for the page added to the bio */
 	atomic_inc(&sblock->outstanding_pages);
 	sbio->page_count++;
-	if (sbio->page_count == sctx->pages_per_rd_bio)
+	if (sbio->page_count == sctx->pages_per_bio)
 		scrub_submit(sctx);
 
 	return 0;
@@ -2368,7 +2362,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 	struct scrub_ctx *sctx = sbio->sctx;
 	int i;
 
-	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
+	ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
 	if (sbio->status) {
 		for (i = 0; i < sbio->page_count; i++) {
 			struct scrub_page *spage = sbio->pagev[i];
-- 
cgit v1.2.3-59-g8ed1b


From 9506f9538206acb9a3c24d1ffbd587733b185732 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 2 Dec 2021 15:34:31 -0500
Subject: btrfs: include the free space tree in the global rsv minimum
 calculation

Filipe reported a problem where generic/619 was failing with an ENOSPC
abort while running delayed refs, like the following

  BTRFS: Transaction aborted (error -28)
  WARNING: CPU: 3 PID: 522920 at fs/btrfs/free-space-tree.c:1049 add_to_free_space_tree+0xe5/0x110 [btrfs]
  CPU: 3 PID: 522920 Comm: kworker/u16:19 Tainted: G        W         5.16.0-rc2-btrfs-next-106 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
  Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs]
  RIP: 0010:add_to_free_space_tree+0xe5/0x110 [btrfs]
  RSP: 0000:ffffa65087fb7b20 EFLAGS: 00010282
  RAX: 0000000000000000 RBX: 0000000000001000 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: ffffffff9131eeaa RDI: 00000000ffffffff
  RBP: ffff8d62e26481b8 R08: ffffffff9ad97ce0 R09: 0000000000000001
  R10: 0000000000000000 R11: 0000000000000001 R12: 00000000ffffffe4
  R13: ffff8d61c25fe688 R14: ffff8d61ebd88800 R15: ffff8d61ebd88a90
  FS:  0000000000000000(0000) GS:ffff8d64ed400000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007fa46a8b1000 CR3: 0000000148d18003 CR4: 0000000000370ee0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   <TASK>
   __btrfs_free_extent+0x516/0x950 [btrfs]
   __btrfs_run_delayed_refs+0x2b1/0x1250 [btrfs]
   btrfs_run_delayed_refs+0x86/0x210 [btrfs]
   flush_space+0x403/0x630 [btrfs]
   ? call_rcu_tasks_generic+0x50/0x80
   ? lock_release+0x223/0x4a0
   ? btrfs_get_alloc_profile+0xb5/0x290 [btrfs]
   ? do_raw_spin_unlock+0x4b/0xa0
   btrfs_async_reclaim_metadata_space+0x139/0x320 [btrfs]
   process_one_work+0x24c/0x5b0
   worker_thread+0x55/0x3c0
   ? process_one_work+0x5b0/0x5b0
   kthread+0x17c/0x1a0
   ? set_kthread_struct+0x40/0x40
   ret_from_fork+0x22/0x30

There's a couple of reasons for this, but in generic/619's case the
largest reason is that it is a very small file system, and we do not
reserve enough space for the global reserve.

With the free space tree enabled we now have an additional tree that we
need to modify when running delayed refs.  This means we need the global
reserve to take this into account when it calculates the minimum size it
needs to be.  This is especially important for very small file systems.

Fix this by adjusting the minimum global block rsv size math to include
the size of the free space tree when calculating the size.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-rsv.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 21ac60ec19f6..b3086f252ad0 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -352,25 +352,29 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
 	struct btrfs_space_info *sinfo = block_rsv->space_info;
-	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, 0);
-	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, 0);
-	u64 num_bytes;
-	unsigned min_items;
+	struct btrfs_root *root, *tmp;
+	u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item);
+	unsigned int min_items = 1;
 
 	/*
 	 * The global block rsv is based on the size of the extent tree, the
 	 * checksum tree and the root tree.  If the fs is empty we want to set
 	 * it to a minimal amount for safety.
+	 *
+	 * We also are going to need to modify the minimum of the tree root and
+	 * any global roots we could touch.
 	 */
-	num_bytes = btrfs_root_used(&extent_root->root_item) +
-		btrfs_root_used(&csum_root->root_item) +
-		btrfs_root_used(&fs_info->tree_root->root_item);
-
-	/*
-	 * We at a minimum are going to modify the csum root, the tree root, and
-	 * the extent root.
-	 */
-	min_items = 3;
+	read_lock(&fs_info->global_root_lock);
+	rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
+					     rb_node) {
+		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+		    root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
+		    root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+			num_bytes += btrfs_root_used(&root->root_item);
+			min_items++;
+		}
+	}
+	read_unlock(&fs_info->global_root_lock);
 
 	/*
 	 * But we also want to reserve enough space so we can do the fallback
-- 
cgit v1.2.3-59-g8ed1b


From c18e3235646a8ba74d013067a6475c8d262d3776 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 2 Dec 2021 15:34:32 -0500
Subject: btrfs: reserve extra space for the free space tree

Filipe reported a problem where sometimes he'd get an ENOSPC abort when
running delayed refs with generic/619 and the free space tree enabled.
This is partly because we do not reserve space for modifying the free
space tree, nor do we have a block rsv associated with that tree.

The delayed_refs_rsv tracks the amount of space required to run delayed
refs.  This means 1 modification means 1 change to the extent root.
With the free space tree this turns into 2 changes, because modifying 1
extent means updating the extent tree and potentially updating the free
space tree to either remove that entry or add the free space.  Thus if
we have the FST enabled, simply double the reservation size for our
modification.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-rsv.c   |  1 +
 fs/btrfs/delayed-ref.c | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index b3086f252ad0..b3ee49b0b1e8 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -426,6 +426,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
 	switch (root->root_key.objectid) {
 	case BTRFS_CSUM_TREE_OBJECTID:
 	case BTRFS_EXTENT_TREE_OBJECTID:
+	case BTRFS_FREE_SPACE_TREE_OBJECTID:
 		root->block_rsv = &fs_info->delayed_refs_rsv;
 		break;
 	case BTRFS_ROOT_TREE_OBJECTID:
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index da9d20813147..4176df149d04 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -84,6 +84,17 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
 	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
 	u64 released = 0;
 
+	/*
+	 * We have to check the mount option here because we could be enabling
+	 * the free space tree for the first time and don't have the compat_ro
+	 * option set yet.
+	 *
+	 * We need extra reservations if we have the free space tree because
+	 * we'll have to modify that tree as well.
+	 */
+	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+		num_bytes *= 2;
+
 	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
 	if (released)
 		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
@@ -108,6 +119,17 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
 
 	num_bytes = btrfs_calc_insert_metadata_size(fs_info,
 						    trans->delayed_ref_updates);
+	/*
+	 * We have to check the mount option here because we could be enabling
+	 * the free space tree for the first time and don't have the compat_ro
+	 * option set yet.
+	 *
+	 * We need extra reservations if we have the free space tree because
+	 * we'll have to modify that tree as well.
+	 */
+	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+		num_bytes *= 2;
+
 	spin_lock(&delayed_rsv->lock);
 	delayed_rsv->size += num_bytes;
 	delayed_rsv->full = 0;
-- 
cgit v1.2.3-59-g8ed1b


From a26d60dedf9af81df7aeaef66353eb391f85e7a9 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 21 Oct 2021 23:31:17 +0800
Subject: btrfs: sysfs: add devinfo/fsid to retrieve actual fsid from the
 device

In the case of the seed device, the fsid can be different from the mounted
sprout fsid.  Userland has to read the device superblock to know the
fsid, but that approach fails if the device is missing. So add a sysfs
interface devinfo/<devid>/fsid to show the fsid of the device.

For example:
  $ cd /sys/fs/btrfs/b10b02a5-f9de-4276-b9e8-2bfd09a578a8

  $ cat devinfo/1/fsid
  c44d771f-639d-4df3-99ec-5bc7ad2af93b
  $ cat  devinfo/3/fsid
  b10b02a5-f9de-4276-b9e8-2bfd09a578a8

Though it's related to seeding, the name of the sysfs file is plain fsid as it
matches what blkid says.  A path to the device's fsid will aid scripting.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/sysfs.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index f9eff3b0f77c..beb7f72d50b8 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1537,6 +1537,16 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
 }
 BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
 
+static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj,
+				       struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+						   devid_kobj);
+
+	return sysfs_emit(buf, "%pU\n", device->fs_devices->fsid);
+}
+BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show);
+
 static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
 		struct kobj_attribute *a, char *buf)
 {
@@ -1572,6 +1582,7 @@ BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
  */
 static struct attribute *devid_attrs[] = {
 	BTRFS_ATTR_PTR(devid, error_stats),
+	BTRFS_ATTR_PTR(devid, fsid),
 	BTRFS_ATTR_PTR(devid, in_fs_metadata),
 	BTRFS_ATTR_PTR(devid, missing),
 	BTRFS_ATTR_PTR(devid, replace_target),
-- 
cgit v1.2.3-59-g8ed1b


From 869f4cdc73f9378986755030c684c011f0b71517 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Tue, 7 Dec 2021 06:28:34 -0800
Subject: btrfs: zoned: encapsulate inode locking for zoned relocation

Encapsulate the inode lock needed for serializing the data relocation
writes on a zoned filesystem into a helper.

This streamlines the code reading flow and hides special casing for
zoned filesystems.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c |  8 ++------
 fs/btrfs/zoned.h     | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dee4f043c7d8..e367b4c82cec 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5184,8 +5184,6 @@ int extent_writepages(struct address_space *mapping,
 		      struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
-	const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root);
-	const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info);
 	int ret = 0;
 	struct extent_page_data epd = {
 		.bio_ctrl = { 0 },
@@ -5197,11 +5195,9 @@ int extent_writepages(struct address_space *mapping,
 	 * Allow only a single thread to do the reloc work in zoned mode to
 	 * protect the write pointer updates.
 	 */
-	if (data_reloc && zoned)
-		btrfs_inode_lock(inode, 0);
+	btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
 	ret = extent_write_cache_pages(mapping, wbc, &epd);
-	if (data_reloc && zoned)
-		btrfs_inode_unlock(inode, 0);
+	btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
 	ASSERT(ret <= 0);
 	if (ret < 0) {
 		end_write_bio(&epd, ret);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 4344f4818389..a7b4cd6dd9f4 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -8,6 +8,7 @@
 #include "volumes.h"
 #include "disk-io.h"
 #include "block-group.h"
+#include "btrfs_inode.h"
 
 /*
  * Block groups with more than this value (percents) of unusable space will be
@@ -354,4 +355,20 @@ static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg)
 	spin_unlock(&fs_info->treelog_bg_lock);
 }
 
+static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
+{
+	struct btrfs_root *root = inode->root;
+
+	if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
+		btrfs_inode_lock(&inode->vfs_inode, 0);
+}
+
+static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
+{
+	struct btrfs_root *root = inode->root;
+
+	if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
+		btrfs_inode_unlock(&inode->vfs_inode, 0);
+}
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 8fdf54fe69a7a0f11542c2dd322b590a5b935918 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Tue, 7 Dec 2021 06:28:35 -0800
Subject: btrfs: zoned: simplify btrfs_check_meta_write_pointer

btrfs_check_meta_write_pointer() will always be called with a NULL
'cache_ret' argument.

As there's no need to check if we have a valid block_group passed in,
remove these checks.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/zoned.c | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 5ef430f02fd2..767ff6cffeae 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1637,29 +1637,19 @@ bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
 	if (!btrfs_is_zoned(fs_info))
 		return true;
 
-	cache = *cache_ret;
+	cache = btrfs_lookup_block_group(fs_info, eb->start);
+	if (!cache)
+		return true;
 
-	if (cache && (eb->start < cache->start ||
-		      cache->start + cache->length <= eb->start)) {
+	if (cache->meta_write_pointer != eb->start) {
 		btrfs_put_block_group(cache);
 		cache = NULL;
-		*cache_ret = NULL;
+		ret = false;
+	} else {
+		cache->meta_write_pointer = eb->start + eb->len;
 	}
 
-	if (!cache)
-		cache = btrfs_lookup_block_group(fs_info, eb->start);
-
-	if (cache) {
-		if (cache->meta_write_pointer != eb->start) {
-			btrfs_put_block_group(cache);
-			cache = NULL;
-			ret = false;
-		} else {
-			cache->meta_write_pointer = eb->start + eb->len;
-		}
-
-		*cache_ret = cache;
-	}
+	*cache_ret = cache;
 
 	return ret;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 554aed7da29bcadb3ee3cfdc1376da660d3fc849 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Tue, 7 Dec 2021 06:28:36 -0800
Subject: btrfs: zoned: sink zone check into btrfs_repair_one_zone

Sink zone check into btrfs_repair_one_zone() so we don't need to do it
in all callers.

Also as btrfs_repair_one_zone() doesn't return a sensible error, make it
a boolean function and return false in case it got called on a non-zoned
filesystem and true on a zoned filesystem.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c |  4 ++--
 fs/btrfs/scrub.c     |  4 ++--
 fs/btrfs/volumes.c   | 13 ++++++++-----
 fs/btrfs/volumes.h   |  2 +-
 4 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e367b4c82cec..790d3255be09 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2314,8 +2314,8 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
 	BUG_ON(!mirror_num);
 
-	if (btrfs_is_zoned(fs_info))
-		return btrfs_repair_one_zone(fs_info, logical);
+	if (btrfs_repair_one_zone(fs_info, logical))
+		return 0;
 
 	bio = btrfs_bio_alloc(1);
 	bio->bi_iter.bi_size = 0;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4afa5ae17989..f28ae41b1df3 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -845,8 +845,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	have_csum = sblock_to_check->pagev[0]->have_csum;
 	dev = sblock_to_check->pagev[0]->dev;
 
-	if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace)
-		return btrfs_repair_one_zone(fs_info, logical);
+	if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
+		return 0;
 
 	/*
 	 * We must use GFP_NOFS because the scrub task might be waiting for a
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9db9d66b5470..f8be80aad8f0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -8341,23 +8341,26 @@ out:
 	return ret;
 }
 
-int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
+bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
 {
 	struct btrfs_block_group *cache;
 
+	if (!btrfs_is_zoned(fs_info))
+		return false;
+
 	/* Do not attempt to repair in degraded state */
 	if (btrfs_test_opt(fs_info, DEGRADED))
-		return 0;
+		return true;
 
 	cache = btrfs_lookup_block_group(fs_info, logical);
 	if (!cache)
-		return 0;
+		return true;
 
 	spin_lock(&cache->lock);
 	if (cache->relocating_repair) {
 		spin_unlock(&cache->lock);
 		btrfs_put_block_group(cache);
-		return 0;
+		return true;
 	}
 	cache->relocating_repair = 1;
 	spin_unlock(&cache->lock);
@@ -8365,5 +8368,5 @@ int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
 	kthread_run(relocating_repair_kthread, cache,
 		    "btrfs-relocating-repair");
 
-	return 0;
+	return true;
 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3b8130680749..9cf1d93a3d66 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -637,6 +637,6 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags
 int btrfs_bg_type_to_factor(u64 flags);
 const char *btrfs_bg_type_to_raid_name(u64 flags);
 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
-int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
+bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 736727100067267708d221632ffff4083fc0b278 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Tue, 7 Dec 2021 06:28:37 -0800
Subject: btrfs: zoned: drop redundant check for REQ_OP_ZONE_APPEND and
 btrfs_is_zoned

REQ_OP_ZONE_APPEND can only work on zoned devices, so it is redundant to
check if the filesystem is zoned when REQ_OP_ZONE_APPEND is set as the
bio's bio_op.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 790d3255be09..38c5e9eb9a10 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3283,8 +3283,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
 	else
 		bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
 
-	if (!btrfs_is_zoned(fs_info) ||
-	    bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
+	if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
 		bio_ctrl->len_to_oe_boundary = U32_MAX;
 		return 0;
 	}
@@ -3339,7 +3338,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
 		bio_set_dev(bio, bdev);
 		wbc_init_bio(wbc, bio);
 	}
-	if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
+	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
 		struct btrfs_device *device;
 
 		device = btrfs_zoned_get_device(fs_info, disk_bytenr,
-- 
cgit v1.2.3-59-g8ed1b


From 1ada69f61c88abb75a1038ee457633325658a183 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Wed, 8 Dec 2021 00:35:47 +0900
Subject: btrfs: zoned: unset dedicated block group on allocation failure

Allocating an extent from a block group can fail for various reasons.
When an allocation from a dedicated block group (for tree-log or
relocation data) fails, we need to unregister it as a dedicated one so
that we can allocate a new block group for the dedicated one.

However, we are returning early when the block group is read-only,
fully used, or when we are not able to activate the zone. As a result,
we keep the non-usable block group as a dedicated one, leading to
further allocation failures. With many block groups, the allocator will
loop hopelessly trying to find a free extent, resulting in a hung task.

Fix the issue by delaying the return and doing the proper cleanups.

CC: stable@vger.kernel.org # 5.16
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7f08c9e8eb28..5362b54e3871 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3805,23 +3805,35 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 	spin_unlock(&fs_info->relocation_bg_lock);
 	if (skip)
 		return 1;
+
 	/* Check RO and no space case before trying to activate it */
 	spin_lock(&block_group->lock);
 	if (block_group->ro ||
 	    block_group->alloc_offset == block_group->zone_capacity) {
-		spin_unlock(&block_group->lock);
-		return 1;
+		ret = 1;
+		/*
+		 * May need to clear fs_info->{treelog,data_reloc}_bg.
+		 * Return the error after taking the locks.
+		 */
 	}
 	spin_unlock(&block_group->lock);
 
-	if (!btrfs_zone_activate(block_group))
-		return 1;
+	if (!ret && !btrfs_zone_activate(block_group)) {
+		ret = 1;
+		/*
+		 * May need to clear fs_info->{treelog,data_reloc}_bg.
+		 * Return the error after taking the locks.
+		 */
+	}
 
 	spin_lock(&space_info->lock);
 	spin_lock(&block_group->lock);
 	spin_lock(&fs_info->treelog_bg_lock);
 	spin_lock(&fs_info->relocation_bg_lock);
 
+	if (ret)
+		goto out;
+
 	ASSERT(!ffe_ctl->for_treelog ||
 	       block_group->start == fs_info->treelog_bg ||
 	       fs_info->treelog_bg == 0);
-- 
cgit v1.2.3-59-g8ed1b


From 50475cd57706359d6cc652be88369dace7a4c2eb Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Wed, 8 Dec 2021 00:35:48 +0900
Subject: btrfs: add extent allocator hook to decide to allocate chunk or not

Introduce a new hook for an extent allocator policy. With the new
hook, a policy can decide to allocate a new block group or not. If
not, it will return -ENOSPC, so btrfs_reserve_extent() will cut the
allocation size in half and retry the allocation if min_alloc_size is
large enough.

The hook is a placeholder for now and will be replaced with the real
implementation in the next patch.

CC: stable@vger.kernel.org # 5.16
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5362b54e3871..910bb08faeb8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3974,6 +3974,19 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
 	}
 }
 
+static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
+			       struct find_free_extent_ctl *ffe_ctl)
+{
+	switch (ffe_ctl->policy) {
+	case BTRFS_EXTENT_ALLOC_CLUSTERED:
+		return true;
+	case BTRFS_EXTENT_ALLOC_ZONED:
+		return true;
+	default:
+		BUG();
+	}
+}
+
 static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
 {
 	switch (ffe_ctl->policy) {
@@ -4061,6 +4074,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
 			struct btrfs_trans_handle *trans;
 			int exist = 0;
 
+			/*Check if allocation policy allows to create a new chunk */
+			if (!can_allocate_chunk(fs_info, ffe_ctl))
+				return -ENOSPC;
+
 			trans = current->journal_info;
 			if (trans)
 				exist = 1;
-- 
cgit v1.2.3-59-g8ed1b


From 82187d2ecdfb22ab7ee05f388402a39236d31428 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Wed, 8 Dec 2021 00:35:49 +0900
Subject: btrfs: zoned: fix chunk allocation condition for zoned allocator

The ZNS specification defines a limit on the number of "active"
zones. That limit forces us to limit the number of block groups which
can be used for an allocation at the same time. Not to exceed the
limit, we reuse the existing active block groups as much as possible
when we can't activate any other zones without sacrificing an already
activated block group in commit a85f05e59bc1 ("btrfs: zoned: avoid
chunk allocation if active block group has enough space").

However, the check is wrong in two ways. First, it checks the
condition for every raid index (ffe_ctl->index). Even if it reaches
the condition and "ffe_ctl->max_extent_size >=
ffe_ctl->min_alloc_size" is met, there can be other block groups
having enough space to hold ffe_ctl->num_bytes. (Actually, this won't
happen in the current zoned code as it only supports SINGLE
profile. But, it can happen once it enables other RAID types.)

Second, it checks the active zone availability depending on the
raid index. The raid index is just an index for
space_info->block_groups, so it has nothing to do with chunk allocation.

These mistakes are causing a faulty allocation in a certain
situation. Consider we are running zoned btrfs on a device whose
max_active_zone == 0 (no limit). And, suppose no block group has
room to fit ffe_ctl->num_bytes but some have room to meet
ffe_ctl->min_alloc_size (i.e. num_bytes > max_extent_size >=
min_alloc_size).

In this situation, the following occurs:

- With SINGLE raid_index, it reaches the chunk allocation checking
  code
- The check returns true because we can activate a new zone (no limit)
- But, before allocating the chunk, it iterates to the next raid index
  (RAID5)
- Since there are no RAID5 block groups on zoned mode, it again
  reaches the check code
- The check returns false because of btrfs_can_activate_zone()'s "if
  (raid_index != BTRFS_RAID_SINGLE)" part
- That results in returning -ENOSPC without allocating a new chunk

As a result, we end up hitting -ENOSPC too early.

Move the check to the right place in the can_allocate_chunk() hook,
and do the active zone check depending on the allocation flag, not on
the raid index.

CC: stable@vger.kernel.org # 5.16
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 21 +++++++++------------
 fs/btrfs/zoned.c       |  5 ++---
 fs/btrfs/zoned.h       |  5 ++---
 3 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 910bb08faeb8..d89273c4b6b8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3981,6 +3981,15 @@ static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
 		return true;
 	case BTRFS_EXTENT_ALLOC_ZONED:
+		/*
+		 * If we have enough free space left in an already
+		 * active block group and we can't activate any other
+		 * zone now, do not allow allocating a new chunk and
+		 * let find_free_extent() retry with a smaller size.
+		 */
+		if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+		    !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+			return false;
 		return true;
 	default:
 		BUG();
@@ -4027,18 +4036,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
 		return 0;
 	}
 
-	if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
-	    !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) {
-		/*
-		 * If we have enough free space left in an already active block
-		 * group and we can't activate any other zone now, retry the
-		 * active ones with a smaller allocation size.  Returning early
-		 * from here will tell btrfs_reserve_extent() to haven the
-		 * size.
-		 */
-		return -ENOSPC;
-	}
-
 	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
 		return 1;
 
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 767ff6cffeae..f559d517c7c4 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1925,7 +1925,7 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
 	return ret;
 }
 
-bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index)
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
 {
 	struct btrfs_device *device;
 	bool ret = false;
@@ -1934,8 +1934,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index
 		return true;
 
 	/* Non-single profiles are not supported yet */
-	if (raid_index != BTRFS_RAID_SINGLE)
-		return false;
+	ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0);
 
 	/* Check if there is a device with active zones left */
 	mutex_lock(&fs_devices->device_list_mutex);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index a7b4cd6dd9f4..cbf016a7bb5d 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -73,8 +73,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
 					    u64 logical, u64 length);
 bool btrfs_zone_activate(struct btrfs_block_group *block_group);
 int btrfs_zone_finish(struct btrfs_block_group *block_group);
-bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
-			     int raid_index);
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
 void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
 			     u64 length);
 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
@@ -226,7 +225,7 @@ static inline int btrfs_zone_finish(struct btrfs_block_group *block_group)
 }
 
 static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
-					   int raid_index)
+					   u64 flags)
 {
 	return true;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 1b58ae0e4d3ede95ee968a3f097298a57cf711ba Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 13 Dec 2021 08:45:14 +0000
Subject: btrfs: skip transaction commit after failure to create subvolume

At ioctl.c:create_subvol(), when we fail to create a subvolume we always
commit the transaction. In most cases this is a no-op, since all the error
paths, except for one, abort the transaction - the only exception is when
we fail to insert the new root item into the root tree, in that case we
don't abort the transaction because we didn't do anything that is
irreversible - however we end up committing the transaction which, although
not a functional problem, adds unnecessary rotation of the backup
roots in the superblock and unnecessary work.

So change that to commit a transaction only when no error happened,
otherwise just call btrfs_end_transaction() to release our reference on
the transaction.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7565b667f4fc..a5bd6926f7ff 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -544,7 +544,6 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
 	struct timespec64 cur_time = current_time(dir);
 	struct inode *inode;
 	int ret;
-	int err;
 	dev_t anon_dev = 0;
 	u64 objectid;
 	u64 index = 0;
@@ -724,9 +723,10 @@ fail:
 	trans->bytes_reserved = 0;
 	btrfs_subvolume_release_metadata(root, &block_rsv);
 
-	err = btrfs_commit_transaction(trans);
-	if (err && !ret)
-		ret = err;
+	if (ret)
+		btrfs_end_transaction(trans);
+	else
+		ret = btrfs_commit_transaction(trans);
 
 	if (!ret) {
 		inode = btrfs_lookup_dentry(dir, dentry);
-- 
cgit v1.2.3-59-g8ed1b


From c1227996438050ee1a592db40404a088a205e66f Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 14 Dec 2021 15:39:39 +0200
Subject: btrfs: refactor unlock_up

The purpose of this function is to unlock all nodes in a btrfs path
which are above 'lowest_unlock' and whose slot used is different than 0.
As such it used a slightly awkward structure of 'if's as well as a somewhat
cryptic "no_skips" control variable which denotes whether we should
check the current level for skippability or not.

This patch does the following (cosmetic) refactorings:

* Renames 'no_skips' to 'check_skip' and makes it a boolean. This
  variable controls whether we are below the lowest_unlock/skip_level
  levels.

* Consolidates the 2 conditions which warrant checking whether the
  current level should be skipped under 1 common if (check_skip) branch,
  this increases the indentation level but is not critical.

* Consolidates the 'skip_level < i && i >= lowest_unlock' and
  'i >= lowest_unlock && i > skip_level' condition into a common branch
  since those are identical.

* Eliminates the local extent_buffer variable as in this case it doesn't
  bring anything to function readability.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index defc494a2d08..a7db3f6f1b7b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1349,33 +1349,34 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 {
 	int i;
 	int skip_level = level;
-	int no_skips = 0;
-	struct extent_buffer *t;
+	bool check_skip = true;
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
 		if (!path->nodes[i])
 			break;
 		if (!path->locks[i])
 			break;
-		if (!no_skips && path->slots[i] == 0) {
-			skip_level = i + 1;
-			continue;
-		}
-		if (!no_skips && path->keep_locks) {
-			u32 nritems;
-			t = path->nodes[i];
-			nritems = btrfs_header_nritems(t);
-			if (nritems < 1 || path->slots[i] >= nritems - 1) {
+
+		if (check_skip) {
+			if (path->slots[i] == 0) {
 				skip_level = i + 1;
 				continue;
 			}
+
+			if (path->keep_locks) {
+				u32 nritems;
+
+				nritems = btrfs_header_nritems(path->nodes[i]);
+				if (nritems < 1 || path->slots[i] >= nritems - 1) {
+					skip_level = i + 1;
+					continue;
+				}
+			}
 		}
-		if (skip_level < i && i >= lowest_unlock)
-			no_skips = 1;
 
-		t = path->nodes[i];
 		if (i >= lowest_unlock && i > skip_level) {
-			btrfs_tree_unlock_rw(t, path->locks[i]);
+			check_skip = false;
+			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
 			path->locks[i] = 0;
 			if (write_lock_level &&
 			    i > min_write_lock_level &&
-- 
cgit v1.2.3-59-g8ed1b


From 2522dbe86b54ff07eb0819a20534c0d33c5414cd Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 14 Dec 2021 21:01:43 +0800
Subject: btrfs: scrub: remove the unnecessary path parameter for
 scrub_raid56_parity()

In function scrub_stripe() we allocated two btrfs_path's, one @path for
extent tree search and another @ppath for full stripe extent tree search
for RAID56.

This is totally unnecessary, as the @ppath usage is completely inside
scrub_raid56_parity(), thus we can move the path allocation into
scrub_raid56_parity() completely.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f28ae41b1df3..92fd2be6cd65 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2885,7 +2885,6 @@ static void scrub_parity_put(struct scrub_parity *sparity)
 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 						  struct map_lookup *map,
 						  struct btrfs_device *sdev,
-						  struct btrfs_path *path,
 						  u64 logic_start,
 						  u64 logic_end)
 {
@@ -2894,6 +2893,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 	struct btrfs_root *csum_root;
 	struct btrfs_extent_item *extent;
 	struct btrfs_io_context *bioc = NULL;
+	struct btrfs_path *path;
 	u64 flags;
 	int ret;
 	int slot;
@@ -2912,6 +2912,16 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 	int extent_mirror_num;
 	int stop_loop = 0;
 
+	path = btrfs_alloc_path();
+	if (!path) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+
 	ASSERT(map->stripe_len <= U32_MAX);
 	nsectors = map->stripe_len >> fs_info->sectorsize_bits;
 	bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
@@ -2921,6 +2931,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
 		spin_unlock(&sctx->stat_lock);
+		btrfs_free_path(path);
 		return -ENOMEM;
 	}
 
@@ -3110,7 +3121,7 @@ out:
 	scrub_wr_submit(sctx);
 	mutex_unlock(&sctx->wr_lock);
 
-	btrfs_release_path(path);
+	btrfs_free_path(path);
 	return ret < 0 ? ret : 0;
 }
 
@@ -3160,7 +3171,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					   int num, u64 base, u64 length,
 					   struct btrfs_block_group *cache)
 {
-	struct btrfs_path *path, *ppath;
+	struct btrfs_path *path;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	struct btrfs_root *root;
 	struct btrfs_root *csum_root;
@@ -3222,12 +3233,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	if (!path)
 		return -ENOMEM;
 
-	ppath = btrfs_alloc_path();
-	if (!ppath) {
-		btrfs_free_path(path);
-		return -ENOMEM;
-	}
-
 	/*
 	 * work on commit root. The related disk blocks are static as
 	 * long as COW is applied. This means, it is save to rewrite
@@ -3236,8 +3241,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 
-	ppath->search_commit_root = 1;
-	ppath->skip_locking = 1;
 	/*
 	 * trigger the readahead for extent tree csum tree and wait for
 	 * completion. During readahead, the scrub is officially paused
@@ -3340,7 +3343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 				stripe_logical += base;
 				stripe_end = stripe_logical + increment;
 				ret = scrub_raid56_parity(sctx, map, scrub_dev,
-							  ppath, stripe_logical,
+							  stripe_logical,
 							  stripe_end);
 				if (ret)
 					goto out;
@@ -3511,7 +3514,7 @@ loop:
 						stripe_end = stripe_logical +
 								increment;
 						ret = scrub_raid56_parity(sctx,
-							map, scrub_dev, ppath,
+							map, scrub_dev,
 							stripe_logical,
 							stripe_end);
 						if (ret)
@@ -3558,7 +3561,6 @@ out:
 
 	blk_finish_plug(&plug);
 	btrfs_free_path(path);
-	btrfs_free_path(ppath);
 
 	if (sctx->is_dev_replace && ret >= 0) {
 		int ret2;
-- 
cgit v1.2.3-59-g8ed1b


From dcf62b204c06ac22a988c8563b20e479d206befc Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 14 Dec 2021 21:01:44 +0800
Subject: btrfs: scrub: use btrfs_path::reada for extent tree readahead

For scrub, we trigger two readaheads for two trees, extent tree to get
where to scrub, and csum tree to get the data checksum.

For csum tree we already trigger readahead in
btrfs_lookup_csums_range(), by setting path->reada.
But for extent tree we don't have any path based readahead.

Add the readahead for extent tree as well, so we can later remove the
btrfs_reada_add() based readahead.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 92fd2be6cd65..ad10a5e70e73 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3240,6 +3240,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	 */
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
+	path->reada = READA_FORWARD;
 
 	/*
 	 * trigger the readahead for extent tree csum tree and wait for
-- 
cgit v1.2.3-59-g8ed1b


From f26c92386028563a1e988bb277c6c5ce2e8010ee Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 14 Dec 2021 21:01:45 +0800
Subject: btrfs: remove reada infrastructure

Currently there is only one user for btrfs metadata readahead, and
that's scrub.

But even for the single user, it's not providing the correct
functionality it needs, as scrub needs reada for commit root, which
current readahead can't provide. (Although it's pretty easy to add such
a feature).

Despite this, there are some extra problems related to metadata
readahead:

- Duplicated feature with btrfs_path::reada

- Partly duplicated feature of btrfs_fs_info::buffer_radix
  Btrfs already caches its metadata in buffer_radix, while readahead
  tries to read the tree block no matter if it's already cached.

- Poor layer separation
  Metadata readahead works kinda at device level.
  This is definitely not the correct layer it should be, since metadata
  is at btrfs logical address space, it should not bother device at all.

  This brings extra chances for bugs to sneak in, while adding
  unnecessary complexity.

- Dead code
  In the very beginning of scrub.c we have #undef DEBUG, rendering all
  the debug related code useless and unable to test.

Thus here I propose to remove the metadata readahead mechanism
completely.

[BENCHMARK]
There is a full benchmark for the scrub performance difference using the
old btrfs_reada_add() and btrfs_path::reada.

For the worst case (no dirty metadata, slow HDD), there could be a 5%
performance drop for scrub.
For other cases (even SATA SSD), there is no distinguishable performance
difference.

The number is reported scrub speed, in MiB/s.
The resolution is limited by the reported duration, which only has a
resolution of 1 second.

	Old		New		Diff
SSD	455.3		466.332		+2.42%
HDD	103.927 	98.012		-5.69%

Comprehensive test methodology is in the cover letter of the patch.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/Makefile      |    2 +-
 fs/btrfs/ctree.h       |   25 --
 fs/btrfs/dev-replace.c |    5 -
 fs/btrfs/disk-io.c     |   20 +-
 fs/btrfs/extent_io.c   |    3 -
 fs/btrfs/reada.c       | 1086 ------------------------------------------------
 fs/btrfs/scrub.c       |   36 --
 fs/btrfs/super.c       |    1 -
 fs/btrfs/volumes.c     |    7 -
 fs/btrfs/volumes.h     |    7 -
 10 files changed, 3 insertions(+), 1189 deletions(-)
 delete mode 100644 fs/btrfs/reada.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 3dcf9bcc2326..4188ba3fd8c3 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -27,7 +27,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+	   backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
 	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
 	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
 	   subpage.o tree-mod-log.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 459d00211181..b4a9b1c58d22 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -821,7 +821,6 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *endio_write_workers;
 	struct btrfs_workqueue *endio_freespace_worker;
 	struct btrfs_workqueue *caching_workers;
-	struct btrfs_workqueue *readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
@@ -958,13 +957,6 @@ struct btrfs_fs_info {
 
 	struct btrfs_delayed_root *delayed_root;
 
-	/* readahead tree */
-	spinlock_t reada_lock;
-	struct radix_tree_root reada_tree;
-
-	/* readahead works cnt */
-	atomic_t reada_works_cnt;
-
 	/* Extent buffer radix tree */
 	spinlock_t buffer_lock;
 	/* Entries are eb->start / sectorsize */
@@ -3807,23 +3799,6 @@ static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
 	btrfs_bio_counter_sub(fs_info, 1);
 }
 
-/* reada.c */
-struct reada_control {
-	struct btrfs_fs_info	*fs_info;		/* tree to prefetch */
-	struct btrfs_key	key_start;
-	struct btrfs_key	key_end;	/* exclusive */
-	atomic_t		elems;
-	struct kref		refcnt;
-	wait_queue_head_t	wait;
-};
-struct reada_control *btrfs_reada_add(struct btrfs_root *root,
-			      struct btrfs_key *start, struct btrfs_key *end);
-int btrfs_reada_wait(void *handle);
-void btrfs_reada_detach(void *handle);
-int btree_readahead_hook(struct extent_buffer *eb, int err);
-void btrfs_reada_remove_dev(struct btrfs_device *dev);
-void btrfs_reada_undo_remove_dev(struct btrfs_device *dev);
-
 static inline int is_fstree(u64 rootid)
 {
 	if (rootid == BTRFS_FS_TREE_OBJECTID ||
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 66fa61cb3f23..62b9651ea662 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -906,9 +906,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	}
 	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 
-	if (!scrub_ret)
-		btrfs_reada_remove_dev(src_device);
-
 	/*
 	 * We have to use this loop approach because at this point src_device
 	 * has to be available for transaction commit to complete, yet new
@@ -917,7 +914,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	while (1) {
 		trans = btrfs_start_transaction(root, 0);
 		if (IS_ERR(trans)) {
-			btrfs_reada_undo_remove_dev(src_device);
 			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 			return PTR_ERR(trans);
 		}
@@ -968,7 +964,6 @@ error:
 		up_write(&dev_replace->rwsem);
 		mutex_unlock(&fs_info->chunk_mutex);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		btrfs_reada_undo_remove_dev(src_device);
 		btrfs_rm_dev_replace_blocked(fs_info);
 		if (tgt_device)
 			btrfs_destroy_dev_replace_tgtdev(tgt_device);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fc7dd5109806..41dea24e2681 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -665,9 +665,6 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
 	if (ret < 0)
 		goto err;
 
-	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
-		btree_readahead_hook(eb, ret);
-
 	set_extent_buffer_uptodate(eb);
 
 	free_extent_buffer(eb);
@@ -715,10 +712,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
 	}
 	ret = validate_extent_buffer(eb);
 err:
-	if (reads_done &&
-	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
-		btree_readahead_hook(eb, ret);
-
 	if (ret) {
 		/*
 		 * our io error hook is going to dec the io pages
@@ -2232,7 +2225,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
 	btrfs_destroy_workqueue(fs_info->delayed_workers);
 	btrfs_destroy_workqueue(fs_info->caching_workers);
-	btrfs_destroy_workqueue(fs_info->readahead_workers);
 	btrfs_destroy_workqueue(fs_info->flush_workers);
 	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
 	if (fs_info->discard_ctl.discard_workers)
@@ -2445,9 +2437,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	fs_info->delayed_workers =
 		btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
 				      max_active, 0);
-	fs_info->readahead_workers =
-		btrfs_alloc_workqueue(fs_info, "readahead", flags,
-				      max_active, 2);
 	fs_info->qgroup_rescan_workers =
 		btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
 	fs_info->discard_ctl.discard_workers =
@@ -2459,9 +2448,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	      fs_info->endio_meta_write_workers &&
 	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
 	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
-	      fs_info->caching_workers && fs_info->readahead_workers &&
-	      fs_info->fixup_workers && fs_info->delayed_workers &&
-	      fs_info->qgroup_rescan_workers &&
+	      fs_info->caching_workers && fs_info->fixup_workers &&
+	      fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
 	      fs_info->discard_ctl.discard_workers)) {
 		return -ENOMEM;
 	}
@@ -3091,7 +3079,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->defrag_running, 0);
-	atomic_set(&fs_info->reada_works_cnt, 0);
 	atomic_set(&fs_info->nr_delayed_iputs, 0);
 	atomic64_set(&fs_info->tree_mod_seq, 0);
 	fs_info->global_root_tree = RB_ROOT;
@@ -3102,9 +3089,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	fs_info->tree_mod_log = RB_ROOT;
 	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
 	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
-	/* readahead state */
-	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
-	spin_lock_init(&fs_info->reada_lock);
 	btrfs_init_ref_verify(fs_info);
 
 	fs_info->thread_pool_size = min_t(unsigned long,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 38c5e9eb9a10..73b2f1c805c0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3087,9 +3087,6 @@ static void end_bio_extent_readpage(struct bio *bio)
 			set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
 			eb->read_mirror = mirror;
 			atomic_dec(&eb->io_pages);
-			if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
-					       &eb->bflags))
-				btree_readahead_hook(eb, -EIO);
 		}
 readpage_ok:
 		if (likely(uptodate)) {
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
deleted file mode 100644
index eb96fdc3be25..000000000000
--- a/fs/btrfs/reada.c
+++ /dev/null
@@ -1,1086 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2011 STRATO.  All rights reserved.
- */
-
-#include <linux/sched.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h>
-#include <linux/blkdev.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>
-#include "ctree.h"
-#include "volumes.h"
-#include "disk-io.h"
-#include "transaction.h"
-#include "dev-replace.h"
-#include "block-group.h"
-
-#undef DEBUG
-
-/*
- * This is the implementation for the generic read ahead framework.
- *
- * To trigger a readahead, btrfs_reada_add must be called. It will start
- * a read ahead for the given range [start, end) on tree root. The returned
- * handle can either be used to wait on the readahead to finish
- * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
- *
- * The read ahead works as follows:
- * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
- * reada_start_machine will then search for extents to prefetch and trigger
- * some reads. When a read finishes for a node, all contained node/leaf
- * pointers that lie in the given range will also be enqueued. The reads will
- * be triggered in sequential order, thus giving a big win over a naive
- * enumeration. It will also make use of multi-device layouts. Each disk
- * will have its on read pointer and all disks will by utilized in parallel.
- * Also will no two disks read both sides of a mirror simultaneously, as this
- * would waste seeking capacity. Instead both disks will read different parts
- * of the filesystem.
- * Any number of readaheads can be started in parallel. The read order will be
- * determined globally, i.e. 2 parallel readaheads will normally finish faster
- * than the 2 started one after another.
- */
-
-#define MAX_IN_FLIGHT 6
-
-struct reada_extctl {
-	struct list_head	list;
-	struct reada_control	*rc;
-	u64			generation;
-};
-
-struct reada_extent {
-	u64			logical;
-	u64			owner_root;
-	struct btrfs_key	top;
-	struct list_head	extctl;
-	int 			refcnt;
-	spinlock_t		lock;
-	struct reada_zone	*zones[BTRFS_MAX_MIRRORS];
-	int			nzones;
-	int			scheduled;
-	int			level;
-};
-
-struct reada_zone {
-	u64			start;
-	u64			end;
-	u64			elems;
-	struct list_head	list;
-	spinlock_t		lock;
-	int			locked;
-	struct btrfs_device	*device;
-	struct btrfs_device	*devs[BTRFS_MAX_MIRRORS]; /* full list, incl
-							   * self */
-	int			ndevs;
-	struct kref		refcnt;
-};
-
-struct reada_machine_work {
-	struct btrfs_work	work;
-	struct btrfs_fs_info	*fs_info;
-};
-
-static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
-static void reada_control_release(struct kref *kref);
-static void reada_zone_release(struct kref *kref);
-static void reada_start_machine(struct btrfs_fs_info *fs_info);
-static void __reada_start_machine(struct btrfs_fs_info *fs_info);
-
-static int reada_add_block(struct reada_control *rc, u64 logical,
-			   struct btrfs_key *top, u64 owner_root,
-			   u64 generation, int level);
-
-/* recurses */
-/* in case of err, eb might be NULL */
-static void __readahead_hook(struct btrfs_fs_info *fs_info,
-			     struct reada_extent *re, struct extent_buffer *eb,
-			     int err)
-{
-	int nritems;
-	int i;
-	u64 bytenr;
-	u64 generation;
-	struct list_head list;
-
-	spin_lock(&re->lock);
-	/*
-	 * just take the full list from the extent. afterwards we
-	 * don't need the lock anymore
-	 */
-	list_replace_init(&re->extctl, &list);
-	re->scheduled = 0;
-	spin_unlock(&re->lock);
-
-	/*
-	 * this is the error case, the extent buffer has not been
-	 * read correctly. We won't access anything from it and
-	 * just cleanup our data structures. Effectively this will
-	 * cut the branch below this node from read ahead.
-	 */
-	if (err)
-		goto cleanup;
-
-	/*
-	 * FIXME: currently we just set nritems to 0 if this is a leaf,
-	 * effectively ignoring the content. In a next step we could
-	 * trigger more readahead depending from the content, e.g.
-	 * fetch the checksums for the extents in the leaf.
-	 */
-	if (!btrfs_header_level(eb))
-		goto cleanup;
-
-	nritems = btrfs_header_nritems(eb);
-	generation = btrfs_header_generation(eb);
-	for (i = 0; i < nritems; i++) {
-		struct reada_extctl *rec;
-		u64 n_gen;
-		struct btrfs_key key;
-		struct btrfs_key next_key;
-
-		btrfs_node_key_to_cpu(eb, &key, i);
-		if (i + 1 < nritems)
-			btrfs_node_key_to_cpu(eb, &next_key, i + 1);
-		else
-			next_key = re->top;
-		bytenr = btrfs_node_blockptr(eb, i);
-		n_gen = btrfs_node_ptr_generation(eb, i);
-
-		list_for_each_entry(rec, &list, list) {
-			struct reada_control *rc = rec->rc;
-
-			/*
-			 * if the generation doesn't match, just ignore this
-			 * extctl. This will probably cut off a branch from
-			 * prefetch. Alternatively one could start a new (sub-)
-			 * prefetch for this branch, starting again from root.
-			 * FIXME: move the generation check out of this loop
-			 */
-#ifdef DEBUG
-			if (rec->generation != generation) {
-				btrfs_debug(fs_info,
-					    "generation mismatch for (%llu,%d,%llu) %llu != %llu",
-					    key.objectid, key.type, key.offset,
-					    rec->generation, generation);
-			}
-#endif
-			if (rec->generation == generation &&
-			    btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
-			    btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
-				reada_add_block(rc, bytenr, &next_key,
-						btrfs_header_owner(eb), n_gen,
-						btrfs_header_level(eb) - 1);
-		}
-	}
-
-cleanup:
-	/*
-	 * free extctl records
-	 */
-	while (!list_empty(&list)) {
-		struct reada_control *rc;
-		struct reada_extctl *rec;
-
-		rec = list_first_entry(&list, struct reada_extctl, list);
-		list_del(&rec->list);
-		rc = rec->rc;
-		kfree(rec);
-
-		kref_get(&rc->refcnt);
-		if (atomic_dec_and_test(&rc->elems)) {
-			kref_put(&rc->refcnt, reada_control_release);
-			wake_up(&rc->wait);
-		}
-		kref_put(&rc->refcnt, reada_control_release);
-
-		reada_extent_put(fs_info, re);	/* one ref for each entry */
-	}
-
-	return;
-}
-
-int btree_readahead_hook(struct extent_buffer *eb, int err)
-{
-	struct btrfs_fs_info *fs_info = eb->fs_info;
-	int ret = 0;
-	struct reada_extent *re;
-
-	/* find extent */
-	spin_lock(&fs_info->reada_lock);
-	re = radix_tree_lookup(&fs_info->reada_tree,
-			       eb->start >> fs_info->sectorsize_bits);
-	if (re)
-		re->refcnt++;
-	spin_unlock(&fs_info->reada_lock);
-	if (!re) {
-		ret = -1;
-		goto start_machine;
-	}
-
-	__readahead_hook(fs_info, re, eb, err);
-	reada_extent_put(fs_info, re);	/* our ref */
-
-start_machine:
-	reada_start_machine(fs_info);
-	return ret;
-}
-
-static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
-					  struct btrfs_io_context *bioc)
-{
-	struct btrfs_fs_info *fs_info = dev->fs_info;
-	int ret;
-	struct reada_zone *zone;
-	struct btrfs_block_group *cache = NULL;
-	u64 start;
-	u64 end;
-	int i;
-
-	zone = NULL;
-	spin_lock(&fs_info->reada_lock);
-	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
-				     logical >> fs_info->sectorsize_bits, 1);
-	if (ret == 1 && logical >= zone->start && logical <= zone->end) {
-		kref_get(&zone->refcnt);
-		spin_unlock(&fs_info->reada_lock);
-		return zone;
-	}
-
-	spin_unlock(&fs_info->reada_lock);
-
-	cache = btrfs_lookup_block_group(fs_info, logical);
-	if (!cache)
-		return NULL;
-
-	start = cache->start;
-	end = start + cache->length - 1;
-	btrfs_put_block_group(cache);
-
-	zone = kzalloc(sizeof(*zone), GFP_KERNEL);
-	if (!zone)
-		return NULL;
-
-	ret = radix_tree_preload(GFP_KERNEL);
-	if (ret) {
-		kfree(zone);
-		return NULL;
-	}
-
-	zone->start = start;
-	zone->end = end;
-	INIT_LIST_HEAD(&zone->list);
-	spin_lock_init(&zone->lock);
-	zone->locked = 0;
-	kref_init(&zone->refcnt);
-	zone->elems = 0;
-	zone->device = dev; /* our device always sits at index 0 */
-	for (i = 0; i < bioc->num_stripes; ++i) {
-		/* bounds have already been checked */
-		zone->devs[i] = bioc->stripes[i].dev;
-	}
-	zone->ndevs = bioc->num_stripes;
-
-	spin_lock(&fs_info->reada_lock);
-	ret = radix_tree_insert(&dev->reada_zones,
-			(unsigned long)(zone->end >> fs_info->sectorsize_bits),
-			zone);
-
-	if (ret == -EEXIST) {
-		kfree(zone);
-		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
-					logical >> fs_info->sectorsize_bits, 1);
-		if (ret == 1 && logical >= zone->start && logical <= zone->end)
-			kref_get(&zone->refcnt);
-		else
-			zone = NULL;
-	}
-	spin_unlock(&fs_info->reada_lock);
-	radix_tree_preload_end();
-
-	return zone;
-}
-
-static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
-					      u64 logical,
-					      struct btrfs_key *top,
-					      u64 owner_root, int level)
-{
-	int ret;
-	struct reada_extent *re = NULL;
-	struct reada_extent *re_exist = NULL;
-	struct btrfs_io_context *bioc = NULL;
-	struct btrfs_device *dev;
-	struct btrfs_device *prev_dev;
-	u64 length;
-	int real_stripes;
-	int nzones = 0;
-	unsigned long index = logical >> fs_info->sectorsize_bits;
-	int dev_replace_is_ongoing;
-	int have_zone = 0;
-
-	spin_lock(&fs_info->reada_lock);
-	re = radix_tree_lookup(&fs_info->reada_tree, index);
-	if (re)
-		re->refcnt++;
-	spin_unlock(&fs_info->reada_lock);
-
-	if (re)
-		return re;
-
-	re = kzalloc(sizeof(*re), GFP_KERNEL);
-	if (!re)
-		return NULL;
-
-	re->logical = logical;
-	re->top = *top;
-	INIT_LIST_HEAD(&re->extctl);
-	spin_lock_init(&re->lock);
-	re->refcnt = 1;
-	re->owner_root = owner_root;
-	re->level = level;
-
-	/*
-	 * map block
-	 */
-	length = fs_info->nodesize;
-	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
-			      &length, &bioc, 0);
-	if (ret || !bioc || length < fs_info->nodesize)
-		goto error;
-
-	if (bioc->num_stripes > BTRFS_MAX_MIRRORS) {
-		btrfs_err(fs_info,
-			   "readahead: more than %d copies not supported",
-			   BTRFS_MAX_MIRRORS);
-		goto error;
-	}
-
-	real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
-	for (nzones = 0; nzones < real_stripes; ++nzones) {
-		struct reada_zone *zone;
-
-		dev = bioc->stripes[nzones].dev;
-
-		/* cannot read ahead on missing device. */
-		if (!dev->bdev)
-			continue;
-
-		zone = reada_find_zone(dev, logical, bioc);
-		if (!zone)
-			continue;
-
-		re->zones[re->nzones++] = zone;
-		spin_lock(&zone->lock);
-		if (!zone->elems)
-			kref_get(&zone->refcnt);
-		++zone->elems;
-		spin_unlock(&zone->lock);
-		spin_lock(&fs_info->reada_lock);
-		kref_put(&zone->refcnt, reada_zone_release);
-		spin_unlock(&fs_info->reada_lock);
-	}
-	if (re->nzones == 0) {
-		/* not a single zone found, error and out */
-		goto error;
-	}
-
-	/* Insert extent in reada tree + all per-device trees, all or nothing */
-	down_read(&fs_info->dev_replace.rwsem);
-	ret = radix_tree_preload(GFP_KERNEL);
-	if (ret) {
-		up_read(&fs_info->dev_replace.rwsem);
-		goto error;
-	}
-
-	spin_lock(&fs_info->reada_lock);
-	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
-	if (ret == -EEXIST) {
-		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
-		re_exist->refcnt++;
-		spin_unlock(&fs_info->reada_lock);
-		radix_tree_preload_end();
-		up_read(&fs_info->dev_replace.rwsem);
-		goto error;
-	}
-	if (ret) {
-		spin_unlock(&fs_info->reada_lock);
-		radix_tree_preload_end();
-		up_read(&fs_info->dev_replace.rwsem);
-		goto error;
-	}
-	radix_tree_preload_end();
-	prev_dev = NULL;
-	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
-			&fs_info->dev_replace);
-	for (nzones = 0; nzones < re->nzones; ++nzones) {
-		dev = re->zones[nzones]->device;
-
-		if (dev == prev_dev) {
-			/*
-			 * in case of DUP, just add the first zone. As both
-			 * are on the same device, there's nothing to gain
-			 * from adding both.
-			 * Also, it wouldn't work, as the tree is per device
-			 * and adding would fail with EEXIST
-			 */
-			continue;
-		}
-		if (!dev->bdev)
-			continue;
-
-		if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
-			continue;
-
-		if (dev_replace_is_ongoing &&
-		    dev == fs_info->dev_replace.tgtdev) {
-			/*
-			 * as this device is selected for reading only as
-			 * a last resort, skip it for read ahead.
-			 */
-			continue;
-		}
-		prev_dev = dev;
-		ret = radix_tree_insert(&dev->reada_extents, index, re);
-		if (ret) {
-			while (--nzones >= 0) {
-				dev = re->zones[nzones]->device;
-				BUG_ON(dev == NULL);
-				/* ignore whether the entry was inserted */
-				radix_tree_delete(&dev->reada_extents, index);
-			}
-			radix_tree_delete(&fs_info->reada_tree, index);
-			spin_unlock(&fs_info->reada_lock);
-			up_read(&fs_info->dev_replace.rwsem);
-			goto error;
-		}
-		have_zone = 1;
-	}
-	if (!have_zone)
-		radix_tree_delete(&fs_info->reada_tree, index);
-	spin_unlock(&fs_info->reada_lock);
-	up_read(&fs_info->dev_replace.rwsem);
-
-	if (!have_zone)
-		goto error;
-
-	btrfs_put_bioc(bioc);
-	return re;
-
-error:
-	for (nzones = 0; nzones < re->nzones; ++nzones) {
-		struct reada_zone *zone;
-
-		zone = re->zones[nzones];
-		kref_get(&zone->refcnt);
-		spin_lock(&zone->lock);
-		--zone->elems;
-		if (zone->elems == 0) {
-			/*
-			 * no fs_info->reada_lock needed, as this can't be
-			 * the last ref
-			 */
-			kref_put(&zone->refcnt, reada_zone_release);
-		}
-		spin_unlock(&zone->lock);
-
-		spin_lock(&fs_info->reada_lock);
-		kref_put(&zone->refcnt, reada_zone_release);
-		spin_unlock(&fs_info->reada_lock);
-	}
-	btrfs_put_bioc(bioc);
-	kfree(re);
-	return re_exist;
-}
-
-static void reada_extent_put(struct btrfs_fs_info *fs_info,
-			     struct reada_extent *re)
-{
-	int i;
-	unsigned long index = re->logical >> fs_info->sectorsize_bits;
-
-	spin_lock(&fs_info->reada_lock);
-	if (--re->refcnt) {
-		spin_unlock(&fs_info->reada_lock);
-		return;
-	}
-
-	radix_tree_delete(&fs_info->reada_tree, index);
-	for (i = 0; i < re->nzones; ++i) {
-		struct reada_zone *zone = re->zones[i];
-
-		radix_tree_delete(&zone->device->reada_extents, index);
-	}
-
-	spin_unlock(&fs_info->reada_lock);
-
-	for (i = 0; i < re->nzones; ++i) {
-		struct reada_zone *zone = re->zones[i];
-
-		kref_get(&zone->refcnt);
-		spin_lock(&zone->lock);
-		--zone->elems;
-		if (zone->elems == 0) {
-			/* no fs_info->reada_lock needed, as this can't be
-			 * the last ref */
-			kref_put(&zone->refcnt, reada_zone_release);
-		}
-		spin_unlock(&zone->lock);
-
-		spin_lock(&fs_info->reada_lock);
-		kref_put(&zone->refcnt, reada_zone_release);
-		spin_unlock(&fs_info->reada_lock);
-	}
-
-	kfree(re);
-}
-
-static void reada_zone_release(struct kref *kref)
-{
-	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
-	struct btrfs_fs_info *fs_info = zone->device->fs_info;
-
-	lockdep_assert_held(&fs_info->reada_lock);
-
-	radix_tree_delete(&zone->device->reada_zones,
-			  zone->end >> fs_info->sectorsize_bits);
-
-	kfree(zone);
-}
-
-static void reada_control_release(struct kref *kref)
-{
-	struct reada_control *rc = container_of(kref, struct reada_control,
-						refcnt);
-
-	kfree(rc);
-}
-
-static int reada_add_block(struct reada_control *rc, u64 logical,
-			   struct btrfs_key *top, u64 owner_root,
-			   u64 generation, int level)
-{
-	struct btrfs_fs_info *fs_info = rc->fs_info;
-	struct reada_extent *re;
-	struct reada_extctl *rec;
-
-	/* takes one ref */
-	re = reada_find_extent(fs_info, logical, top, owner_root, level);
-	if (!re)
-		return -1;
-
-	rec = kzalloc(sizeof(*rec), GFP_KERNEL);
-	if (!rec) {
-		reada_extent_put(fs_info, re);
-		return -ENOMEM;
-	}
-
-	rec->rc = rc;
-	rec->generation = generation;
-	atomic_inc(&rc->elems);
-
-	spin_lock(&re->lock);
-	list_add_tail(&rec->list, &re->extctl);
-	spin_unlock(&re->lock);
-
-	/* leave the ref on the extent */
-
-	return 0;
-}
-
-/*
- * called with fs_info->reada_lock held
- */
-static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
-{
-	int i;
-	unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits;
-
-	for (i = 0; i < zone->ndevs; ++i) {
-		struct reada_zone *peer;
-		peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
-		if (peer && peer->device != zone->device)
-			peer->locked = lock;
-	}
-}
-
-/*
- * called with fs_info->reada_lock held
- */
-static int reada_pick_zone(struct btrfs_device *dev)
-{
-	struct reada_zone *top_zone = NULL;
-	struct reada_zone *top_locked_zone = NULL;
-	u64 top_elems = 0;
-	u64 top_locked_elems = 0;
-	unsigned long index = 0;
-	int ret;
-
-	if (dev->reada_curr_zone) {
-		reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
-		kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
-		dev->reada_curr_zone = NULL;
-	}
-	/* pick the zone with the most elements */
-	while (1) {
-		struct reada_zone *zone;
-
-		ret = radix_tree_gang_lookup(&dev->reada_zones,
-					     (void **)&zone, index, 1);
-		if (ret == 0)
-			break;
-		index = (zone->end >> dev->fs_info->sectorsize_bits) + 1;
-		if (zone->locked) {
-			if (zone->elems > top_locked_elems) {
-				top_locked_elems = zone->elems;
-				top_locked_zone = zone;
-			}
-		} else {
-			if (zone->elems > top_elems) {
-				top_elems = zone->elems;
-				top_zone = zone;
-			}
-		}
-	}
-	if (top_zone)
-		dev->reada_curr_zone = top_zone;
-	else if (top_locked_zone)
-		dev->reada_curr_zone = top_locked_zone;
-	else
-		return 0;
-
-	dev->reada_next = dev->reada_curr_zone->start;
-	kref_get(&dev->reada_curr_zone->refcnt);
-	reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
-
-	return 1;
-}
-
-static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
-				    u64 owner_root, int level, int mirror_num,
-				    struct extent_buffer **eb)
-{
-	struct extent_buffer *buf = NULL;
-	int ret;
-
-	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
-	if (IS_ERR(buf))
-		return 0;
-
-	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
-
-	ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
-	if (ret) {
-		free_extent_buffer_stale(buf);
-		return ret;
-	}
-
-	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
-		free_extent_buffer_stale(buf);
-		return -EIO;
-	} else if (extent_buffer_uptodate(buf)) {
-		*eb = buf;
-	} else {
-		free_extent_buffer(buf);
-	}
-	return 0;
-}
-
-static int reada_start_machine_dev(struct btrfs_device *dev)
-{
-	struct btrfs_fs_info *fs_info = dev->fs_info;
-	struct reada_extent *re = NULL;
-	int mirror_num = 0;
-	struct extent_buffer *eb = NULL;
-	u64 logical;
-	int ret;
-	int i;
-
-	spin_lock(&fs_info->reada_lock);
-	if (dev->reada_curr_zone == NULL) {
-		ret = reada_pick_zone(dev);
-		if (!ret) {
-			spin_unlock(&fs_info->reada_lock);
-			return 0;
-		}
-	}
-	/*
-	 * FIXME currently we issue the reads one extent at a time. If we have
-	 * a contiguous block of extents, we could also coagulate them or use
-	 * plugging to speed things up
-	 */
-	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
-				dev->reada_next >> fs_info->sectorsize_bits, 1);
-	if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
-		ret = reada_pick_zone(dev);
-		if (!ret) {
-			spin_unlock(&fs_info->reada_lock);
-			return 0;
-		}
-		re = NULL;
-		ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
-				dev->reada_next >> fs_info->sectorsize_bits, 1);
-	}
-	if (ret == 0) {
-		spin_unlock(&fs_info->reada_lock);
-		return 0;
-	}
-	dev->reada_next = re->logical + fs_info->nodesize;
-	re->refcnt++;
-
-	spin_unlock(&fs_info->reada_lock);
-
-	spin_lock(&re->lock);
-	if (re->scheduled || list_empty(&re->extctl)) {
-		spin_unlock(&re->lock);
-		reada_extent_put(fs_info, re);
-		return 0;
-	}
-	re->scheduled = 1;
-	spin_unlock(&re->lock);
-
-	/*
-	 * find mirror num
-	 */
-	for (i = 0; i < re->nzones; ++i) {
-		if (re->zones[i]->device == dev) {
-			mirror_num = i + 1;
-			break;
-		}
-	}
-	logical = re->logical;
-
-	atomic_inc(&dev->reada_in_flight);
-	ret = reada_tree_block_flagged(fs_info, logical, re->owner_root,
-				       re->level, mirror_num, &eb);
-	if (ret)
-		__readahead_hook(fs_info, re, NULL, ret);
-	else if (eb)
-		__readahead_hook(fs_info, re, eb, ret);
-
-	if (eb)
-		free_extent_buffer(eb);
-
-	atomic_dec(&dev->reada_in_flight);
-	reada_extent_put(fs_info, re);
-
-	return 1;
-
-}
-
-static void reada_start_machine_worker(struct btrfs_work *work)
-{
-	struct reada_machine_work *rmw;
-	int old_ioprio;
-
-	rmw = container_of(work, struct reada_machine_work, work);
-
-	old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
-				       task_nice_ioprio(current));
-	set_task_ioprio(current, BTRFS_IOPRIO_READA);
-	__reada_start_machine(rmw->fs_info);
-	set_task_ioprio(current, old_ioprio);
-
-	atomic_dec(&rmw->fs_info->reada_works_cnt);
-
-	kfree(rmw);
-}
-
-/* Try to start up to 10k READA requests for a group of devices */
-static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
-{
-	u64 enqueued;
-	u64 total = 0;
-	struct btrfs_device *device;
-
-	do {
-		enqueued = 0;
-		list_for_each_entry(device, &fs_devices->devices, dev_list) {
-			if (atomic_read(&device->reada_in_flight) <
-			    MAX_IN_FLIGHT)
-				enqueued += reada_start_machine_dev(device);
-		}
-		total += enqueued;
-	} while (enqueued && total < 10000);
-
-	return total;
-}
-
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
-	int i;
-	u64 enqueued = 0;
-
-	mutex_lock(&fs_devices->device_list_mutex);
-
-	enqueued += reada_start_for_fsdevs(fs_devices);
-	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
-		enqueued += reada_start_for_fsdevs(seed_devs);
-
-	mutex_unlock(&fs_devices->device_list_mutex);
-	if (enqueued == 0)
-		return;
-
-	/*
-	 * If everything is already in the cache, this is effectively single
-	 * threaded. To a) not hold the caller for too long and b) to utilize
-	 * more cores, we broke the loop above after 10000 iterations and now
-	 * enqueue to workers to finish it. This will distribute the load to
-	 * the cores.
-	 */
-	for (i = 0; i < 2; ++i) {
-		reada_start_machine(fs_info);
-		if (atomic_read(&fs_info->reada_works_cnt) >
-		    BTRFS_MAX_MIRRORS * 2)
-			break;
-	}
-}
-
-static void reada_start_machine(struct btrfs_fs_info *fs_info)
-{
-	struct reada_machine_work *rmw;
-
-	rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
-	if (!rmw) {
-		/* FIXME we cannot handle this properly right now */
-		BUG();
-	}
-	btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
-	rmw->fs_info = fs_info;
-
-	btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
-	atomic_inc(&fs_info->reada_works_cnt);
-}
-
-#ifdef DEBUG
-static void dump_devs(struct btrfs_fs_info *fs_info, int all)
-{
-	struct btrfs_device *device;
-	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
-	unsigned long index;
-	int ret;
-	int i;
-	int j;
-	int cnt;
-
-	spin_lock(&fs_info->reada_lock);
-	list_for_each_entry(device, &fs_devices->devices, dev_list) {
-		btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid,
-			atomic_read(&device->reada_in_flight));
-		index = 0;
-		while (1) {
-			struct reada_zone *zone;
-			ret = radix_tree_gang_lookup(&device->reada_zones,
-						     (void **)&zone, index, 1);
-			if (ret == 0)
-				break;
-			pr_debug("  zone %llu-%llu elems %llu locked %d devs",
-				    zone->start, zone->end, zone->elems,
-				    zone->locked);
-			for (j = 0; j < zone->ndevs; ++j) {
-				pr_cont(" %lld",
-					zone->devs[j]->devid);
-			}
-			if (device->reada_curr_zone == zone)
-				pr_cont(" curr off %llu",
-					device->reada_next - zone->start);
-			pr_cont("\n");
-			index = (zone->end >> fs_info->sectorsize_bits) + 1;
-		}
-		cnt = 0;
-		index = 0;
-		while (all) {
-			struct reada_extent *re = NULL;
-
-			ret = radix_tree_gang_lookup(&device->reada_extents,
-						     (void **)&re, index, 1);
-			if (ret == 0)
-				break;
-			pr_debug("  re: logical %llu size %u empty %d scheduled %d",
-				re->logical, fs_info->nodesize,
-				list_empty(&re->extctl), re->scheduled);
-
-			for (i = 0; i < re->nzones; ++i) {
-				pr_cont(" zone %llu-%llu devs",
-					re->zones[i]->start,
-					re->zones[i]->end);
-				for (j = 0; j < re->zones[i]->ndevs; ++j) {
-					pr_cont(" %lld",
-						re->zones[i]->devs[j]->devid);
-				}
-			}
-			pr_cont("\n");
-			index = (re->logical >> fs_info->sectorsize_bits) + 1;
-			if (++cnt > 15)
-				break;
-		}
-	}
-
-	index = 0;
-	cnt = 0;
-	while (all) {
-		struct reada_extent *re = NULL;
-
-		ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
-					     index, 1);
-		if (ret == 0)
-			break;
-		if (!re->scheduled) {
-			index = (re->logical >> fs_info->sectorsize_bits) + 1;
-			continue;
-		}
-		pr_debug("re: logical %llu size %u list empty %d scheduled %d",
-			re->logical, fs_info->nodesize,
-			list_empty(&re->extctl), re->scheduled);
-		for (i = 0; i < re->nzones; ++i) {
-			pr_cont(" zone %llu-%llu devs",
-				re->zones[i]->start,
-				re->zones[i]->end);
-			for (j = 0; j < re->zones[i]->ndevs; ++j) {
-				pr_cont(" %lld",
-				       re->zones[i]->devs[j]->devid);
-			}
-		}
-		pr_cont("\n");
-		index = (re->logical >> fs_info->sectorsize_bits) + 1;
-	}
-	spin_unlock(&fs_info->reada_lock);
-}
-#endif
-
-/*
- * interface
- */
-struct reada_control *btrfs_reada_add(struct btrfs_root *root,
-			struct btrfs_key *key_start, struct btrfs_key *key_end)
-{
-	struct reada_control *rc;
-	u64 start;
-	u64 generation;
-	int ret;
-	int level;
-	struct extent_buffer *node;
-	static struct btrfs_key max_key = {
-		.objectid = (u64)-1,
-		.type = (u8)-1,
-		.offset = (u64)-1
-	};
-
-	rc = kzalloc(sizeof(*rc), GFP_KERNEL);
-	if (!rc)
-		return ERR_PTR(-ENOMEM);
-
-	rc->fs_info = root->fs_info;
-	rc->key_start = *key_start;
-	rc->key_end = *key_end;
-	atomic_set(&rc->elems, 0);
-	init_waitqueue_head(&rc->wait);
-	kref_init(&rc->refcnt);
-	kref_get(&rc->refcnt); /* one ref for having elements */
-
-	node = btrfs_root_node(root);
-	start = node->start;
-	generation = btrfs_header_generation(node);
-	level = btrfs_header_level(node);
-	free_extent_buffer(node);
-
-	ret = reada_add_block(rc, start, &max_key, root->root_key.objectid,
-			      generation, level);
-	if (ret) {
-		kfree(rc);
-		return ERR_PTR(ret);
-	}
-
-	reada_start_machine(root->fs_info);
-
-	return rc;
-}
-
-#ifdef DEBUG
-int btrfs_reada_wait(void *handle)
-{
-	struct reada_control *rc = handle;
-	struct btrfs_fs_info *fs_info = rc->fs_info;
-
-	while (atomic_read(&rc->elems)) {
-		if (!atomic_read(&fs_info->reada_works_cnt))
-			reada_start_machine(fs_info);
-		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
-				   5 * HZ);
-		dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
-	}
-
-	dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
-
-	kref_put(&rc->refcnt, reada_control_release);
-
-	return 0;
-}
-#else
-int btrfs_reada_wait(void *handle)
-{
-	struct reada_control *rc = handle;
-	struct btrfs_fs_info *fs_info = rc->fs_info;
-
-	while (atomic_read(&rc->elems)) {
-		if (!atomic_read(&fs_info->reada_works_cnt))
-			reada_start_machine(fs_info);
-		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
-				   (HZ + 9) / 10);
-	}
-
-	kref_put(&rc->refcnt, reada_control_release);
-
-	return 0;
-}
-#endif
-
-void btrfs_reada_detach(void *handle)
-{
-	struct reada_control *rc = handle;
-
-	kref_put(&rc->refcnt, reada_control_release);
-}
-
-/*
- * Before removing a device (device replace or device remove ioctls), call this
- * function to wait for all existing readahead requests on the device and to
- * make sure no one queues more readahead requests for the device.
- *
- * Must be called without holding neither the device list mutex nor the device
- * replace semaphore, otherwise it will deadlock.
- */
-void btrfs_reada_remove_dev(struct btrfs_device *dev)
-{
-	struct btrfs_fs_info *fs_info = dev->fs_info;
-
-	/* Serialize with readahead extent creation at reada_find_extent(). */
-	spin_lock(&fs_info->reada_lock);
-	set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
-	spin_unlock(&fs_info->reada_lock);
-
-	/*
-	 * There might be readahead requests added to the radix trees which
-	 * were not yet added to the readahead work queue. We need to start
-	 * them and wait for their completion, otherwise we can end up with
-	 * use-after-free problems when dropping the last reference on the
-	 * readahead extents and their zones, as they need to access the
-	 * device structure.
-	 */
-	reada_start_machine(fs_info);
-	btrfs_flush_workqueue(fs_info->readahead_workers);
-}
-
-/*
- * If when removing a device (device replace or device remove ioctls) an error
- * happens after calling btrfs_reada_remove_dev(), call this to undo what that
- * function did. This is safe to call even if btrfs_reada_remove_dev() was not
- * called before.
- */
-void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
-{
-	spin_lock(&dev->fs_info->reada_lock);
-	clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
-	spin_unlock(&dev->fs_info->reada_lock);
-}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ad10a5e70e73..679e881d7814 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3188,10 +3188,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	u64 physical_end;
 	u64 generation;
 	int mirror_num;
-	struct reada_control *reada1;
-	struct reada_control *reada2;
 	struct btrfs_key key;
-	struct btrfs_key key_end;
 	u64 increment = map->stripe_len;
 	u64 offset;
 	u64 extent_logical;
@@ -3242,11 +3239,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	path->skip_locking = 1;
 	path->reada = READA_FORWARD;
 
-	/*
-	 * trigger the readahead for extent tree csum tree and wait for
-	 * completion. During readahead, the scrub is officially paused
-	 * to not hold off transaction commits
-	 */
 	logical = base + offset;
 	physical_end = physical + nstripes * map->stripe_len;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -3261,36 +3253,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	scrub_blocked_if_needed(fs_info);
 
 	root = btrfs_extent_root(fs_info, logical);
-
-	/* FIXME it might be better to start readahead at commit root */
-	key.objectid = logical;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = (u64)0;
-	key_end.objectid = logic_end;
-	key_end.type = BTRFS_METADATA_ITEM_KEY;
-	key_end.offset = (u64)-1;
-	reada1 = btrfs_reada_add(root, &key, &key_end);
-
 	csum_root = btrfs_csum_root(fs_info, logical);
 
-	if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
-		key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
-		key.type = BTRFS_EXTENT_CSUM_KEY;
-		key.offset = logical;
-		key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
-		key_end.type = BTRFS_EXTENT_CSUM_KEY;
-		key_end.offset = logic_end;
-		reada2 = btrfs_reada_add(csum_root, &key, &key_end);
-	} else {
-		reada2 = NULL;
-	}
-
-	if (!IS_ERR(reada1))
-		btrfs_reada_wait(reada1);
-	if (!IS_ERR_OR_NULL(reada2))
-		btrfs_reada_wait(reada2);
-
-
 	/*
 	 * collect all data csums for the stripe to avoid seeking during
 	 * the scrub. This might currently (crc32) end up to be about 1MB
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a1c54a2c787c..0ec09fe01be6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1842,7 +1842,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
-	btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
 				new_pool_size);
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f8be80aad8f0..b07d382d53a8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1166,7 +1166,6 @@ static void btrfs_close_one_device(struct btrfs_device *device)
 	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
 	ASSERT(list_empty(&device->dev_alloc_list));
 	ASSERT(list_empty(&device->post_commit_list));
-	ASSERT(atomic_read(&device->reada_in_flight) == 0);
 }
 
 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -2150,8 +2149,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 	}
 
 	ret = btrfs_shrink_device(device, 0);
-	if (!ret)
-		btrfs_reada_remove_dev(device);
 	if (ret)
 		goto error_undo;
 
@@ -2249,7 +2246,6 @@ out:
 	return ret;
 
 error_undo:
-	btrfs_reada_undo_remove_dev(device);
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		mutex_lock(&fs_info->chunk_mutex);
 		list_add(&device->dev_alloc_list,
@@ -6980,11 +6976,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 	INIT_LIST_HEAD(&dev->dev_alloc_list);
 	INIT_LIST_HEAD(&dev->post_commit_list);
 
-	atomic_set(&dev->reada_in_flight, 0);
 	atomic_set(&dev->dev_stats_ccnt, 0);
 	btrfs_device_data_ordered_init(dev);
-	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
-	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 	extent_io_tree_init(fs_info, &dev->alloc_state,
 			    IO_TREE_DEVICE_ALLOC_STATE, NULL);
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9cf1d93a3d66..005c9e2a491a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -123,13 +123,6 @@ struct btrfs_device {
 	/* per-device scrub information */
 	struct scrub_ctx *scrub_ctx;
 
-	/* readahead state */
-	atomic_t reada_in_flight;
-	u64 reada_next;
-	struct reada_zone *reada_curr_zone;
-	struct radix_tree_root reada_zones;
-	struct radix_tree_root reada_extents;
-
 	/* disk I/O failure stats. For detailed description refer to
 	 * enum btrfs_dev_stat_values in ioctl.h */
 	int dev_stats_valid;
-- 
cgit v1.2.3-59-g8ed1b


From d04fbe19aefd28570a442e79aae226dd742ddc4c Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Wed, 15 Dec 2021 14:59:41 +0800
Subject: btrfs: scrub: cleanup the argument list of scrub_chunk()

The argument list of scrub_chunk() has the following problems:

- Duplicated @chunk_offset
  It is the same as btrfs_block_group::start.

- Confusing @length
  The most instinctive guess is chunk length, and one may want to delete
  it, but the truth is, it's the device extent length.

Fix this by:

- Remove @chunk_offset
  Use btrfs_block_group::start instead.

- Rename @length to @dev_extent_len
  Also rename the variable in the caller to remove the ambiguous naming.

- Rename @cache to @bg
  The "_cache" suffix for btrfs_block_group has been removed for a while.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 679e881d7814..834a71948d20 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3541,10 +3541,10 @@ out:
 }
 
 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+					  struct btrfs_block_group *bg,
 					  struct btrfs_device *scrub_dev,
-					  u64 chunk_offset, u64 length,
 					  u64 dev_offset,
-					  struct btrfs_block_group *cache)
+					  u64 dev_extent_len)
 {
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
@@ -3554,7 +3554,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 	int ret = 0;
 
 	read_lock(&map_tree->lock);
-	em = lookup_extent_mapping(map_tree, chunk_offset, 1);
+	em = lookup_extent_mapping(map_tree, bg->start, bg->length);
 	read_unlock(&map_tree->lock);
 
 	if (!em) {
@@ -3562,26 +3562,24 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 		 * Might have been an unused block group deleted by the cleaner
 		 * kthread or relocation.
 		 */
-		spin_lock(&cache->lock);
-		if (!cache->removed)
+		spin_lock(&bg->lock);
+		if (!bg->removed)
 			ret = -EINVAL;
-		spin_unlock(&cache->lock);
+		spin_unlock(&bg->lock);
 
 		return ret;
 	}
-
-	map = em->map_lookup;
-	if (em->start != chunk_offset)
+	if (em->start != bg->start)
 		goto out;
-
-	if (em->len < length)
+	if (em->len < dev_extent_len)
 		goto out;
 
+	map = em->map_lookup;
 	for (i = 0; i < map->num_stripes; ++i) {
 		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
 		    map->stripes[i].physical == dev_offset) {
 			ret = scrub_stripe(sctx, map, scrub_dev, i,
-					   chunk_offset, length, cache);
+					   bg->start, dev_extent_len, bg);
 			if (ret)
 				goto out;
 		}
@@ -3619,7 +3617,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 	struct btrfs_path *path;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	struct btrfs_root *root = fs_info->dev_root;
-	u64 length;
 	u64 chunk_offset;
 	int ret = 0;
 	int ro_set;
@@ -3643,6 +3640,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
 	while (1) {
+		u64 dev_extent_len;
+
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			break;
@@ -3679,9 +3678,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			break;
 
 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
-		length = btrfs_dev_extent_length(l, dev_extent);
+		dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
 
-		if (found_key.offset + length <= start)
+		if (found_key.offset + dev_extent_len <= start)
 			goto skip;
 
 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
@@ -3815,13 +3814,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 
 		scrub_pause_off(fs_info);
 		down_write(&dev_replace->rwsem);
-		dev_replace->cursor_right = found_key.offset + length;
+		dev_replace->cursor_right = found_key.offset + dev_extent_len;
 		dev_replace->cursor_left = found_key.offset;
 		dev_replace->item_needs_writeback = 1;
 		up_write(&dev_replace->rwsem);
 
-		ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
-				  found_key.offset, cache);
+		ASSERT(cache->start == chunk_offset);
+		ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
+				  dev_extent_len);
 
 		/*
 		 * flush, submit all pending read and write bios, afterwards
@@ -3902,7 +3902,7 @@ skip_unfreeze:
 			break;
 		}
 skip:
-		key.offset = found_key.offset + length;
+		key.offset = found_key.offset + dev_extent_len;
 		btrfs_release_path(path);
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 2ae8ae3d3def4c3ba73a58cc2531c42f0916f14e Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Wed, 15 Dec 2021 14:59:42 +0800
Subject: btrfs: scrub: cleanup the argument list of scrub_stripe()

The argument list of scrub_stripe() has similar problems to that of
scrub_chunk():

- Duplicated and ambiguous @base argument
  Can be fetched from btrfs_block_group::start.

- Ambiguous argument @length
  It's again device extent length

- Ambiguous argument @num
  The instinctive guess would be mirror number, but in fact it's stripe
  index.

Fix it by:

- Remove @base parameter

- Rename @length to @dev_extent_len

- Rename @num to @stripe_index

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 67 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 35 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 834a71948d20..2e9a322773f2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3166,10 +3166,10 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
 }
 
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+					   struct btrfs_block_group *bg,
 					   struct map_lookup *map,
 					   struct btrfs_device *scrub_dev,
-					   int num, u64 base, u64 length,
-					   struct btrfs_block_group *cache)
+					   int stripe_index, u64 dev_extent_len)
 {
 	struct btrfs_path *path;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -3177,6 +3177,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	struct btrfs_root *csum_root;
 	struct btrfs_extent_item *extent;
 	struct blk_plug plug;
+	const u64 chunk_logical = bg->start;
 	u64 flags;
 	int ret;
 	int slot;
@@ -3204,25 +3205,26 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	int extent_mirror_num;
 	int stop_loop = 0;
 
-	physical = map->stripes[num].physical;
+	physical = map->stripes[stripe_index].physical;
 	offset = 0;
-	nstripes = div64_u64(length, map->stripe_len);
+	nstripes = div64_u64(dev_extent_len, map->stripe_len);
 	mirror_num = 1;
 	increment = map->stripe_len;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-		offset = map->stripe_len * num;
+		offset = map->stripe_len * stripe_index;
 		increment = map->stripe_len * map->num_stripes;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
-		offset = map->stripe_len * (num / map->sub_stripes);
+		offset = map->stripe_len * (stripe_index / map->sub_stripes);
 		increment = map->stripe_len * factor;
-		mirror_num = num % map->sub_stripes + 1;
+		mirror_num = stripe_index % map->sub_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
-		mirror_num = num % map->num_stripes + 1;
+		mirror_num = stripe_index % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		mirror_num = num % map->num_stripes + 1;
+		mirror_num = stripe_index % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		get_raid56_logic_offset(physical, num, map, &offset, NULL);
+		get_raid56_logic_offset(physical, stripe_index, map, &offset,
+					NULL);
 		increment = map->stripe_len * nr_data_stripes(map);
 	}
 
@@ -3239,12 +3241,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	path->skip_locking = 1;
 	path->reada = READA_FORWARD;
 
-	logical = base + offset;
+	logical = chunk_logical + offset;
 	physical_end = physical + nstripes * map->stripe_len;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		get_raid56_logic_offset(physical_end, num,
+		get_raid56_logic_offset(physical_end, stripe_index,
 					map, &logic_end, NULL);
-		logic_end += base;
+		logic_end += chunk_logical;
 	} else {
 		logic_end = logical + increment * nstripes;
 	}
@@ -3299,13 +3301,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 		}
 
 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-			ret = get_raid56_logic_offset(physical, num, map,
-						      &logical,
+			ret = get_raid56_logic_offset(physical, stripe_index,
+						      map, &logical,
 						      &stripe_logical);
-			logical += base;
+			logical += chunk_logical;
 			if (ret) {
 				/* it is parity strip */
-				stripe_logical += base;
+				stripe_logical += chunk_logical;
 				stripe_end = stripe_logical + increment;
 				ret = scrub_raid56_parity(sctx, map, scrub_dev,
 							  stripe_logical,
@@ -3385,13 +3387,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			 * Continuing would prevent reusing its device extents
 			 * for new block groups for a long time.
 			 */
-			spin_lock(&cache->lock);
-			if (cache->removed) {
-				spin_unlock(&cache->lock);
+			spin_lock(&bg->lock);
+			if (bg->removed) {
+				spin_unlock(&bg->lock);
 				ret = 0;
 				goto out;
 			}
-			spin_unlock(&cache->lock);
+			spin_unlock(&bg->lock);
 
 			extent = btrfs_item_ptr(l, slot,
 						struct btrfs_extent_item);
@@ -3470,12 +3472,12 @@ again:
 loop:
 					physical += map->stripe_len;
 					ret = get_raid56_logic_offset(physical,
-							num, map, &logical,
-							&stripe_logical);
-					logical += base;
+							stripe_index, map,
+							&logical, &stripe_logical);
+					logical += chunk_logical;
 
 					if (ret && physical < physical_end) {
-						stripe_logical += base;
+						stripe_logical += chunk_logical;
 						stripe_end = stripe_logical +
 								increment;
 						ret = scrub_raid56_parity(sctx,
@@ -3509,8 +3511,8 @@ skip:
 		physical += map->stripe_len;
 		spin_lock(&sctx->stat_lock);
 		if (stop_loop)
-			sctx->stat.last_physical = map->stripes[num].physical +
-						   length;
+			sctx->stat.last_physical = map->stripes[stripe_index].physical +
+						   dev_extent_len;
 		else
 			sctx->stat.last_physical = physical;
 		spin_unlock(&sctx->stat_lock);
@@ -3530,9 +3532,10 @@ out:
 	if (sctx->is_dev_replace && ret >= 0) {
 		int ret2;
 
-		ret2 = sync_write_pointer_for_zoned(sctx, base + offset,
-						    map->stripes[num].physical,
-						    physical_end);
+		ret2 = sync_write_pointer_for_zoned(sctx,
+				chunk_logical + offset,
+				map->stripes[stripe_index].physical,
+				physical_end);
 		if (ret2)
 			ret = ret2;
 	}
@@ -3578,8 +3581,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 	for (i = 0; i < map->num_stripes; ++i) {
 		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
 		    map->stripes[i].physical == dev_offset) {
-			ret = scrub_stripe(sctx, map, scrub_dev, i,
-					   bg->start, dev_extent_len, bg);
+			ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
+					   dev_extent_len);
 			if (ret)
 				goto out;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 856e47946c6da280816ed9b9c32083c102838ba0 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 30 Dec 2021 16:45:13 +0800
Subject: btrfs: selftests: dump extent io tree if extent-io-tree test failed

When code touching the extent-io-tree gets modified and the selftest
fails as a result, it can take some time to pin down the cause.

To make it easier to expose the problem, dump the extent io tree if the
selftest failed.

This can save developers debugging time, especially since in the
selftests we cannot use the trace events and thus have to manually add
debug trace points.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tests/extent-io-tests.c | 52 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index c2e72e7a8ff0..a232b15b8021 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -56,6 +56,54 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
 	return count;
 }
 
+#define STATE_FLAG_STR_LEN			256
+
+#define PRINT_ONE_FLAG(state, dest, cur, name)				\
+({									\
+	if (state->state & EXTENT_##name)				\
+		cur += scnprintf(dest + cur, STATE_FLAG_STR_LEN - cur,	\
+				 "%s" #name, cur == 0 ? "" : "|");	\
+})
+
+static void extent_flag_to_str(const struct extent_state *state, char *dest)
+{
+	int cur = 0;
+
+	dest[0] = 0;
+	PRINT_ONE_FLAG(state, dest, cur, DIRTY);
+	PRINT_ONE_FLAG(state, dest, cur, UPTODATE);
+	PRINT_ONE_FLAG(state, dest, cur, LOCKED);
+	PRINT_ONE_FLAG(state, dest, cur, NEW);
+	PRINT_ONE_FLAG(state, dest, cur, DELALLOC);
+	PRINT_ONE_FLAG(state, dest, cur, DEFRAG);
+	PRINT_ONE_FLAG(state, dest, cur, BOUNDARY);
+	PRINT_ONE_FLAG(state, dest, cur, NODATASUM);
+	PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV);
+	PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT);
+	PRINT_ONE_FLAG(state, dest, cur, DAMAGED);
+	PRINT_ONE_FLAG(state, dest, cur, NORESERVE);
+	PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED);
+	PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV);
+}
+
+static void dump_extent_io_tree(const struct extent_io_tree *tree)
+{
+	struct rb_node *node;
+	char flags_str[STATE_FLAG_STR_LEN];
+
+	node = rb_first(&tree->state);
+	test_msg("io tree content:");
+	while (node) {
+		struct extent_state *state;
+
+		state = rb_entry(node, struct extent_state, rb_node);
+		extent_flag_to_str(state, flags_str);
+		test_msg("  start=%llu len=%llu flags=%s", state->start,
+			 state->end + 1 - state->start, flags_str);
+		node = rb_next(node);
+	}
+}
+
 static int test_find_delalloc(u32 sectorsize)
 {
 	struct inode *inode;
@@ -258,6 +306,8 @@ static int test_find_delalloc(u32 sectorsize)
 	}
 	ret = 0;
 out_bits:
+	if (ret)
+		dump_extent_io_tree(tmp);
 	clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
 out:
 	if (locked_page)
@@ -534,6 +584,8 @@ static int test_find_first_clear_extent_bit(void)
 
 	ret = 0;
 out:
+	if (ret)
+		dump_extent_io_tree(&tree);
 	clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
 
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 4a9e803e5b392e2b1e511d0097f6d2c7e6ea41e7 Mon Sep 17 00:00:00 2001
From: Su Yue <l@damenly.su>
Date: Mon, 27 Dec 2021 18:18:39 +0800
Subject: btrfs: remove unnecessary parameter type from
 compression_decompress_bio

btrfs_decompress_bio, the only caller of compression_decompress_bio gets
type from @cb and passes it to compression_decompress_bio.
However, compression_decompress_bio can get compression type directly
from @cb.

So remove the parameter and access it through @cb.  No functional
change.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Su Yue <l@damenly.su>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e776956d5bc9..71e5b2e9a1ba 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -96,10 +96,10 @@ static int compression_compress_pages(int type, struct list_head *ws,
 	}
 }
 
-static int compression_decompress_bio(int type, struct list_head *ws,
-		struct compressed_bio *cb)
+static int compression_decompress_bio(struct list_head *ws,
+				      struct compressed_bio *cb)
 {
-	switch (type) {
+	switch (cb->compress_type) {
 	case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
 	case BTRFS_COMPRESS_LZO:  return lzo_decompress_bio(ws, cb);
 	case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
@@ -1360,7 +1360,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
 	int type = cb->compress_type;
 
 	workspace = get_workspace(type, 0);
-	ret = compression_decompress_bio(type, workspace, cb);
+	ret = compression_decompress_bio(workspace, cb);
 	put_workspace(type, workspace);
 
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From be8d1a2ab98998d369c14c3deb600a31708a66c5 Mon Sep 17 00:00:00 2001
From: Yang Li <yang.lee@linux.alibaba.com>
Date: Mon, 20 Dec 2021 15:23:06 +0800
Subject: btrfs: fix argument list that the kdoc format and script verified

The warnings were found by running scripts/kernel-doc, which is run
when building with 'make W=1'.

fs/btrfs/extent_io.c:3210: warning: Function parameter or member
'bio_ctrl' not described in 'btrfs_bio_add_page'
fs/btrfs/extent_io.c:3210: warning: Excess function parameter 'bio'
description in 'btrfs_bio_add_page'
fs/btrfs/extent_io.c:3210: warning: Excess function parameter
'prev_bio_flags' description in 'btrfs_bio_add_page'
fs/btrfs/space-info.c:1602: warning: Excess function parameter 'root'
description in 'btrfs_reserve_metadata_bytes'
fs/btrfs/space-info.c:1602: warning: Function parameter or member
'fs_info' not described in 'btrfs_reserve_metadata_bytes'

Note: this is fixing only the warnings regarding parameter list, the
first line is not strictly conforming to the kdoc format as the btrfs
codebase does not stick to that and keeps the first line more free form
(because it's only for internal use).

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add note ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c  | 5 ++---
 fs/btrfs/space-info.c | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 73b2f1c805c0..d6d48ecf823c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3184,13 +3184,12 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
 /**
  * Attempt to add a page to bio
  *
- * @bio:	destination bio
+ * @bio_ctrl:	record both the bio, and its bio_flags
  * @page:	page to add to the bio
  * @disk_bytenr:  offset of the new bio or to check whether we are adding
  *                a contiguous page to the previous one
- * @pg_offset:	starting offset in the page
  * @size:	portion of page that we want to write
- * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
+ * @pg_offset:	starting offset in the page
  * @bio_flags:	flags of the current bio to see if we can merge them
  *
  * Attempt to add a page to bio considering stripe alignment etc.
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 79fe0ad17acf..294242c194d8 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1588,7 +1588,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
 /**
  * Trye to reserve metadata bytes from the block_rsv's space
  *
- * @root:       the root we're allocating for
+ * @fs_info:    the filesystem
  * @block_rsv:  block_rsv we're allocating for
  * @orig_bytes: number of bytes we want
  * @flush:      whether or not we can flush to make our reservation
-- 
cgit v1.2.3-59-g8ed1b


From c2f822635df873c510bda6fb7fd1b10b7c31be2d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 16 Dec 2021 15:00:32 +0000
Subject: btrfs: respect the max size in the header when activating swap file

If we extended the size of a swapfile after its header was created (by the
mkswap utility) and then try to activate it, we will map the entire file
when activating the swap file, instead of limiting to the max size defined
in the swap file's header.

Currently test case generic/643 from fstests fails because we do not
respect that size limit defined in the swap file's header.

So fix this by not mapping file ranges beyond the max size defined in the
swap header.

This is the same type of bug that iomap used to have, and was fixed in
commit 36ca7943ac18ae ("mm/swap: consider max pages in
iomap_swapfile_add_extent").

Fixes: ed46ff3d423780 ("Btrfs: support swap files")
CC: stable@vger.kernel.org # 5.4+
Reviewed-and-tested-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a88130c7782e..3b2403b6127f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -10269,9 +10269,19 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
 				 struct btrfs_swap_info *bsi)
 {
 	unsigned long nr_pages;
+	unsigned long max_pages;
 	u64 first_ppage, first_ppage_reported, next_ppage;
 	int ret;
 
+	/*
+	 * Our swapfile may have had its size extended after the swap header was
+	 * written. In that case activating the swapfile should not go beyond
+	 * the max size set in the swap header.
+	 */
+	if (bsi->nr_pages >= sis->max)
+		return 0;
+
+	max_pages = sis->max - bsi->nr_pages;
 	first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
 	next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
 				PAGE_SIZE) >> PAGE_SHIFT;
@@ -10279,6 +10289,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
 	if (first_ppage >= next_ppage)
 		return 0;
 	nr_pages = next_ppage - first_ppage;
+	nr_pages = min(nr_pages, max_pages);
 
 	first_ppage_reported = first_ppage;
 	if (bsi->start == 0)
-- 
cgit v1.2.3-59-g8ed1b


From 36c86a9e1be3b29f9f075a946df55dfe1d818019 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 16 Dec 2021 19:47:36 +0800
Subject: btrfs: output more debug messages for uncommitted transaction

Print extra information about how many dirty metadata bytes an
uncommitted transaction still has at the very end of close_ctree().

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41dea24e2681..87a5addbedf6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4483,6 +4483,48 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
 	return btrfs_commit_transaction(trans);
 }
 
+static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_transaction *trans;
+	struct btrfs_transaction *tmp;
+	bool found = false;
+
+	if (list_empty(&fs_info->trans_list))
+		return;
+
+	/*
+	 * This function is only called at the very end of close_ctree(),
+	 * thus no other running transaction, no need to take trans_lock.
+	 */
+	ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
+	list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
+		struct extent_state *cached = NULL;
+		u64 dirty_bytes = 0;
+		u64 cur = 0;
+		u64 found_start;
+		u64 found_end;
+
+		found = true;
+		while (!find_first_extent_bit(&trans->dirty_pages, cur,
+			&found_start, &found_end, EXTENT_DIRTY, &cached)) {
+			dirty_bytes += found_end + 1 - found_start;
+			cur = found_end + 1;
+		}
+		btrfs_warn(fs_info,
+	"transaction %llu (with %llu dirty metadata bytes) is not committed",
+			   trans->transid, dirty_bytes);
+		btrfs_cleanup_one_transaction(trans, fs_info);
+
+		if (trans == fs_info->running_transaction)
+			fs_info->running_transaction = NULL;
+		list_del_init(&trans->list);
+
+		btrfs_put_transaction(trans);
+		trace_btrfs_transaction_commit(fs_info);
+	}
+	ASSERT(!found);
+}
+
 void __cold close_ctree(struct btrfs_fs_info *fs_info)
 {
 	int ret;
@@ -4591,7 +4633,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	btrfs_stop_all_workers(fs_info);
 
 	/* We shouldn't have any transaction open at this point */
-	ASSERT(list_empty(&fs_info->trans_list));
+	warn_about_uncommitted_trans(fs_info);
 
 	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
 	free_root_pointers(fs_info, true);
-- 
cgit v1.2.3-59-g8ed1b