From f15fb2cd979a07fbfc666e2f04b8b30ec9233b2a Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 10 Oct 2022 18:36:06 +0800
Subject: btrfs: raid56: properly handle the error when unable to find the
 missing stripe

In raid56_alloc_missing_rbio(), if we can not determine where the
missing device is inside the full stripe, we just BUG_ON().

This is not necessary especially the only caller inside scrub.c is
already properly checking the return value, and will treat it as a
memory allocation failure.

Fix the error handling by:

- Add an extra warning for the reason
  Although personally speaking it may be better to be an ASSERT().

- Properly free the allocated rbio

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index f6395e8288d6..892005f756cf 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2742,8 +2742,10 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
 
 	rbio->faila = find_logical_bio_stripe(rbio, bio);
 	if (rbio->faila == -1) {
-		BUG();
-		kfree(rbio);
+		btrfs_warn_rl(fs_info,
+	"can not determine the failed stripe number for full stripe %llu",
+			      bioc->raid_map[0]);
+		__free_raid_bio(rbio);
 		return NULL;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From ab4c54c643a01067669df8332b64e3f31b69e071 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 10 Oct 2022 18:36:07 +0800
Subject: btrfs: raid56: avoid double freeing for rbio if full_stripe_write()
 failed

Currently if full_stripe_write() failed to allocate the pages for
parity, it will call __free_raid_bio() first, then return -ENOMEM.

But some caller of full_stripe_write() will also call __free_raid_bio()
again, this would cause double freeing.

And it's not a logically sound either, normally we should either free
the memory at the same level where we allocated it, or let endio to
handle everything.

So this patch will solve the double freeing by make
raid56_parity_write() to handle the error and free the rbio.

Just like what we do in raid56_parity_recover().

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 892005f756cf..82c8e991300e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1632,10 +1632,8 @@ static int full_stripe_write(struct btrfs_raid_bio *rbio)
 	int ret;
 
 	ret = alloc_rbio_parity_pages(rbio);
-	if (ret) {
-		__free_raid_bio(rbio);
+	if (ret)
 		return ret;
-	}
 
 	ret = lock_stripe_add(rbio);
 	if (ret == 0)
@@ -1823,8 +1821,10 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
 	 */
 	if (rbio_is_full(rbio)) {
 		ret = full_stripe_write(rbio);
-		if (ret)
+		if (ret) {
+			__free_raid_bio(rbio);
 			goto fail;
+		}
 		return;
 	}
 
@@ -1838,8 +1838,10 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
 		list_add_tail(&rbio->plug_list, &plug->rbio_list);
 	} else {
 		ret = __raid56_parity_write(rbio);
-		if (ret)
+		if (ret) {
+			__free_raid_bio(rbio);
 			goto fail;
+		}
 	}
 
 	return;
-- 
cgit v1.2.3-59-g8ed1b


From ae0e5df4d1a4a2694c9c203cc25334aaaf9f2dfa Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 11 Oct 2022 12:02:31 +0200
Subject: btrfs: reorder btrfs_bio for better packing

After changes in commit 917f32a23501 ("btrfs: give struct btrfs_bio a
real end_io handler") the layout of btrfs_bio can be improved.  There
are two holes and the structure size is 264 bytes on release build. By
reordering the iterator we can get rid of the holes and the size is 256
bytes which fits to slabs much better.

Final layout:

struct btrfs_bio {
	unsigned int               mirror_num;           /*     0     4 */
	struct bvec_iter           iter;                 /*     4    20 */
	u64                        file_offset;          /*    24     8 */
	struct btrfs_device *      device;               /*    32     8 */
	u8 *                       csum;                 /*    40     8 */
	u8                         csum_inline[64];      /*    48    64 */
	/* --- cacheline 1 boundary (64 bytes) was 48 bytes ago --- */
	btrfs_bio_end_io_t         end_io;               /*   112     8 */
	void *                     private;              /*   120     8 */
	/* --- cacheline 2 boundary (128 bytes) --- */
	struct work_struct         end_io_work;          /*   128    32 */
	struct bio                 bio;                  /*   160    96 */

	/* size: 256, cachelines: 4, members: 10 */
};

Fixes: 917f32a23501 ("btrfs: give struct btrfs_bio a real end_io handler")
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 599b9d5af349..f8b668dc8bf8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -395,6 +395,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
  */
 struct btrfs_bio {
 	unsigned int mirror_num;
+	struct bvec_iter iter;
 
 	/* for direct I/O */
 	u64 file_offset;
@@ -403,7 +404,6 @@ struct btrfs_bio {
 	struct btrfs_device *device;
 	u8 *csum;
 	u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
-	struct bvec_iter iter;
 
 	/* End I/O information supplied to btrfs_bio_alloc */
 	btrfs_bio_end_io_t end_io;
-- 
cgit v1.2.3-59-g8ed1b


From 968b71583130b6104c9f33ba60446d598e327a8b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 14 Oct 2022 08:52:46 -0400
Subject: btrfs: fix tree mod log mishandling of reallocated nodes

We have been seeing the following panic in production

  kernel BUG at fs/btrfs/tree-mod-log.c:677!
  invalid opcode: 0000 [#1] SMP
  RIP: 0010:tree_mod_log_rewind+0x1b4/0x200
  RSP: 0000:ffffc9002c02f890 EFLAGS: 00010293
  RAX: 0000000000000003 RBX: ffff8882b448c700 RCX: 0000000000000000
  RDX: 0000000000008000 RSI: 00000000000000a7 RDI: ffff88877d831c00
  RBP: 0000000000000002 R08: 000000000000009f R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000100c40 R12: 0000000000000001
  R13: ffff8886c26d6a00 R14: ffff88829f5424f8 R15: ffff88877d831a00
  FS:  00007fee1d80c780(0000) GS:ffff8890400c0000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007fee1963a020 CR3: 0000000434f33002 CR4: 00000000007706e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  PKRU: 55555554
  Call Trace:
   btrfs_get_old_root+0x12b/0x420
   btrfs_search_old_slot+0x64/0x2f0
   ? tree_mod_log_oldest_root+0x3d/0xf0
   resolve_indirect_ref+0xfd/0x660
   ? ulist_alloc+0x31/0x60
   ? kmem_cache_alloc_trace+0x114/0x2c0
   find_parent_nodes+0x97a/0x17e0
   ? ulist_alloc+0x30/0x60
   btrfs_find_all_roots_safe+0x97/0x150
   iterate_extent_inodes+0x154/0x370
   ? btrfs_search_path_in_tree+0x240/0x240
   iterate_inodes_from_logical+0x98/0xd0
   ? btrfs_search_path_in_tree+0x240/0x240
   btrfs_ioctl_logical_to_ino+0xd9/0x180
   btrfs_ioctl+0xe2/0x2ec0
   ? __mod_memcg_lruvec_state+0x3d/0x280
   ? do_sys_openat2+0x6d/0x140
   ? kretprobe_dispatcher+0x47/0x70
   ? kretprobe_rethook_handler+0x38/0x50
   ? rethook_trampoline_handler+0x82/0x140
   ? arch_rethook_trampoline_callback+0x3b/0x50
   ? kmem_cache_free+0xfb/0x270
   ? do_sys_openat2+0xd5/0x140
   __x64_sys_ioctl+0x71/0xb0
   do_syscall_64+0x2d/0x40

Which is this code in tree_mod_log_rewind()

	switch (tm->op) {
        case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
		BUG_ON(tm->slot < n);

This occurs because we replay the nodes in order that they happened, and
when we do a REPLACE we will log a REMOVE_WHILE_FREEING for every slot,
starting at 0.  'n' here is the number of items in this block, which in
this case was 1, but we had 2 REMOVE_WHILE_FREEING operations.

The actual root cause of this was that we were replaying operations for
a block that shouldn't have been replayed.  Consider the following
sequence of events

1. We have an already modified root, and we do a btrfs_get_tree_mod_seq().
2. We begin removing items from this root, triggering KEY_REPLACE for
   it's child slots.
3. We remove one of the 2 children this root node points to, thus triggering
   the root node promotion of the remaining child, and freeing this node.
4. We modify a new root, and re-allocate the above node to the root node of
   this other root.

The tree mod log looks something like this

	logical 0	op KEY_REPLACE (slot 1)			seq 2
	logical 0	op KEY_REMOVE (slot 1)			seq 3
	logical 0	op KEY_REMOVE_WHILE_FREEING (slot 0)	seq 4
	logical 4096	op LOG_ROOT_REPLACE (old logical 0)	seq 5
	logical 8192	op KEY_REMOVE_WHILE_FREEING (slot 1)	seq 6
	logical 8192	op KEY_REMOVE_WHILE_FREEING (slot 0)	seq 7
	logical 0	op LOG_ROOT_REPLACE (old logical 8192)	seq 8

>From here the bug is triggered by the following steps

1.  Call btrfs_get_old_root() on the new_root.
2.  We call tree_mod_log_oldest_root(btrfs_root_node(new_root)), which is
    currently logical 0.
3.  tree_mod_log_oldest_root() calls tree_mod_log_search_oldest(), which
    gives us the KEY_REPLACE seq 2, and since that's not a
    LOG_ROOT_REPLACE we incorrectly believe that we don't have an old
    root, because we expect that the most recent change should be a
    LOG_ROOT_REPLACE.
4.  Back in tree_mod_log_oldest_root() we don't have a LOG_ROOT_REPLACE,
    so we don't set old_root, we simply use our existing extent buffer.
5.  Since we're using our existing extent buffer (logical 0) we call
    tree_mod_log_search(0) in order to get the newest change to start the
    rewind from, which ends up being the LOG_ROOT_REPLACE at seq 8.
6.  Again since we didn't find an old_root we simply clone logical 0 at
    it's current state.
7.  We call tree_mod_log_rewind() with the cloned extent buffer.
8.  Set n = btrfs_header_nritems(logical 0), which would be whatever the
    original nritems was when we COWed the original root, say for this
    example it's 2.
9.  We start from the newest operation and work our way forward, so we
    see LOG_ROOT_REPLACE which we ignore.
10. Next we see KEY_REMOVE_WHILE_FREEING for slot 0, which triggers the
    BUG_ON(tm->slot < n), because it expects if we've done this we have a
    completely empty extent buffer to replay completely.

The correct thing would be to find the first LOG_ROOT_REPLACE, and then
get the old_root set to logical 8192.  In fact making that change fixes
this particular problem.

However consider the much more complicated case.  We have a child node
in this tree and the above situation.  In the above case we freed one
of the child blocks at the seq 3 operation.  If this block was also
re-allocated and got new tree mod log operations we would have a
different problem.  btrfs_search_old_slot(orig root) would get down to
the logical 0 root that still pointed at that node.  However in
btrfs_search_old_slot() we call tree_mod_log_rewind(buf) directly.  This
is not context aware enough to know which operations we should be
replaying.  If the block was re-allocated multiple times we may only
want to replay a range of operations, and determining what that range is
isn't possible to determine.

We could maybe solve this by keeping track of which root the node
belonged to at every tree mod log operation, and then passing this
around to make sure we're only replaying operations that relate to the
root we're trying to rewind.

However there's a simpler way to solve this problem, simply disallow
reallocations if we have currently running tree mod log users.  We
already do this for leaf's, so we're simply expanding this to nodes as
well.  This is a relatively uncommon occurrence, and the problem is
complicated enough I'm worried that we will still have corner cases in
the reallocation case.  So fix this in the most straightforward way
possible.

Fixes: bd989ba359f2 ("Btrfs: add tree modification log functions")
CC: stable@vger.kernel.org # 3.3+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cd2d36580f1a..2801c991814f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3295,21 +3295,22 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 		}
 
 		/*
-		 * If this is a leaf and there are tree mod log users, we may
-		 * have recorded mod log operations that point to this leaf.
-		 * So we must make sure no one reuses this leaf's extent before
-		 * mod log operations are applied to a node, otherwise after
-		 * rewinding a node using the mod log operations we get an
-		 * inconsistent btree, as the leaf's extent may now be used as
-		 * a node or leaf for another different btree.
+		 * If there are tree mod log users we may have recorded mod log
+		 * operations for this node.  If we re-allocate this node we
+		 * could replay operations on this node that happened when it
+		 * existed in a completely different root.  For example if it
+		 * was part of root A, then was reallocated to root B, and we
+		 * are doing a btrfs_old_search_slot(root b), we could replay
+		 * operations that happened when the block was part of root A,
+		 * giving us an inconsistent view of the btree.
+		 *
 		 * We are safe from races here because at this point no other
 		 * node or root points to this extent buffer, so if after this
-		 * check a new tree mod log user joins, it will not be able to
-		 * find a node pointing to this leaf and record operations that
-		 * point to this leaf.
+		 * check a new tree mod log user joins we will not have an
+		 * existing log of operations on this node that we have to
+		 * contend with.
 		 */
-		if (btrfs_header_level(buf) == 0 &&
-		    test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+		if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
 			must_pin = true;
 
 		if (must_pin || btrfs_is_zoned(fs_info)) {
-- 
cgit v1.2.3-59-g8ed1b


From 3d17adea74a56a4965f7a603d8ed8c66bb9356d9 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 18 Oct 2022 09:56:38 +0800
Subject: btrfs: make thaw time super block check to also verify checksum

Previous commit a05d3c915314 ("btrfs: check superblock to ensure the fs
was not modified at thaw time") only checks the content of the super
block, but it doesn't really check if the on-disk super block has a
matching checksum.

This patch will add the checksum verification to thaw time superblock
verification.

This involves the following extra changes:

- Export btrfs_check_super_csum()
  As we need to call it in super.c.

- Change the argument list of btrfs_check_super_csum()
  Instead of passing a char *, directly pass struct btrfs_super_block *
  pointer.

- Verify that our checksum type didn't change before checking the
  checksum value, like it's done at mount time

Fixes: a05d3c915314 ("btrfs: check superblock to ensure the fs was not modified at thaw time")
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 10 ++++------
 fs/btrfs/disk-io.h |  2 ++
 fs/btrfs/super.c   | 16 ++++++++++++++++
 3 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a2da9313c694..4b28263c3d32 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -166,11 +166,9 @@ static bool btrfs_supported_super_csum(u16 csum_type)
  * Return 0 if the superblock checksum type matches the checksum value of that
  * algorithm. Pass the raw disk superblock data.
  */
-static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
-				  char *raw_disk_sb)
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+			   const struct btrfs_super_block *disk_sb)
 {
-	struct btrfs_super_block *disk_sb =
-		(struct btrfs_super_block *)raw_disk_sb;
 	char result[BTRFS_CSUM_SIZE];
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 
@@ -181,7 +179,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
 	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
 	 * filled with zeros and is included in the checksum.
 	 */
-	crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
+	crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
 			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
 
 	if (memcmp(disk_sb->csum, result, fs_info->csum_size))
@@ -3479,7 +3477,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	 * We want to check superblock checksum, the type is stored inside.
 	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
 	 */
-	if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
+	if (btrfs_check_super_csum(fs_info, disk_super)) {
 		btrfs_err(fs_info, "superblock checksum mismatch");
 		err = -EINVAL;
 		btrfs_release_disk_super(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c67c15d4d20b..9fa923e005a3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -42,6 +42,8 @@ struct extent_buffer *btrfs_find_create_tree_block(
 void btrfs_clean_tree_block(struct extent_buffer *buf);
 void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
 int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+			   const struct btrfs_super_block *disk_sb);
 int __cold open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9be4fd2db0f4..5942b9384088 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2555,6 +2555,7 @@ static int check_dev_super(struct btrfs_device *dev)
 {
 	struct btrfs_fs_info *fs_info = dev->fs_info;
 	struct btrfs_super_block *sb;
+	u16 csum_type;
 	int ret = 0;
 
 	/* This should be called with fs still frozen. */
@@ -2569,6 +2570,21 @@ static int check_dev_super(struct btrfs_device *dev)
 	if (IS_ERR(sb))
 		return PTR_ERR(sb);
 
+	/* Verify the checksum. */
+	csum_type = btrfs_super_csum_type(sb);
+	if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) {
+		btrfs_err(fs_info, "csum type changed, has %u expect %u",
+			  csum_type, btrfs_super_csum_type(fs_info->super_copy));
+		ret = -EUCLEAN;
+		goto out;
+	}
+
+	if (btrfs_check_super_csum(fs_info, sb)) {
+		btrfs_err(fs_info, "csum for on-disk super block no longer matches");
+		ret = -EUCLEAN;
+		goto out;
+	}
+
 	/* Btrfs_validate_super() includes fsid check against super->fsid. */
 	ret = btrfs_validate_super(fs_info, sb, 0);
 	if (ret < 0)
-- 
cgit v1.2.3-59-g8ed1b


From 9b8be45f1ef29081c4b614aa559f934526e70d16 Mon Sep 17 00:00:00 2001
From: BingJing Chang <bingjingc@synology.com>
Date: Sun, 16 Oct 2022 23:33:46 +0800
Subject: btrfs: send: fix send failure of a subcase of orphan inodes

Commit 9ed0a72e5b35 ("btrfs: send: fix failures when processing inodes with
no links") tries to fix all incremental send cases of orphan inodes the
send operation will meet. However, there's still a bug causing the corner
subcase fails with a ENOENT error.

Here's shortened steps of that subcase:

  $ btrfs subvolume create vol
  $ touch vol/foo

  $ btrfs subvolume snapshot -r vol snap1
  $ btrfs subvolume snapshot -r vol snap2

  # Turn the second snapshot to RW mode and delete the file while
  # holding an open file descriptor on it
  $ btrfs property set snap2 ro false
  $ exec 73<snap2/foo
  $ rm snap2/foo

  # Set the second snapshot back to RO mode and do an incremental send
  # with an unusal reverse order
  $ btrfs property set snap2 ro true
  $ btrfs send -p snap2 snap1 > /dev/null
  At subvol snap1
  ERROR: send ioctl failed with -2: No such file or directory

It's subcase 3 of BTRFS_COMPARE_TREE_CHANGED in the commit 9ed0a72e5b35
("btrfs: send: fix failures when processing inodes with no links"). And
it's not a common case. We still have not met it in the real world.
Theoretically, this case can happen in a batch cascading snapshot backup.
In cascading backups, the receive operation in the middle may cause orphan
inodes to appear because of the open file descriptors on the snapshot files
during receiving. And if we don't do the batch snapshot backups in their
creation order, then we can have an inode, which is an orphan in the parent
snapshot but refers to a file in the send snapshot. Since an orphan inode
has no paths, the send operation will fail with a ENOENT error if it
tries to generate a path for it.

In that patch, this subcase will be treated as an inode with a new
generation. However, when the routine tries to delete the old paths in
the parent snapshot, the function process_all_refs() doesn't check whether
there are paths recorded or not before it calls the function
process_recorded_refs(). And the function process_recorded_refs() try
to get the first path in the parent snapshot in the beginning. Since it has
no paths in the parent snapshot, the send operation fails.

To fix this, we can easily put a link count check to avoid entering the
deletion routine like what we do a link count check to avoid creating a
new one. Moreover, we can assume that the function process_all_refs()
can always collect references to process because we know it has a
positive link count.

Fixes: 9ed0a72e5b35 ("btrfs: send: fix failures when processing inodes with no links")
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: BingJing Chang <bingjingc@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ec6e1752af2c..145c84b44fd0 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6668,17 +6668,19 @@ static int changed_inode(struct send_ctx *sctx,
 			/*
 			 * First, process the inode as if it was deleted.
 			 */
-			sctx->cur_inode_gen = right_gen;
-			sctx->cur_inode_new = false;
-			sctx->cur_inode_deleted = true;
-			sctx->cur_inode_size = btrfs_inode_size(
-					sctx->right_path->nodes[0], right_ii);
-			sctx->cur_inode_mode = btrfs_inode_mode(
-					sctx->right_path->nodes[0], right_ii);
-			ret = process_all_refs(sctx,
-					BTRFS_COMPARE_TREE_DELETED);
-			if (ret < 0)
-				goto out;
+			if (old_nlinks > 0) {
+				sctx->cur_inode_gen = right_gen;
+				sctx->cur_inode_new = false;
+				sctx->cur_inode_deleted = true;
+				sctx->cur_inode_size = btrfs_inode_size(
+						sctx->right_path->nodes[0], right_ii);
+				sctx->cur_inode_mode = btrfs_inode_mode(
+						sctx->right_path->nodes[0], right_ii);
+				ret = process_all_refs(sctx,
+						BTRFS_COMPARE_TREE_DELETED);
+				if (ret < 0)
+					goto out;
+			}
 
 			/*
 			 * Now process the inode as if it was new.
-- 
cgit v1.2.3-59-g8ed1b


From 2398091f9c2c8e0040f4f9928666787a3e8108a7 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 18 Oct 2022 16:05:52 +0200
Subject: btrfs: fix type of parameter generation in btrfs_get_dentry

The type of parameter generation has been u32 since the beginning,
however all callers pass a u64 generation, so unify the types to prevent
potential loss.

CC: stable@vger.kernel.org # 4.9+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/export.c | 2 +-
 fs/btrfs/export.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1d4c2397d0d6..fab7eb76e53b 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -58,7 +58,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
 }
 
 struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
-				u64 root_objectid, u32 generation,
+				u64 root_objectid, u64 generation,
 				int check_generation)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index f32f4113c976..5afb7ca42828 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -19,7 +19,7 @@ struct btrfs_fid {
 } __attribute__ ((packed));
 
 struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
-				u64 root_objectid, u32 generation,
+				u64 root_objectid, u64 generation,
 				int check_generation);
 struct dentry *btrfs_get_parent(struct dentry *child);
 
-- 
cgit v1.2.3-59-g8ed1b


From 76a66ba101329316a5d7f4275070be22eb85fdf2 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 21 Oct 2022 08:43:45 +0800
Subject: btrfs: don't use btrfs_chunk::sub_stripes from disk

[BUG]
There are two reports (the earliest one from LKP, a more recent one from
kernel bugzilla) that we can have some chunks with 0 as sub_stripes.

This will cause divide-by-zero errors at btrfs_rmap_block, which is
introduced by a recent kernel patch ac0677348f3c ("btrfs: merge
calculations for simple striped profiles in btrfs_rmap_block"):

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			stripe_nr = div_u64(stripe_nr, map->sub_stripes); <<<
		}

[CAUSE]
From the more recent report, it has been proven that we have some chunks
with 0 as sub_stripes, mostly caused by older mkfs.

It turns out that the mkfs.btrfs fix is only introduced in 6718ab4d33aa
("btrfs-progs: Initialize sub_stripes to 1 in btrfs_alloc_data_chunk")
which is included in v5.4 btrfs-progs release.

So there would be quite some old filesystems with such 0 sub_stripes.

[FIX]
Just don't trust the sub_stripes values from disk.

We have a trusted btrfs_raid_array[] to fetch the correct sub_stripes
numbers for each profile and that are fixed.

By this, we can keep the compatibility with older filesystems while
still avoid divide-by-zero bugs.

Reported-by: kernel test robot <oliver.sang@intel.com>
Reported-by: Viktor Kuzmin <kvaster@gmail.com>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216559
Fixes: ac0677348f3c ("btrfs: merge calculations for simple striped profiles in btrfs_rmap_block")
CC: stable@vger.kernel.org # 6.0
Reviewed-by: Su Yue <glass@fydeos.io>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 94ba46d57920..a8d4bc6a1937 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7142,6 +7142,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
 	u64 devid;
 	u64 type;
 	u8 uuid[BTRFS_UUID_SIZE];
+	int index;
 	int num_stripes;
 	int ret;
 	int i;
@@ -7149,6 +7150,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
 	logical = key->offset;
 	length = btrfs_chunk_length(leaf, chunk);
 	type = btrfs_chunk_type(leaf, chunk);
+	index = btrfs_bg_flags_to_raid_index(type);
 	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 
 #if BITS_PER_LONG == 32
@@ -7202,7 +7204,15 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
 	map->io_align = btrfs_chunk_io_align(leaf, chunk);
 	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 	map->type = type;
-	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+	/*
+	 * We can't use the sub_stripes value, as for profiles other than
+	 * RAID10, they may have 0 as sub_stripes for filesystems created by
+	 * older mkfs (<v5.4).
+	 * In that case, it can cause divide-by-zero errors later.
+	 * Since currently sub_stripes is fixed for each profile, let's
+	 * use the trusted value instead.
+	 */
+	map->sub_stripes = btrfs_raid_array[index].sub_stripes;
 	map->verified_stripes = 0;
 	em->orig_block_len = btrfs_calc_stripe_length(em);
 	for (i = 0; i < num_stripes; i++) {
-- 
cgit v1.2.3-59-g8ed1b