From 2b90883c561ddcc641741c2e4df1f702a4f2acb8 Mon Sep 17 00:00:00 2001
From: Johnny Chang <johnnyc@synology.com>
Date: Fri, 26 Apr 2019 11:01:05 +0800
Subject: btrfs: Check the compression level before getting a workspace

When a file's compression property is set as zlib or zstd but leave
the compression mount option not be set, that means btrfs will try
to compress the file with default compression level. But in
btrfs_compress_pages(), it calls get_workspace() with level = 0.
This will return a workspace with a wrong compression level.
For zlib, the compression level in the workspace will be 0
(that means "store only"). And for zstd, the compression in the
workspace will be 1, not the default level 3.

How to reproduce:
  mkfs -t btrfs /dev/sdb
  mount /dev/sdb /mnt/
  mkdir /mnt/zlib
  btrfs property set /mnt/zlib/ compression zlib
  dd if=/dev/zero of=/mnt/zlib/compression-friendly-file-10M bs=1M count=10
  sync
  btrfs-debugfs -f /mnt/zlib/compression-friendly-file-10M

btrfs-debugfs output:
* before:
  ...
  (258 9961472): ram 524288 disk 1106247680 disk_size 524288
  file: ... extents 20 disk size 10485760 logical size 10485760 ratio 1.00

* after:
 ...
 (258 10354688): ram 131072 disk 14217216 disk_size 4096
 file: ... extents 80 disk size 327680 logical size 10485760 ratio 32.00

The steps for zstd are similar, but need to put a debugging message to
show the level of the return workspace in zstd_get_workspace().

This commit adds a check of the compression level before getting a
workspace by set_level().

CC: stable@vger.kernel.org # 5.1+
Signed-off-by: Johnny Chang <johnnyc@synology.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1463e14af2fb..4ec1df369e47 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1009,6 +1009,7 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
 	struct list_head *workspace;
 	int ret;
 
+	level = btrfs_compress_op[type]->set_level(level);
 	workspace = get_workspace(type, level);
 	ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
 						      start, pages,
-- 
cgit v1.2.3-59-g8ed1b


From 8fca955057b9c58467d1b231e43f19c4cf26ae8c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 May 2019 11:10:06 -0400
Subject: btrfs: don't double unlock on error in btrfs_punch_hole

If we have an error writing out a delalloc range in
btrfs_punch_hole_lock_range we'll unlock the inode and then goto
out_only_mutex, where we will again unlock the inode.  This is bad,
don't do this.

Fixes: f27451f22996 ("Btrfs: add support for fallocate's zero range operation")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7e85dca0e6f2..9dbea72a61fe 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2554,10 +2554,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
 	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
 					  &cached_state);
-	if (ret) {
-		inode_unlock(inode);
+	if (ret)
 		goto out_only_mutex;
-	}
 
 	path = btrfs_alloc_path();
 	if (!path) {
-- 
cgit v1.2.3-59-g8ed1b


From d7400ee1b476f201f8fb4264887d18bdb23ee352 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 26 Apr 2019 16:25:50 +0200
Subject: btrfs: use the existing reserved items for our first prop for
 inheritance

We're now reserving an extra items worth of space for property
inheritance.  We only have one property at the moment so this covers us,
but if we add more in the future this will allow us to not get bitten by
the extra space reservation.  If we do add more properties in the future
we should re-visit how we calculate the space reservation needs by the
callers.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ refreshed on top of prop/xattr cleanups ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/props.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index ca2716917e37..a9e2e66152ee 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -332,6 +332,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 	int i;
+	bool need_reserve = false;
 
 	if (!test_bit(BTRFS_INODE_HAS_PROPS,
 		      &BTRFS_I(parent)->runtime_flags))
@@ -357,11 +358,20 @@ static int inherit_props(struct btrfs_trans_handle *trans,
 		if (ret)
 			continue;
 
-		num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-		ret = btrfs_block_rsv_add(root, trans->block_rsv,
-					  num_bytes, BTRFS_RESERVE_NO_FLUSH);
-		if (ret)
-			return ret;
+		/*
+		 * Currently callers should be reserving 1 item for properties,
+		 * since we only have 1 property that we currently support.  If
+		 * we add more in the future we need to try and reserve more
+		 * space for them.  But we should also revisit how we do space
+		 * reservations if we do add more properties in the future.
+		 */
+		if (need_reserve) {
+			num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+			ret = btrfs_block_rsv_add(root, trans->block_rsv,
+					num_bytes, BTRFS_RESERVE_NO_FLUSH);
+			if (ret)
+				return ret;
+		}
 
 		ret = btrfs_setxattr(trans, inode, h->xattr_name, value,
 				     strlen(value), 0);
@@ -375,9 +385,13 @@ static int inherit_props(struct btrfs_trans_handle *trans,
 					&BTRFS_I(inode)->runtime_flags);
 		}
 
-		btrfs_block_rsv_release(fs_info, trans->block_rsv, num_bytes);
-		if (ret)
-			return ret;
+		if (need_reserve) {
+			btrfs_block_rsv_release(fs_info, trans->block_rsv,
+					num_bytes);
+			if (ret)
+				return ret;
+		}
+		need_reserve = true;
 	}
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 72bd2323ec87722c115a5906bc6a1b31d11e8f54 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 29 Apr 2019 13:08:14 +0100
Subject: Btrfs: do not abort transaction at btrfs_update_root() after failure
 to COW path

Currently when we fail to COW a path at btrfs_update_root() we end up
always aborting the transaction. However all the current callers of
btrfs_update_root() are able to deal with errors returned from it, many do
end up aborting the transaction themselves (directly or not, such as the
transaction commit path), other BUG_ON() or just gracefully cancel whatever
they were doing.

When syncing the fsync log, we call btrfs_update_root() through
tree-log.c:update_log_root(), and if it returns an -ENOSPC error, the log
sync code does not abort the transaction, instead it gracefully handles
the error and returns -EAGAIN to the fsync handler, so that it falls back
to a transaction commit. Any other error different from -ENOSPC, makes the
log sync code abort the transaction.

So remove the transaction abort from btrfs_update_log() when we fail to
COW a path to update the root item, so that if an -ENOSPC failure happens
we avoid aborting the current transaction and have a chance of the fsync
succeeding after falling back to a transaction commit.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=203413
Fixes: 79787eaab46121 ("btrfs: replace many BUG_ONs with proper error handling")
Cc: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/root-tree.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 1b9a5d0de139..22124122728c 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -132,10 +132,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		return -ENOMEM;
 
 	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-	if (ret < 0) {
-		btrfs_abort_transaction(trans, ret);
+	if (ret < 0)
 		goto out;
-	}
 
 	if (ret > 0) {
 		btrfs_crit(fs_info,
-- 
cgit v1.2.3-59-g8ed1b


From 450ff8348808a89cc27436771aa05c2b90c0eef1 Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding" <tobin@kernel.org>
Date: Mon, 13 May 2019 13:39:11 +1000
Subject: btrfs: sysfs: Fix error path kobject memory leak

If a call to kobject_init_and_add() fails we must call kobject_put()
otherwise we leak memory.

Calling kobject_put() when kobject_init_and_add() fails drops the
refcount back to 0 and calls the ktype release method (which in turn
calls the percpu destroy and kfree).

Add call to kobject_put() in the error path of call to
kobject_init_and_add().

Cc: stable@vger.kernel.org # v4.4+
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Tobin C. Harding <tobin@kernel.org>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f79e477a378e..9bcb3570750e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3882,8 +3882,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
 				    info->space_info_kobj, "%s",
 				    alloc_name(space_info->flags));
 	if (ret) {
-		percpu_counter_destroy(&space_info->total_bytes_pinned);
-		kfree(space_info);
+		kobject_put(&space_info->kobj);
 		return ret;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From e32773357d5cc271b1d23550b3ed026eb5c2a468 Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding" <tobin@kernel.org>
Date: Mon, 13 May 2019 13:39:12 +1000
Subject: btrfs: sysfs: don't leak memory when failing add fsid

A failed call to kobject_init_and_add() must be followed by a call to
kobject_put().  Currently in the error path when adding fs_devices we
are missing this call.  This could be fixed by calling
btrfs_sysfs_remove_fsid() if btrfs_sysfs_add_fsid() returns an error or
by adding a call to kobject_put() directly in btrfs_sysfs_add_fsid().
Here we choose the second option because it prevents the slightly
unusual error path handling requirements of kobject from leaking out
into btrfs functions.

Add a call to kobject_put() in the error path of kobject_add_and_init().
This causes the release method to be called if kobject_init_and_add()
fails.  open_tree() is the function that calls btrfs_sysfs_add_fsid()
and the error code in this function is already written with the
assumption that the release method is called during the error path of
open_tree() (as seen by the call to btrfs_sysfs_remove_fsid() under the
fail_fsdev_sysfs label).

Cc: stable@vger.kernel.org # v4.4+
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Tobin C. Harding <tobin@kernel.org>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/sysfs.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5a5930e3d32b..2f078b77fe14 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -825,7 +825,12 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
 	fs_devs->fsid_kobj.kset = btrfs_kset;
 	error = kobject_init_and_add(&fs_devs->fsid_kobj,
 				&btrfs_ktype, parent, "%pU", fs_devs->fsid);
-	return error;
+	if (error) {
+		kobject_put(&fs_devs->fsid_kobj);
+		return error;
+	}
+
+	return 0;
 }
 
 int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
-- 
cgit v1.2.3-59-g8ed1b


From 14ae4ec1ee1466b701e0518f9acbb0bbd8ab0684 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 10 May 2019 12:45:05 +0800
Subject: btrfs: extent-tree: Fix a bug that btrfs is unable to add pinned
 bytes

Commit ddf30cf03fb5 ("btrfs: extent-tree: Use btrfs_ref to refactor
add_pinned_bytes()") refactored add_pinned_bytes(), but during that
refactor, there are two callers which add the pinned bytes instead
of subtracting.

That refactor misses those two caller, causing incorrect pinned bytes
calculation and resulting unexpected ENOSPC error.

Fix it by adding a new parameter @sign to restore the original behavior.

Reported-by: kernel test robot <rong.a.chen@intel.com>
Fixes: ddf30cf03fb5 ("btrfs: extent-tree: Use btrfs_ref to refactor add_pinned_bytes()")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9bcb3570750e..1aee51a9f3bf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -757,12 +757,14 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 }
 
 static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
-			     struct btrfs_ref *ref)
+			     struct btrfs_ref *ref, int sign)
 {
 	struct btrfs_space_info *space_info;
-	s64 num_bytes = -ref->len;
+	s64 num_bytes;
 	u64 flags;
 
+	ASSERT(sign == 1 || sign == -1);
+	num_bytes = sign * ref->len;
 	if (ref->type == BTRFS_REF_METADATA) {
 		if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
 			flags = BTRFS_BLOCK_GROUP_SYSTEM;
@@ -2063,7 +2065,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_ref_tree_mod(fs_info, generic_ref);
 
 	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
-		add_pinned_bytes(fs_info, generic_ref);
+		add_pinned_bytes(fs_info, generic_ref, -1);
 
 	return ret;
 }
@@ -7189,7 +7191,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 	}
 out:
 	if (pin)
-		add_pinned_bytes(fs_info, &generic_ref);
+		add_pinned_bytes(fs_info, &generic_ref, 1);
 
 	if (last_ref) {
 		/*
@@ -7237,7 +7239,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
 		btrfs_ref_tree_mod(fs_info, ref);
 
 	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
-		add_pinned_bytes(fs_info, ref);
+		add_pinned_bytes(fs_info, ref, 1);
 
 	return ret;
 }
-- 
cgit v1.2.3-59-g8ed1b


From ebb929060aeb162417b4c1307e63daee47b208d9 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 6 May 2019 16:43:51 +0100
Subject: Btrfs: avoid fallback to transaction commit during fsync of files
 with holes

When we are doing a full fsync (bit BTRFS_INODE_NEEDS_FULL_SYNC set) of a
file that has holes and has file extent items spanning two or more leafs,
we can end up falling to back to a full transaction commit due to a logic
bug that leads to failure to insert a duplicate file extent item that is
meant to represent a hole between the last file extent item of a leaf and
the first file extent item in the next leaf. The failure (EEXIST error)
leads to a transaction commit (as most errors when logging an inode do).

For example, we have the two following leafs:

Leaf N:

  -----------------------------------------------
  | ..., ..., ..., (257, FILE_EXTENT_ITEM, 64K) |
  -----------------------------------------------
  The file extent item at the end of leaf N has a length of 4Kb,
  representing the file range from 64K to 68K - 1.

Leaf N + 1:

  -----------------------------------------------
  | (257, FILE_EXTENT_ITEM, 72K), ..., ..., ... |
  -----------------------------------------------
  The file extent item at the first slot of leaf N + 1 has a length of
  4Kb too, representing the file range from 72K to 76K - 1.

During the full fsync path, when we are at tree-log.c:copy_items() with
leaf N as a parameter, after processing the last file extent item, that
represents the extent at offset 64K, we take a look at the first file
extent item at the next leaf (leaf N + 1), and notice there's a 4K hole
between the two extents, and therefore we insert a file extent item
representing that hole, starting at file offset 68K and ending at offset
72K - 1. However we don't update the value of *last_extent, which is used
to represent the end offset (plus 1, non-inclusive end) of the last file
extent item inserted in the log, so it stays with a value of 68K and not
with a value of 72K.

Then, when copy_items() is called for leaf N + 1, because the value of
*last_extent is smaller then the offset of the first extent item in the
leaf (68K < 72K), we look at the last file extent item in the previous
leaf (leaf N) and see it there's a 4K gap between it and our first file
extent item (again, 68K < 72K), so we decide to insert a file extent item
representing the hole, starting at file offset 68K and ending at offset
72K - 1, this insertion will fail with -EEXIST being returned from
btrfs_insert_file_extent() because we already inserted a file extent item
representing a hole for this offset (68K) in the previous call to
copy_items(), when processing leaf N.

The -EEXIST error gets propagated to the fsync callback, btrfs_sync_file(),
which falls back to a full transaction commit.

Fix this by adjusting *last_extent after inserting a hole when we had to
look at the next leaf.

Fixes: 4ee3fad34a9c ("Btrfs: fix fsync after hole punching when using no-holes feature")
Cc: stable@vger.kernel.org # 4.14+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6adcd8a2c5c7..6c47f6ed3e94 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4182,6 +4182,7 @@ fill_holes:
 							       *last_extent, 0,
 							       0, len, 0, len,
 							       0, 0, 0);
+				*last_extent += len;
 			}
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 0c713cbab6200b0ab6473b50435e450a6e1de85d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 6 May 2019 16:44:02 +0100
Subject: Btrfs: fix race between ranged fsync and writeback of adjacent ranges

When we do a full fsync (the bit BTRFS_INODE_NEEDS_FULL_SYNC is set in the
inode) that happens to be ranged, which happens during a msync() or writes
for files opened with O_SYNC for example, we can end up with a corrupt log,
due to different file extent items representing ranges that overlap with
each other, or hit some assertion failures.

When doing a ranged fsync we only flush delalloc and wait for ordered
exents within that range. If while we are logging items from our inode
ordered extents for adjacent ranges complete, we end up in a race that can
make us insert the file extent items that overlap with others we logged
previously and the assertion failures.

For example, if tree-log.c:copy_items() receives a leaf that has the
following file extents items, all with a length of 4K and therefore there
is an implicit hole in the range 68K to 72K - 1:

  (257 EXTENT_ITEM 64K), (257 EXTENT_ITEM 72K), (257 EXTENT_ITEM 76K), ...

It copies them to the log tree. However due to the need to detect implicit
holes, it may release the path, in order to look at the previous leaf to
detect an implicit hole, and then later it will search again in the tree
for the first file extent item key, with the goal of locking again the
leaf (which might have changed due to concurrent changes to other inodes).

However when it locks again the leaf containing the first key, the key
corresponding to the extent at offset 72K may not be there anymore since
there is an ordered extent for that range that is finishing (that is,
somewhere in the middle of btrfs_finish_ordered_io()), and it just
removed the file extent item but has not yet replaced it with a new file
extent item, so the part of copy_items() that does hole detection will
decide that there is a hole in the range starting from 68K to 76K - 1,
and therefore insert a file extent item to represent that hole, having
a key offset of 68K. After that we now have a log tree with 2 different
extent items that have overlapping ranges:

 1) The file extent item copied before copy_items() released the path,
    which has a key offset of 72K and a length of 4K, representing the
    file range 72K to 76K - 1.

 2) And a file extent item representing a hole that has a key offset of
    68K and a length of 8K, representing the range 68K to 76K - 1. This
    item was inserted after releasing the path, and overlaps with the
    extent item inserted before.

The overlapping extent items can cause all sorts of unpredictable and
incorrect behaviour, either when replayed or if a fast (non full) fsync
happens later, which can trigger a BUG_ON() when calling
btrfs_set_item_key_safe() through __btrfs_drop_extents(), producing a
trace like the following:

  [61666.783269] ------------[ cut here ]------------
  [61666.783943] kernel BUG at fs/btrfs/ctree.c:3182!
  [61666.784644] invalid opcode: 0000 [#1] PREEMPT SMP
  (...)
  [61666.786253] task: ffff880117b88c40 task.stack: ffffc90008168000
  [61666.786253] RIP: 0010:btrfs_set_item_key_safe+0x7c/0xd2 [btrfs]
  [61666.786253] RSP: 0018:ffffc9000816b958 EFLAGS: 00010246
  [61666.786253] RAX: 0000000000000000 RBX: 000000000000000f RCX: 0000000000030000
  [61666.786253] RDX: 0000000000000000 RSI: ffffc9000816ba4f RDI: ffffc9000816b937
  [61666.786253] RBP: ffffc9000816b998 R08: ffff88011dae2428 R09: 0000000000001000
  [61666.786253] R10: 0000160000000000 R11: 6db6db6db6db6db7 R12: ffff88011dae2418
  [61666.786253] R13: ffffc9000816ba4f R14: ffff8801e10c4118 R15: ffff8801e715c000
  [61666.786253] FS:  00007f6060a18700(0000) GS:ffff88023f5c0000(0000) knlGS:0000000000000000
  [61666.786253] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [61666.786253] CR2: 00007f6060a28000 CR3: 0000000213e69000 CR4: 00000000000006e0
  [61666.786253] Call Trace:
  [61666.786253]  __btrfs_drop_extents+0x5e3/0xaad [btrfs]
  [61666.786253]  ? time_hardirqs_on+0x9/0x14
  [61666.786253]  btrfs_log_changed_extents+0x294/0x4e0 [btrfs]
  [61666.786253]  ? release_extent_buffer+0x38/0xb4 [btrfs]
  [61666.786253]  btrfs_log_inode+0xb6e/0xcdc [btrfs]
  [61666.786253]  ? lock_acquire+0x131/0x1c5
  [61666.786253]  ? btrfs_log_inode_parent+0xee/0x659 [btrfs]
  [61666.786253]  ? arch_local_irq_save+0x9/0xc
  [61666.786253]  ? btrfs_log_inode_parent+0x1f5/0x659 [btrfs]
  [61666.786253]  btrfs_log_inode_parent+0x223/0x659 [btrfs]
  [61666.786253]  ? arch_local_irq_save+0x9/0xc
  [61666.786253]  ? lockref_get_not_zero+0x2c/0x34
  [61666.786253]  ? rcu_read_unlock+0x3e/0x5d
  [61666.786253]  btrfs_log_dentry_safe+0x60/0x7b [btrfs]
  [61666.786253]  btrfs_sync_file+0x317/0x42c [btrfs]
  [61666.786253]  vfs_fsync_range+0x8c/0x9e
  [61666.786253]  SyS_msync+0x13c/0x1c9
  [61666.786253]  entry_SYSCALL_64_fastpath+0x18/0xad

A sample of a corrupt log tree leaf with overlapping extents I got from
running btrfs/072:

      item 14 key (295 108 200704) itemoff 2599 itemsize 53
              extent data disk bytenr 0 nr 0
              extent data offset 0 nr 458752 ram 458752
      item 15 key (295 108 659456) itemoff 2546 itemsize 53
              extent data disk bytenr 4343541760 nr 770048
              extent data offset 606208 nr 163840 ram 770048
      item 16 key (295 108 663552) itemoff 2493 itemsize 53
              extent data disk bytenr 4343541760 nr 770048
              extent data offset 610304 nr 155648 ram 770048
      item 17 key (295 108 819200) itemoff 2440 itemsize 53
              extent data disk bytenr 4334788608 nr 4096
              extent data offset 0 nr 4096 ram 4096

The file extent item at offset 659456 (item 15) ends at offset 823296
(659456 + 163840) while the next file extent item (item 16) starts at
offset 663552.

Another different problem that the race can trigger is a failure in the
assertions at tree-log.c:copy_items(), which expect that the first file
extent item key we found before releasing the path exists after we have
released path and that the last key we found before releasing the path
also exists after releasing the path:

  $ cat -n fs/btrfs/tree-log.c
  4080          if (need_find_last_extent) {
  4081                  /* btrfs_prev_leaf could return 1 without releasing the path */
  4082                  btrfs_release_path(src_path);
  4083                  ret = btrfs_search_slot(NULL, inode->root, &first_key,
  4084                                  src_path, 0, 0);
  4085                  if (ret < 0)
  4086                          return ret;
  4087                  ASSERT(ret == 0);
  (...)
  4103                  if (i >= btrfs_header_nritems(src_path->nodes[0])) {
  4104                          ret = btrfs_next_leaf(inode->root, src_path);
  4105                          if (ret < 0)
  4106                                  return ret;
  4107                          ASSERT(ret == 0);
  4108                          src = src_path->nodes[0];
  4109                          i = 0;
  4110                          need_find_last_extent = true;
  4111                  }
  (...)

The second assertion implicitly expects that the last key before the path
release still exists, because the surrounding while loop only stops after
we have found that key. When this assertion fails it produces a stack like
this:

  [139590.037075] assertion failed: ret == 0, file: fs/btrfs/tree-log.c, line: 4107
  [139590.037406] ------------[ cut here ]------------
  [139590.037707] kernel BUG at fs/btrfs/ctree.h:3546!
  [139590.038034] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI
  [139590.038340] CPU: 1 PID: 31841 Comm: fsstress Tainted: G        W         5.0.0-btrfs-next-46 #1
  (...)
  [139590.039354] RIP: 0010:assfail.constprop.24+0x18/0x1a [btrfs]
  (...)
  [139590.040397] RSP: 0018:ffffa27f48f2b9b0 EFLAGS: 00010282
  [139590.040730] RAX: 0000000000000041 RBX: ffff897c635d92c8 RCX: 0000000000000000
  [139590.041105] RDX: 0000000000000000 RSI: ffff897d36a96868 RDI: ffff897d36a96868
  [139590.041470] RBP: ffff897d1b9a0708 R08: 0000000000000000 R09: 0000000000000000
  [139590.041815] R10: 0000000000000008 R11: 0000000000000000 R12: 0000000000000013
  [139590.042159] R13: 0000000000000227 R14: ffff897cffcbba88 R15: 0000000000000001
  [139590.042501] FS:  00007f2efc8dee80(0000) GS:ffff897d36a80000(0000) knlGS:0000000000000000
  [139590.042847] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [139590.043199] CR2: 00007f8c064935e0 CR3: 0000000232252002 CR4: 00000000003606e0
  [139590.043547] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [139590.043899] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [139590.044250] Call Trace:
  [139590.044631]  copy_items+0xa3f/0x1000 [btrfs]
  [139590.045009]  ? generic_bin_search.constprop.32+0x61/0x200 [btrfs]
  [139590.045396]  btrfs_log_inode+0x7b3/0xd70 [btrfs]
  [139590.045773]  btrfs_log_inode_parent+0x2b3/0xce0 [btrfs]
  [139590.046143]  ? do_raw_spin_unlock+0x49/0xc0
  [139590.046510]  btrfs_log_dentry_safe+0x4a/0x70 [btrfs]
  [139590.046872]  btrfs_sync_file+0x3b6/0x440 [btrfs]
  [139590.047243]  btrfs_file_write_iter+0x45b/0x5c0 [btrfs]
  [139590.047592]  __vfs_write+0x129/0x1c0
  [139590.047932]  vfs_write+0xc2/0x1b0
  [139590.048270]  ksys_write+0x55/0xc0
  [139590.048608]  do_syscall_64+0x60/0x1b0
  [139590.048946]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
  [139590.049287] RIP: 0033:0x7f2efc4be190
  (...)
  [139590.050342] RSP: 002b:00007ffe743243a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
  [139590.050701] RAX: ffffffffffffffda RBX: 0000000000008d58 RCX: 00007f2efc4be190
  [139590.051067] RDX: 0000000000008d58 RSI: 00005567eca0f370 RDI: 0000000000000003
  [139590.051459] RBP: 0000000000000024 R08: 0000000000000003 R09: 0000000000008d60
  [139590.051863] R10: 0000000000000078 R11: 0000000000000246 R12: 0000000000000003
  [139590.052252] R13: 00000000003d3507 R14: 00005567eca0f370 R15: 0000000000000000
  (...)
  [139590.055128] ---[ end trace 193f35d0215cdeeb ]---

So fix this race between a full ranged fsync and writeback of adjacent
ranges by flushing all delalloc and waiting for all ordered extents to
complete before logging the inode. This is the simplest way to solve the
problem because currently the full fsync path does not deal with ranges
at all (it assumes a full range from 0 to LLONG_MAX) and it always needs
to look at adjacent ranges for hole detection. For use cases of ranged
fsyncs this can make a few fsyncs slower but on the other hand it can
make some following fsyncs to other ranges do less work or no need to do
anything at all. A full fsync is rare anyway and happens only once after
loading/creating an inode and once after less common operations such as a
shrinking truncate.

This is an issue that exists for a long time, and was often triggered by
generic/127, because it does mmap'ed writes and msync (which triggers a
ranged fsync). Adding support for the tree checker to detect overlapping
extents (next patch in the series) and trigger a WARN() when such cases
are found, and then calling btrfs_check_leaf_full() at the end of
btrfs_insert_file_extent() made the issue much easier to detect. Running
btrfs/072 with that change to the tree checker and making fsstress open
files always with O_SYNC made it much easier to trigger the issue (as
triggering it with generic/127 is very rare).

CC: stable@vger.kernel.org # 3.16+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9dbea72a61fe..89f5be2bfb43 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2067,6 +2067,18 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	int ret = 0, err;
 	u64 len;
 
+	/*
+	 * If the inode needs a full sync, make sure we use a full range to
+	 * avoid log tree corruption, due to hole detection racing with ordered
+	 * extent completion for adjacent ranges, and assertion failures during
+	 * hole detection.
+	 */
+	if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+		     &BTRFS_I(inode)->runtime_flags)) {
+		start = 0;
+		end = LLONG_MAX;
+	}
+
 	/*
 	 * The range length can be represented by u64, we have to do the typecasts
 	 * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
-- 
cgit v1.2.3-59-g8ed1b


From 4e9845eff5a8027b5181d5bff56a02991fe46d48 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 6 May 2019 16:44:12 +0100
Subject: Btrfs: tree-checker: detect file extent items with overlapping ranges

Having file extent items with ranges that overlap each other is a
serious issue that leads to all sorts of corruptions and crashes (like a
BUG_ON() during the course of __btrfs_drop_extents() when it traims file
extent items). Therefore teach the tree checker to detect such cases.
This is motivated by a recently fixed bug (race between ranged full
fsync and writeback or adjacent ranges).

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-checker.c | 49 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 748cd1598255..96fce4bef4e7 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -107,8 +107,26 @@ static void file_extent_err(const struct extent_buffer *eb, int slot,
 	(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment)));   \
 })
 
+static u64 file_extent_end(struct extent_buffer *leaf,
+			   struct btrfs_key *key,
+			   struct btrfs_file_extent_item *extent)
+{
+	u64 end;
+	u64 len;
+
+	if (btrfs_file_extent_type(leaf, extent) == BTRFS_FILE_EXTENT_INLINE) {
+		len = btrfs_file_extent_ram_bytes(leaf, extent);
+		end = ALIGN(key->offset + len, leaf->fs_info->sectorsize);
+	} else {
+		len = btrfs_file_extent_num_bytes(leaf, extent);
+		end = key->offset + len;
+	}
+	return end;
+}
+
 static int check_extent_data_item(struct extent_buffer *leaf,
-				  struct btrfs_key *key, int slot)
+				  struct btrfs_key *key, int slot,
+				  struct btrfs_key *prev_key)
 {
 	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_file_extent_item *fi;
@@ -188,6 +206,28 @@ static int check_extent_data_item(struct extent_buffer *leaf,
 	    CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) ||
 	    CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))
 		return -EUCLEAN;
+
+	/*
+	 * Check that no two consecutive file extent items, in the same leaf,
+	 * present ranges that overlap each other.
+	 */
+	if (slot > 0 &&
+	    prev_key->objectid == key->objectid &&
+	    prev_key->type == BTRFS_EXTENT_DATA_KEY) {
+		struct btrfs_file_extent_item *prev_fi;
+		u64 prev_end;
+
+		prev_fi = btrfs_item_ptr(leaf, slot - 1,
+					 struct btrfs_file_extent_item);
+		prev_end = file_extent_end(leaf, prev_key, prev_fi);
+		if (prev_end > key->offset) {
+			file_extent_err(leaf, slot - 1,
+"file extent end range (%llu) goes beyond start offset (%llu) of the next file extent",
+					prev_end, key->offset);
+			return -EUCLEAN;
+		}
+	}
+
 	return 0;
 }
 
@@ -774,14 +814,15 @@ static int check_inode_item(struct extent_buffer *leaf,
  * Common point to switch the item-specific validation.
  */
 static int check_leaf_item(struct extent_buffer *leaf,
-			   struct btrfs_key *key, int slot)
+			   struct btrfs_key *key, int slot,
+			   struct btrfs_key *prev_key)
 {
 	int ret = 0;
 	struct btrfs_chunk *chunk;
 
 	switch (key->type) {
 	case BTRFS_EXTENT_DATA_KEY:
-		ret = check_extent_data_item(leaf, key, slot);
+		ret = check_extent_data_item(leaf, key, slot, prev_key);
 		break;
 	case BTRFS_EXTENT_CSUM_KEY:
 		ret = check_csum_item(leaf, key, slot);
@@ -928,7 +969,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
 			 * Check if the item size and content meet other
 			 * criteria
 			 */
-			ret = check_leaf_item(leaf, &key, slot);
+			ret = check_leaf_item(leaf, &key, slot, &prev_key);
 			if (ret < 0)
 				return ret;
 		}
-- 
cgit v1.2.3-59-g8ed1b