From 9dcbeed4d7e11e1dcf5e55475de3754f0855d1c2 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 9 Nov 2015 11:44:45 +0100
Subject: btrfs: fix signed overflows in btrfs_sync_file

The calculation of range length in btrfs_sync_file leads to signed
overflow. This was caught by PaX gcc SIZE_OVERFLOW plugin.

https://forums.grsecurity.net/viewtopic.php?f=1&t=4284

The fsync call passes 0 and LLONG_MAX, the range length does not fit to
loff_t and overflows, but the value is converted to u64 so it silently
works as expected.

The minimal fix is a typecast to u64, switching functions to take
(start, end) instead of (start, len) would be more intrusive.

Coccinelle script found that there's one more opencoded calculation of
the length.

<smpl>
@@
loff_t start, end;
@@
* end - start
</smpl>

CC: stable@vger.kernel.org
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/file.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3009d4536d38..8eb1f3c1b647 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1882,8 +1882,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	struct btrfs_log_ctx ctx;
 	int ret = 0;
 	bool full_sync = 0;
-	const u64 len = end - start + 1;
+	u64 len;
 
+	/*
+	 * The range length can be represented by u64, we have to do the typecasts
+	 * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
+	 */
+	len = (u64)end - (u64)start + 1;
 	trace_btrfs_sync_file(file, datasync);
 
 	/*
@@ -2071,8 +2076,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 			}
 		}
 		if (!full_sync) {
-			ret = btrfs_wait_ordered_range(inode, start,
-						       end - start + 1);
+			ret = btrfs_wait_ordered_range(inode, start, len);
 			if (ret) {
 				btrfs_end_transaction(trans, root);
 				goto out;
-- 
cgit v1.2.3-59-g8ed1b


From 89b6c8d1e4a178a347b94f339b959f02710e7060 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 10 Nov 2015 12:10:03 +0300
Subject: Btrfs: tests: checking for NULL instead of IS_ERR()

btrfs_alloc_dummy_root() return an error pointer on failure, it never
returns NULL.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tests/free-space-tests.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index c8c3d70c31ff..8b72b005bfb9 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -898,8 +898,10 @@ int btrfs_test_free_space_cache(void)
 	}
 
 	root = btrfs_alloc_dummy_root();
-	if (!root)
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
 		goto out;
+	}
 
 	root->fs_info = btrfs_alloc_dummy_fs_info();
 	if (!root->fs_info)
-- 
cgit v1.2.3-59-g8ed1b


From 8eab77ff167b62760d878f1d19312eb9f7d4c176 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 13 Nov 2015 23:57:16 +0000
Subject: Btrfs: use global reserve when deleting unused block group after
 ENOSPC

It's possible to reach a state where the cleaner kthread isn't able to
start a transaction to delete an unused block group due to lack of enough
free metadata space and due to lack of unallocated device space to allocate
a new metadata block group as well. If this happens try to use space from
the global block group reserve just like we do for unlink operations, so
that we don't reach a permanent state where starting a transaction for
filesystem operations (file creation, renames, etc) keeps failing with
-ENOSPC. Such an unfortunate state was observed on a machine where over
a dozen unused data block groups existed and the cleaner kthread was
failing to delete them due to ENOSPC error when attempting to start a
transaction, and even running balance with a -dusage=0 filter failed with
ENOSPC as well. Also unmounting and mounting again the filesystem didn't
help. Allowing the cleaner kthread to use the global block reserve to
delete the unused data block groups fixed the problem.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/extent-tree.c | 14 ++++++++++++--
 fs/btrfs/inode.c       | 24 +-----------------------
 fs/btrfs/transaction.c | 32 ++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.h |  4 ++++
 fs/btrfs/volumes.c     |  2 +-
 6 files changed, 52 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a2e73f6053a8..1573be6f9518 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3479,6 +3479,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 bytes_used,
 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
 			   u64 size);
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+				struct btrfs_fs_info *fs_info);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start,
 			     struct extent_map *em);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index acf3ed11cfb6..78200932c1cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10256,6 +10256,17 @@ out:
 	return ret;
 }
 
+struct btrfs_trans_handle *
+btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info)
+{
+	/*
+	 * 1 unit for adding the free space inode's orphan (located in the tree
+	 * of tree roots).
+	 */
+	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+							   1, 1);
+}
+
 /*
  * Process the unused_bgs list and remove any that don't have any allocated
  * space inside of them.
@@ -10322,8 +10333,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		 * Want to do this before we do anything else so we can recover
 		 * properly if we fail to join the transaction.
 		 */
-		/* 1 for btrfs_orphan_reserve_metadata() */
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_trans_remove_block_group(fs_info);
 		if (IS_ERR(trans)) {
 			btrfs_dec_block_group_ro(root, block_group);
 			ret = PTR_ERR(trans);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6e93349d8aa2..f82d1f4460dd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4046,9 +4046,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
  */
 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
 {
-	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	int ret;
 
 	/*
 	 * 1 for the possible orphan item
@@ -4057,27 +4055,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
 	 * 1 for the inode ref
 	 * 1 for the inode
 	 */
-	trans = btrfs_start_transaction(root, 5);
-	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
-		return trans;
-
-	if (PTR_ERR(trans) == -ENOSPC) {
-		u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
-
-		trans = btrfs_start_transaction(root, 0);
-		if (IS_ERR(trans))
-			return trans;
-		ret = btrfs_cond_migrate_bytes(root->fs_info,
-					       &root->fs_info->trans_block_rsv,
-					       num_bytes, 5);
-		if (ret) {
-			btrfs_end_transaction(trans, root);
-			return ERR_PTR(ret);
-		}
-		trans->block_rsv = &root->fs_info->trans_block_rsv;
-		trans->bytes_reserved = num_bytes;
-	}
-	return trans;
+	return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
 }
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 418c6a2ad7d8..3367a3c6f214 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -592,6 +592,38 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 	return start_transaction(root, num_items, TRANS_START,
 				 BTRFS_RESERVE_FLUSH_ALL);
 }
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+					struct btrfs_root *root,
+					unsigned int num_items,
+					int min_factor)
+{
+	struct btrfs_trans_handle *trans;
+	u64 num_bytes;
+	int ret;
+
+	trans = btrfs_start_transaction(root, num_items);
+	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+		return trans;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans))
+		return trans;
+
+	num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+	ret = btrfs_cond_migrate_bytes(root->fs_info,
+				       &root->fs_info->trans_block_rsv,
+				       num_bytes,
+				       min_factor);
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ERR_PTR(ret);
+	}
+
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+	trans->bytes_reserved = num_bytes;
+
+	return trans;
+}
 
 struct btrfs_trans_handle *btrfs_start_transaction_lflush(
 					struct btrfs_root *root,
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index b05b2f64d913..0da21ca9b3fb 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -185,6 +185,10 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   unsigned int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+					struct btrfs_root *root,
+					unsigned int num_items,
+					int min_factor);
 struct btrfs_trans_handle *btrfs_start_transaction_lflush(
 					struct btrfs_root *root,
 					unsigned int num_items);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d198dd3360d3..e0bd364e958d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2853,7 +2853,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
 	if (ret)
 		return ret;
 
-	trans = btrfs_start_transaction(root, 0);
+	trans = btrfs_start_trans_remove_block_group(root->fs_info);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		btrfs_std_error(root->fs_info, ret, NULL);
-- 
cgit v1.2.3-59-g8ed1b


From 7fd01182d1a1412cd44a5474b9aa93548d4a73ae Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 13 Nov 2015 23:57:17 +0000
Subject: Btrfs: fix the number of transaction units needed to remove a block
 group

We were using only 1 transaction unit when attempting to delete an unused
block group but in reality we need 3 + N units, where N corresponds to the
number of stripes. We were accounting only for the addition of the orphan
item (for the block group's free space cache inode) but we were not
accounting that we need to delete one block group item from the extent
tree, one free space item from the tree of tree roots and N device extent
items from the device tree.

While one unit is not enough, it worked most of the time because for each
single unit we are too pessimistic and assume an entire tree path, with
the highest possible heigth (8), needs to be COWed with eventual node
splits at every possible level in the tree, so there was usually enough
reserved space for removing all the items and adding the orphan item.

However after adding the orphan item, writepages() can by called by the VM
subsystem against the btree inode when we are under memory pressure, which
causes writeback to start for the nodes we COWed before, this forces the
operation to remove the free space item to COW again some (or all of) the
same nodes (in the tree of tree roots). Even without writepages() being
called, we could fail with ENOSPC because these items are located in
multiple trees and one of them might have a higher heigth and require
node/leaf splits at many levels, exhausting all the reserved space before
removing all the items and adding the orphan.

In the kernel 4.0 release, commit 3d84be799194 ("Btrfs: fix BUG_ON in
btrfs_orphan_add() when delete unused block group"), we attempted to fix
a BUG_ON due to ENOSPC when trying to add the orphan item by making the
cleaner kthread reserve one transaction unit before attempting to remove
the block group, but this was not enough. We had a couple user reports
still hitting the same BUG_ON after 4.0, like Stefan Priebe's report on
a 4.2-rc6 kernel for example:

    http://www.spinics.net/lists/linux-btrfs/msg46070.html

So fix this by reserving all the necessary units of metadata.

Reported-by: Stefan Priebe <s.priebe@profihost.ag>
Fixes: 3d84be799194 ("Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  3 ++-
 fs/btrfs/extent-tree.c | 37 ++++++++++++++++++++++++++++++++++---
 fs/btrfs/volumes.c     |  3 ++-
 3 files changed, 38 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1573be6f9518..d88994f71eae 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3480,7 +3480,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
 			   u64 size);
 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
-				struct btrfs_fs_info *fs_info);
+				struct btrfs_fs_info *fs_info,
+				const u64 chunk_offset);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start,
 			     struct extent_map *em);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 78200932c1cf..e97d6d61cd42 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10257,14 +10257,44 @@ out:
 }
 
 struct btrfs_trans_handle *
-btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info)
+btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
+				     const u64 chunk_offset)
 {
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	unsigned int num_items;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+	read_unlock(&em_tree->lock);
+	ASSERT(em && em->start == chunk_offset);
+
 	/*
+	 * We need to reserve 3 + N units from the metadata space info in order
+	 * to remove a block group (done at btrfs_remove_chunk() and at
+	 * btrfs_remove_block_group()), which are used for:
+	 *
 	 * 1 unit for adding the free space inode's orphan (located in the tree
 	 * of tree roots).
+	 * 1 unit for deleting the block group item (located in the extent
+	 * tree).
+	 * 1 unit for deleting the free space item (located in tree of tree
+	 * roots).
+	 * N units for deleting N device extent items corresponding to each
+	 * stripe (located in the device tree).
+	 *
+	 * In order to remove a block group we also need to reserve units in the
+	 * system space info in order to update the chunk tree (update one or
+	 * more device items and remove one chunk item), but this is done at
+	 * btrfs_remove_chunk() through a call to check_system_chunk().
 	 */
+	map = (struct map_lookup *)em->bdev;
+	num_items = 3 + map->num_stripes;
+	free_extent_map(em);
+
 	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
-							   1, 1);
+							   num_items, 1);
 }
 
 /*
@@ -10333,7 +10363,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		 * Want to do this before we do anything else so we can recover
 		 * properly if we fail to join the transaction.
 		 */
-		trans = btrfs_start_trans_remove_block_group(fs_info);
+		trans = btrfs_start_trans_remove_block_group(fs_info,
+						     block_group->key.objectid);
 		if (IS_ERR(trans)) {
 			btrfs_dec_block_group_ro(root, block_group);
 			ret = PTR_ERR(trans);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0bd364e958d..45f20252efed 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2853,7 +2853,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
 	if (ret)
 		return ret;
 
-	trans = btrfs_start_trans_remove_block_group(root->fs_info);
+	trans = btrfs_start_trans_remove_block_group(root->fs_info,
+						     chunk_offset);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		btrfs_std_error(root->fs_info, ret, NULL);
-- 
cgit v1.2.3-59-g8ed1b


From da02c6898952a2bc251dd51ed9f897e0a72a853e Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 16 Nov 2015 16:50:13 +0100
Subject: btrfs: fix clashing number of the enhanced balance usage filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I've accidentally picked an already used number for the enhanced usage
filter represented by BTRFS_BALANCE_ARGS_USAGE_RANGE, clashing with
BTRFS_BALANCE_ARGS_CONVERT. Introduced during the development phase,
no backward compatibility issues.

Reported-by: Holger Hoffstätte <holger.hoffstaette@googlemail.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Fixes: bc3094673f22 ("btrfs: extend balance filter usage to take minimum and maximum")
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ec5712372732..d5c84f6b1353 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -382,7 +382,7 @@ struct map_lookup {
 #define BTRFS_BALANCE_ARGS_LIMIT	(1ULL << 5)
 #define BTRFS_BALANCE_ARGS_LIMIT_RANGE	(1ULL << 6)
 #define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
-#define BTRFS_BALANCE_ARGS_USAGE_RANGE	(1ULL << 8)
+#define BTRFS_BALANCE_ARGS_USAGE_RANGE	(1ULL << 10)
 
 #define BTRFS_BALANCE_ARGS_MASK			\
 	(BTRFS_BALANCE_ARGS_PROFILES |		\
-- 
cgit v1.2.3-59-g8ed1b


From 76a8efa171bf6cf37ffb83d3f62fed2e47e2abc8 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Tue, 17 Nov 2015 18:46:17 +0800
Subject: btrfs: Continue replace when set_block_ro failed

xfstests/011 failed in node with small_size filesystem.
Can be reproduced by following script:
  DEV_LIST="/dev/vdd /dev/vde"
  DEV_REPLACE="/dev/vdf"

  do_test()
  {
      local mkfs_opt="$1"
      local size="$2"

      dmesg -c >/dev/null
      umount $SCRATCH_MNT &>/dev/null

      echo  mkfs.btrfs -f $mkfs_opt "${DEV_LIST[*]}"
      mkfs.btrfs -f $mkfs_opt "${DEV_LIST[@]}" || return 1
      mount "${DEV_LIST[0]}" $SCRATCH_MNT

      echo -n "Writing big files"
      dd if=/dev/urandom of=$SCRATCH_MNT/t0 bs=1M count=1 >/dev/null 2>&1
      for ((i = 1; i <= size; i++)); do
          echo -n .
          /bin/cp $SCRATCH_MNT/t0 $SCRATCH_MNT/t$i || return 1
      done
      echo

      echo Start replace
      btrfs replace start -Bf "${DEV_LIST[0]}" "$DEV_REPLACE" $SCRATCH_MNT || {
          dmesg
          return 1
      }
      return 0
  }

  # Set size to value near fs size
  # for example, 1897 can trigger this bug in 2.6G device.
  #
  ./do_test "-d raid1 -m raid1" 1897

System will report replace fail with following warning in dmesg:
 [  134.710853] BTRFS: dev_replace from /dev/vdd (devid 1) to /dev/vdf started
 [  135.542390] BTRFS: btrfs_scrub_dev(/dev/vdd, 1, /dev/vdf) failed -28
 [  135.543505] ------------[ cut here ]------------
 [  135.544127] WARNING: CPU: 0 PID: 4080 at fs/btrfs/dev-replace.c:428 btrfs_dev_replace_start+0x398/0x440()
 [  135.545276] Modules linked in:
 [  135.545681] CPU: 0 PID: 4080 Comm: btrfs Not tainted 4.3.0 #256
 [  135.546439] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
 [  135.547798]  ffffffff81c5bfcf ffff88003cbb3d28 ffffffff817fe7b5 0000000000000000
 [  135.548774]  ffff88003cbb3d60 ffffffff810a88f1 ffff88002b030000 00000000ffffffe4
 [  135.549774]  ffff88003c080000 ffff88003c082588 ffff88003c28ab60 ffff88003cbb3d70
 [  135.550758] Call Trace:
 [  135.551086]  [<ffffffff817fe7b5>] dump_stack+0x44/0x55
 [  135.551737]  [<ffffffff810a88f1>] warn_slowpath_common+0x81/0xc0
 [  135.552487]  [<ffffffff810a89e5>] warn_slowpath_null+0x15/0x20
 [  135.553211]  [<ffffffff81448c88>] btrfs_dev_replace_start+0x398/0x440
 [  135.554051]  [<ffffffff81412c3e>] btrfs_ioctl+0x1d2e/0x25c0
 [  135.554722]  [<ffffffff8114c7ba>] ? __audit_syscall_entry+0xaa/0xf0
 [  135.555506]  [<ffffffff8111ab36>] ? current_kernel_time64+0x56/0xa0
 [  135.556304]  [<ffffffff81201e3d>] do_vfs_ioctl+0x30d/0x580
 [  135.557009]  [<ffffffff8114c7ba>] ? __audit_syscall_entry+0xaa/0xf0
 [  135.557855]  [<ffffffff810011d1>] ? do_audit_syscall_entry+0x61/0x70
 [  135.558669]  [<ffffffff8120d1c1>] ? __fget_light+0x61/0x90
 [  135.559374]  [<ffffffff81202124>] SyS_ioctl+0x74/0x80
 [  135.559987]  [<ffffffff81809857>] entry_SYSCALL_64_fastpath+0x12/0x6f
 [  135.560842] ---[ end trace 2a5c1fc3205abbdd ]---

Reason:
 When big data writen to fs, the whole free space will be allocated
 for data chunk.
 And operation as scrub need to set_block_ro(), and when there is
 only one metadata chunk in system(or other metadata chunks
 are all full), the function will try to allocate a new chunk,
 and failed because no space in device.

Fix:
 When set_block_ro failed for metadata chunk, it is not a problem
 because scrub_lock paused commit_trancaction in same time, and
 metadata are always cowed, so the on-the-fly writepages will not
 write data into same place with scrub/replace.
 Let replace continue in this case is no problem.

Tested by above script, and xfstests/011, plus 100 times xfstests/070.

Changelog v1->v2:
1: Add detail comments in source and commit-message.
2: Add dmesg detail into commit-message.
3: Limit return value of -ENOSPC to be passed.
All suggested by: Filipe Manana <fdmanana@gmail.com>

Suggested-by: Filipe Manana <fdmanana@gmail.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2907a77fb1f6..6b3fd51d9a99 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3483,6 +3483,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 	u64 length;
 	u64 chunk_offset;
 	int ret = 0;
+	int ro_set;
 	int slot;
 	struct extent_buffer *l;
 	struct btrfs_key key;
@@ -3568,7 +3569,21 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		scrub_pause_on(fs_info);
 		ret = btrfs_inc_block_group_ro(root, cache);
 		scrub_pause_off(fs_info);
-		if (ret) {
+
+		if (ret == 0) {
+			ro_set = 1;
+		} else if (ret == -ENOSPC) {
+			/*
+			 * btrfs_inc_block_group_ro return -ENOSPC when it
+			 * failed in creating new chunk for metadata.
+			 * It is not a problem for scrub/replace, because
+			 * metadata are always cowed, and our scrub paused
+			 * commit_transactions.
+			 */
+			ro_set = 0;
+		} else {
+			btrfs_warn(fs_info, "failed setting block group ro, ret=%d\n",
+				   ret);
 			btrfs_put_block_group(cache);
 			break;
 		}
@@ -3611,7 +3626,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 
 		scrub_pause_off(fs_info);
 
-		btrfs_dec_block_group_ro(root, cache);
+		if (ro_set)
+			btrfs_dec_block_group_ro(root, cache);
 
 		btrfs_put_block_group(cache);
 		if (ret)
-- 
cgit v1.2.3-59-g8ed1b


From 31388ab2edac833defa4193172edc1d409868bfb Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 19 Nov 2015 11:35:17 +0100
Subject: btrfs: fix rcu warning during device replace

The test btrfs/011 triggers a rcu warning
Reviewed-by: Anand Jain <anand.jain@oracle.com>

===============================
[ INFO: suspicious RCU usage. ]
4.4.0-rc1-default+ #286 Tainted: G        W
-------------------------------
fs/btrfs/volumes.c:1977 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 1, debug_locks = 0
4 locks held by btrfs/28786:

0:  (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+...}, at: [<ffffffffa00bc785>] btrfs_dev_replace_finishing+0x45/0xa00 [btrfs]
1:  (uuid_mutex){+.+.+.}, at: [<ffffffffa00bc84f>] btrfs_dev_replace_finishing+0x10f/0xa00 [btrfs]
2:  (&fs_devs->device_list_mutex){+.+.+.}, at: [<ffffffffa00bc868>] btrfs_dev_replace_finishing+0x128/0xa00 [btrfs]
3:  (&fs_info->chunk_mutex){+.+...}, at: [<ffffffffa00bc87d>] btrfs_dev_replace_finishing+0x13d/0xa00 [btrfs]

stack backtrace:
CPU: 0 PID: 28786 Comm: btrfs Tainted: G        W       4.4.0-rc1-default+ #286
Hardware name: Intel Corporation SandyBridge Platform/To be filled by O.E.M., BIOS ASNBCPT1.86C.0031.B00.1006301607 06/30/2010
0000000000000001 ffff8800a07dfb48 ffffffff8141d47b 0000000000000001
0000000000000001 0000000000000000 ffff8801464a4f00 ffff8800a07dfb78
ffffffff810cd883 ffff880146eb9400 ffff8800a3698600 ffff8800a33fe220
Call Trace:
[<ffffffff8141d47b>] dump_stack+0x4f/0x74
[<ffffffff810cd883>] lockdep_rcu_suspicious+0x103/0x140
[<ffffffffa0071261>] btrfs_rm_dev_replace_remove_srcdev+0x111/0x130 [btrfs]
[<ffffffff810d354d>] ? trace_hardirqs_on+0xd/0x10
[<ffffffff81449536>] ? __percpu_counter_sum+0x66/0x80
[<ffffffffa00bcc15>] btrfs_dev_replace_finishing+0x4d5/0xa00 [btrfs]
[<ffffffffa00bc96e>] ? btrfs_dev_replace_finishing+0x22e/0xa00 [btrfs]
[<ffffffffa00a8795>] ? btrfs_scrub_dev+0x415/0x6d0 [btrfs]
[<ffffffffa003ea69>] ? btrfs_start_transaction+0x9/0x20 [btrfs]
[<ffffffffa00bda79>] btrfs_dev_replace_start+0x339/0x590 [btrfs]
[<ffffffff81196aa5>] ? __might_fault+0x95/0xa0
[<ffffffffa0078638>] btrfs_ioctl_dev_replace+0x118/0x160 [btrfs]
[<ffffffff811409c6>] ? stack_trace_call+0x46/0x70
[<ffffffffa007c914>] ? btrfs_ioctl+0x24/0x1770 [btrfs]
[<ffffffffa007ce43>] btrfs_ioctl+0x553/0x1770 [btrfs]
[<ffffffff811409c6>] ? stack_trace_call+0x46/0x70
[<ffffffff811d6eb1>] ? do_vfs_ioctl+0x21/0x5a0
[<ffffffff811d6f1c>] do_vfs_ioctl+0x8c/0x5a0
[<ffffffff811e3336>] ? __fget_light+0x86/0xb0
[<ffffffff811e3369>] ? __fdget+0x9/0x20
[<ffffffff811d7451>] ? SyS_ioctl+0x21/0x80
[<ffffffff811d7483>] SyS_ioctl+0x53/0x80
[<ffffffff81b1efd7>] entry_SYSCALL_64_fastpath+0x12/0x6f

This is because of unprotected use of rcu_dereference in
btrfs_scratch_superblocks. We can't add rcu locks around the whole
function because we read the superblock.

The fix will use the rcu string buffer directly without the rcu locking.
Thi is safe as the device will not go away in the meantime. We're
holding the device list mutexes.

Restructuring the code to narrow down the rcu section turned out to be
impossible, we need to call filp_open (through update_dev_time) on the
buffer and this could call kmalloc/__might_sleep. We could call kstrdup
with GFP_ATOMIC but it's not absolutely necessary.

Fixes: 12b1c2637b6e (Btrfs: enhance btrfs_scratch_superblock to scratch all superblocks)
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 45f20252efed..f95493b70903 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1973,8 +1973,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
 	if (srcdev->writeable) {
 		fs_devices->rw_devices--;
 		/* zero out the old super if it is writable */
-		btrfs_scratch_superblocks(srcdev->bdev,
-					rcu_str_deref(srcdev->name));
+		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
 	}
 
 	if (srcdev->bdev)
@@ -2024,8 +2023,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
 
 	if (tgtdev->bdev) {
-		btrfs_scratch_superblocks(tgtdev->bdev,
-					rcu_str_deref(tgtdev->name));
+		btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
 		fs_info->fs_devices->open_devices--;
 	}
 	fs_info->fs_devices->num_devices--;
-- 
cgit v1.2.3-59-g8ed1b


From 020d5b7366fc03e4bf84142ae6f63031ac504e33 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 19 Nov 2015 10:57:20 +0000
Subject: Btrfs: fix race between scrub and block group deletion

Scrub can race with the cleaner kthread deleting block groups that are
unused (and with relocation too) leading to a failure with error -EINVAL
that gets returned to user space.

The following diagram illustrates how it happens:

              CPU 1                                 CPU 2

 cleaner kthread
   btrfs_delete_unused_bgs()

     gets block group X from
     fs_info->unused_bgs

     sets block group to RO

       btrfs_remove_chunk(bg X)

         deletes device extents

                                         scrub_enumerate_chunks()

                                           searches device tree using
                                           its commit root

                                           finds device extent for
                                           block group X

                                           gets block group X from the tree
                                           fs_info->block_group_cache_tree
                                           (via btrfs_lookup_block_group())

                                           sets bg X to RO (again)

          btrfs_remove_block_group(bg X)

            deletes block group from
            fs_info->block_group_cache_tree

            removes extent map from
            fs_info->mapping_tree

                                               scrub_chunk(offset X)

                                                 searches fs_info->mapping_tree
                                                 for extent map starting at
                                                 offset X

                                                    --> doesn't find any such
                                                        extent map
                                                    --> returns -EINVAL and scrub
                                                        errors out to userspace
                                                        with -EINVAL

Fix this by dealing with an extent map lookup failure as an indicator of
block group deletion.
Issue reproduced with fstest btrfs/071.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6b3fd51d9a99..68af3169d527 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3432,7 +3432,9 @@ out:
 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 					  struct btrfs_device *scrub_dev,
 					  u64 chunk_offset, u64 length,
-					  u64 dev_offset, int is_dev_replace)
+					  u64 dev_offset,
+					  struct btrfs_block_group_cache *cache,
+					  int is_dev_replace)
 {
 	struct btrfs_mapping_tree *map_tree =
 		&sctx->dev_root->fs_info->mapping_tree;
@@ -3445,8 +3447,18 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
 	read_unlock(&map_tree->map_tree.lock);
 
-	if (!em)
-		return -EINVAL;
+	if (!em) {
+		/*
+		 * Might have been an unused block group deleted by the cleaner
+		 * kthread or relocation.
+		 */
+		spin_lock(&cache->lock);
+		if (!cache->removed)
+			ret = -EINVAL;
+		spin_unlock(&cache->lock);
+
+		return ret;
+	}
 
 	map = (struct map_lookup *)em->bdev;
 	if (em->start != chunk_offset)
@@ -3592,7 +3604,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		dev_replace->cursor_left = found_key.offset;
 		dev_replace->item_needs_writeback = 1;
 		ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
-				  found_key.offset, is_dev_replace);
+				  found_key.offset, cache, is_dev_replace);
 
 		/*
 		 * flush, submit all pending read and write bios, afterwards
-- 
cgit v1.2.3-59-g8ed1b


From 758f2dfcf8a249b1f1510aa32e625c2ec20642a3 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 19 Nov 2015 11:45:48 +0000
Subject: Btrfs: fix scrub preventing unused block groups from being deleted

Currently scrub can race with the cleaner kthread when the later attempts
to delete an unused block group, and the result is preventing the cleaner
kthread from ever deleting later the block group - unless the block group
becomes used and unused again. The following diagram illustrates that
race:

              CPU 1                                 CPU 2

 cleaner kthread
   btrfs_delete_unused_bgs()

     gets block group X from
     fs_info->unused_bgs and
     removes it from that list

                                             scrub_enumerate_chunks()

                                               searches device tree using
                                               its commit root

                                               finds device extent for
                                               block group X

                                               gets block group X from the tree
                                               fs_info->block_group_cache_tree
                                               (via btrfs_lookup_block_group())

                                               sets bg X to RO

     sees the block group is
     already RO and therefore
     doesn't delete it nor adds
     it back to unused list

So fix this by making scrub add the block group again to the list of
unused block groups if the block group is still unused when it finished
scrubbing it and it hasn't been removed already.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/scrub.c       | 22 ++++++++++++++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d88994f71eae..a0165c6e6243 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3416,6 +3416,7 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 						 struct btrfs_fs_info *info,
 						 u64 bytenr);
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int get_block_group_index(struct btrfs_block_group_cache *cache);
 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e97d6d61cd42..8fd14b6f1d33 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -124,7 +124,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 	return (cache->flags & bits) == bits;
 }
 
-static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 {
 	atomic_inc(&cache->count);
 }
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 68af3169d527..b091d94ceef6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3641,6 +3641,28 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		if (ro_set)
 			btrfs_dec_block_group_ro(root, cache);
 
+		/*
+		 * We might have prevented the cleaner kthread from deleting
+		 * this block group if it was already unused because we raced
+		 * and set it to RO mode first. So add it back to the unused
+		 * list, otherwise it might not ever be deleted unless a manual
+		 * balance is triggered or it becomes used and unused again.
+		 */
+		spin_lock(&cache->lock);
+		if (!cache->removed && !cache->ro && cache->reserved == 0 &&
+		    btrfs_block_group_used(&cache->item) == 0) {
+			spin_unlock(&cache->lock);
+			spin_lock(&fs_info->unused_bgs_lock);
+			if (list_empty(&cache->bg_list)) {
+				btrfs_get_block_group(cache);
+				list_add_tail(&cache->bg_list,
+					      &fs_info->unused_bgs);
+			}
+			spin_unlock(&fs_info->unused_bgs_lock);
+		} else {
+			spin_unlock(&cache->lock);
+		}
+
 		btrfs_put_block_group(cache);
 		if (ret)
 			break;
-- 
cgit v1.2.3-59-g8ed1b


From 036a9348dcd0060e3df8f5b5db2a59e7d7eaf1f5 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 23 Nov 2015 15:25:16 +0000
Subject: Btrfs: fix race between cleaner kthread and space cache writeout

When a block group becomes unused and the cleaner kthread is currently
running, we can end up getting the current transaction aborted with error
-ENOENT when we try to commit the transaction, leading to the following
trace:

  [59779.258768] WARNING: CPU: 3 PID: 5990 at fs/btrfs/extent-tree.c:3740 btrfs_write_dirty_block_groups+0x17c/0x214 [btrfs]()
  [59779.272594] BTRFS: Transaction aborted (error -2)
  (...)
  [59779.291137] Call Trace:
  [59779.291621]  [<ffffffff812566f4>] dump_stack+0x4e/0x79
  [59779.292543]  [<ffffffff8104d0a6>] warn_slowpath_common+0x9f/0xb8
  [59779.293435]  [<ffffffffa04cb81f>] ? btrfs_write_dirty_block_groups+0x17c/0x214 [btrfs]
  [59779.295000]  [<ffffffff8104d107>] warn_slowpath_fmt+0x48/0x50
  [59779.296138]  [<ffffffffa04c2721>] ? write_one_cache_group.isra.32+0x77/0x82 [btrfs]
  [59779.297663]  [<ffffffffa04cb81f>] btrfs_write_dirty_block_groups+0x17c/0x214 [btrfs]
  [59779.299141]  [<ffffffffa0549b0d>] commit_cowonly_roots+0x1de/0x261 [btrfs]
  [59779.300359]  [<ffffffffa04dd5b6>] btrfs_commit_transaction+0x4c4/0x99c [btrfs]
  [59779.301805]  [<ffffffffa04b5df4>] btrfs_sync_fs+0x145/0x1ad [btrfs]
  [59779.302893]  [<ffffffff81196634>] sync_filesystem+0x7f/0x93
  (...)
  [59779.318186] ---[ end trace 577e2daff90da33a ]---

The following diagram illustrates a sequence of steps leading to this
problem:

       CPU 1                                             CPU 2

                           <at transaction N>

                                                        adds bg A to list
                                                        fs_info->unused_bgs

                                                        adds bg B to list
                                                        fs_info->unused_bgs

                           <transaction kthread
                            commits transaction N
                            and wakes up the
                            cleaner kthread>

  cleaner kthread
    delete_unused_bgs()

      sees bg A in list
      fs_info->unused_bgs

      btrfs_start_transaction()

                           <transaction N + 1 starts>

      deletes bg A

                                                        update_block_group(bg C)

                                                          --> adds bg C to list
                                                              fs_info->unused_bgs

      deletes bg B

      sees bg C in the list
      fs_info->unused_bgs

      btrfs_remove_chunk(bg C)
        btrfs_remove_block_group(bg C)

          --> checks if the block group
              is in a dirty list, and
              because it isn't now, it
              does nothing

          --> the block group item
              is deleted from the
              extent tree

                                                          --> adds bg C to list
                                                              transaction->dirty_bgs

                                                         some task calls
                                                         btrfs_commit_transaction(t N + 1)
                                                           commit_cowonly_roots()
                                                             btrfs_write_dirty_block_groups()
                                                               --> sees bg C in cur_trans->dirty_bgs
                                                               --> calls write_one_cache_group()
                                                                   which returns -ENOENT because
                                                                   it did not find the block group
                                                                   item in the extent tree
                                                               --> transaction aborte with -ENOENT
                                                                   because write_one_cache_group()
                                                                   returned that error

So fix this by adding a block group to the list of dirty block groups
before adding it to the list of unused block groups.

This happened on a stress test using fsstress plus concurrent calls to
fallocate 20G and truncate (releasing part of the space allocated with
fallocate).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8fd14b6f1d33..e563e5ab72b4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5915,19 +5915,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			set_extent_dirty(info->pinned_extents,
 					 bytenr, bytenr + num_bytes - 1,
 					 GFP_NOFS | __GFP_NOFAIL);
-			/*
-			 * No longer have used bytes in this block group, queue
-			 * it for deletion.
-			 */
-			if (old_val == 0) {
-				spin_lock(&info->unused_bgs_lock);
-				if (list_empty(&cache->bg_list)) {
-					btrfs_get_block_group(cache);
-					list_add_tail(&cache->bg_list,
-						      &info->unused_bgs);
-				}
-				spin_unlock(&info->unused_bgs_lock);
-			}
 		}
 
 		spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -5939,6 +5926,22 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		}
 		spin_unlock(&trans->transaction->dirty_bgs_lock);
 
+		/*
+		 * No longer have used bytes in this block group, queue it for
+		 * deletion. We do this after adding the block group to the
+		 * dirty list to avoid races between cleaner kthread and space
+		 * cache writeout.
+		 */
+		if (!alloc && old_val == 0) {
+			spin_lock(&info->unused_bgs_lock);
+			if (list_empty(&cache->bg_list)) {
+				btrfs_get_block_group(cache);
+				list_add_tail(&cache->bg_list,
+					      &info->unused_bgs);
+			}
+			spin_unlock(&info->unused_bgs_lock);
+		}
+
 		btrfs_put_block_group(cache);
 		total -= num_bytes;
 		bytenr += num_bytes;
-- 
cgit v1.2.3-59-g8ed1b


From 967ef5131e42d6e3bb216c44161d893048a49957 Mon Sep 17 00:00:00 2001
From: Justin Maggard <jmaggard10@gmail.com>
Date: Fri, 6 Nov 2015 10:36:42 -0800
Subject: btrfs: qgroup: fix quota disable during rescan

There's a race condition that leads to a NULL pointer dereference if you
disable quotas while a quota rescan is running.  To fix this, we just need
to wait for the quota rescan worker to actually exit before tearing down
the quota structures.

Signed-off-by: Justin Maggard <jmaggard@netgear.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 93e12c18ffd7..fd0a196c8f74 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -993,9 +993,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (!fs_info->quota_root)
 		goto out;
-	spin_lock(&fs_info->qgroup_lock);
 	fs_info->quota_enabled = 0;
 	fs_info->pending_quota_state = 0;
+	btrfs_qgroup_wait_for_completion(fs_info);
+	spin_lock(&fs_info->qgroup_lock);
 	quota_root = fs_info->quota_root;
 	fs_info->quota_root = NULL;
 	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
-- 
cgit v1.2.3-59-g8ed1b


From 2d9e97761087b46192c18181dfd1e7a930defcfd Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 5 Nov 2015 14:37:58 -0800
Subject: Btrfs: use btrfs_get_fs_root in resolve_indirect_ref

The backref code will look up the fs_root we're trying to resolve our indirect
refs for, unfortunately we use btrfs_read_fs_root_no_name, which returns -ENOENT
if the ref is 0.  This isn't helpful for the qgroup stuff with snapshot delete
as it won't be able to search down the snapshot we are deleting, which will
cause us to miss roots.  So use btrfs_get_fs_root and send false for check_ref
so we can always get the root we're looking for.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/backref.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 6dcdb2ec9211..d453d62ab0c6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -355,7 +355,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 
 	index = srcu_read_lock(&fs_info->subvol_srcu);
 
-	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	root = btrfs_get_fs_root(fs_info, &root_key, false);
 	if (IS_ERR(root)) {
 		srcu_read_unlock(&fs_info->subvol_srcu, index);
 		ret = PTR_ERR(root);
-- 
cgit v1.2.3-59-g8ed1b


From 82bd101b5240d3d1c4078a8017917a40c0dcc514 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.de>
Date: Thu, 5 Nov 2015 14:38:00 -0800
Subject: btrfs: qgroup: account shared subtree during snapshot delete

Commit 0ed4792 ('btrfs: qgroup: Switch to new extent-oriented qgroup
mechanism.') removed our qgroup accounting during
btrfs_drop_snapshot(). Predictably, this results in qgroup numbers
going bad shortly after a snapshot is removed.

Fix this by adding a dirty extent record when we encounter extents during
our shared subtree walk. This effectively restores the functionality we had
with the original shared subtree walking code in 1152651 (btrfs: qgroup:
account shared subtrees during snapshot delete).

The idea with the original patch (and this one) is that shared subtrees can
get skipped during drop_snapshot. The shared subtree walk then allows us a
chance to visit those extents and add them to the qgroup work for later
processing. This ultimately makes the accounting for drop snapshot work.

The new qgroup code nicely handles all the other extents during the tree
walk via the ref dec/inc functions so we don't have to add actions beyond
what we had originally.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 47 ++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/qgroup.c      |  2 ++
 2 files changed, 42 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e563e5ab72b4..4b89680a1923 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8108,21 +8108,47 @@ reada:
 }
 
 /*
- * TODO: Modify related function to add related node/leaf to dirty_extent_root,
- * for later qgroup accounting.
- *
- * Current, this function does nothing.
+ * These may not be seen by the usual inc/dec ref code so we have to
+ * add them here.
  */
+static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root, u64 bytenr,
+				     u64 num_bytes)
+{
+	struct btrfs_qgroup_extent_record *qrecord;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
+	if (!qrecord)
+		return -ENOMEM;
+
+	qrecord->bytenr = bytenr;
+	qrecord->num_bytes = num_bytes;
+	qrecord->old_roots = NULL;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+		kfree(qrecord);
+	spin_unlock(&delayed_refs->lock);
+
+	return 0;
+}
+
 static int account_leaf_items(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct extent_buffer *eb)
 {
 	int nr = btrfs_header_nritems(eb);
-	int i, extent_type;
+	int i, extent_type, ret;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	u64 bytenr, num_bytes;
 
+	/* We can be called directly from walk_up_proc() */
+	if (!root->fs_info->quota_enabled)
+		return 0;
+
 	for (i = 0; i < nr; i++) {
 		btrfs_item_key_to_cpu(eb, &key, i);
 
@@ -8141,6 +8167,10 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
 			continue;
 
 		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+		ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+		if (ret)
+			return ret;
 	}
 	return 0;
 }
@@ -8209,8 +8239,6 @@ static int adjust_slots_upwards(struct btrfs_root *root,
 
 /*
  * root_eb is the subtree root and is locked before this function is called.
- * TODO: Modify this function to mark all (including complete shared node)
- * to dirty_extent_root to allow it get accounted in qgroup.
  */
 static int account_shared_subtree(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
@@ -8288,6 +8316,11 @@ walk_down:
 			btrfs_tree_read_lock(eb);
 			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+			ret = record_one_subtree_extent(trans, root, child_bytenr,
+							root->nodesize);
+			if (ret)
+				goto out;
 		}
 
 		if (level == 0) {
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fd0a196c8f74..5279fdae7142 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1462,6 +1462,8 @@ struct btrfs_qgroup_extent_record
 	struct btrfs_qgroup_extent_record *entry;
 	u64 bytenr = record->bytenr;
 
+	assert_spin_locked(&delayed_refs->lock);
+
 	while (*p) {
 		parent_node = *p;
 		entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
-- 
cgit v1.2.3-59-g8ed1b


From dba72cb30b6a4811038128c8a98b268d18ca60fe Mon Sep 17 00:00:00 2001
From: Holger Hoffstätte <holger.hoffstaette@googlemail.com>
Date: Tue, 17 Nov 2015 12:29:32 +0100
Subject: btrfs: fix balance range usage filters in 4.4-rc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's a regression in 4.4-rc since commit bc3094673f22
(btrfs: extend balance filter usage to take minimum and maximum) in that
existing (non-ranged) balance with -dusage=x no longer works; all chunks
are skipped.

After staring at the code for a while and wondering why a non-ranged
balance would even need min and max thresholds (..which then were not
set correctly, leading to the bug) I realized that the only problem
was the fact that the filter functions were named wrong, thanks to
patching copypasta. Simply renaming both functions lets the existing
btrfs-progs call balance with -dusage=x and now the non-ranged filter
function is invoked, properly using only a single chunk limit.

Signed-off-by: Holger Hoffstätte <holger.hoffstaette@googlemail.com>
Fixes: bc3094673f22 ("btrfs: extend balance filter usage to take minimum and maximum")
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f95493b70903..750285e4f274 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3122,7 +3122,7 @@ static int chunk_profiles_filter(u64 chunk_type,
 	return 1;
 }
 
-static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 			      struct btrfs_balance_args *bargs)
 {
 	struct btrfs_block_group_cache *cache;
@@ -3155,7 +3155,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 	return ret;
 }
 
-static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info,
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
 		u64 chunk_offset, struct btrfs_balance_args *bargs)
 {
 	struct btrfs_block_group_cache *cache;
-- 
cgit v1.2.3-59-g8ed1b