From beed9263f4000c48a5c48912f26576f6fa091181 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 13 Dec 2017 13:50:07 +0200 Subject: btrfs: Fix flush bio leak Commit e0ae99941423 ("btrfs: preallocate device flush bio") reworked the way the flush bio is allocated and used. Concretely it allocates the bio in __alloc_device and then re-uses it multiple times with a very simple endio routine that just calls complete() without consuming a reference. Allocated bios by default come with a ref count of 1, which is then consumed by the endio routine (or not, in which case they should be bio_put by the caller). The way the impleementation works now is that the flush bio has a refcount of 2 and we only ever bio_put it once, leaving it to hang indefinitely. Fix this by removing the extra bio_get in __alloc_device. Fixes: e0ae99941423 ("btrfs: preallocate device flush bio") Signed-off-by: Nikolay Borisov Reviewed-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d48b24e54366..94d28f549837 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -237,7 +237,6 @@ static struct btrfs_device *__alloc_device(void) kfree(dev); return ERR_PTR(-ENOMEM); } - bio_get(dev->flush_bio); INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); -- cgit v1.2.3-59-g8ed1b From ec35e48b286959991cdbb886f1bdeda4575c80b4 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 15 Dec 2017 11:58:27 -0800 Subject: btrfs: fix refcount_t usage when deleting btrfs_delayed_nodes refcounts have a generic implementation and an asm optimized one. The generic version has extra debugging to make sure that once a refcount goes to zero, refcount_inc won't increase it. The btrfs delayed inode code wasn't expecting this, and we're tripping over the warnings when the generic refcounts are used. We ended up with this race: Process A Process B btrfs_get_delayed_node() spin_lock(root->inode_lock) radix_tree_lookup() __btrfs_release_delayed_node() refcount_dec_and_test(&delayed_node->refs) our refcount is now zero refcount_add(2) <--- warning here, refcount unchanged spin_lock(root->inode_lock) radix_tree_delete() With the generic refcounts, we actually warn again when process B above tries to release his refcount because refcount_add() turned into a no-op. We saw this in production on older kernels without the asm optimized refcounts. The fix used here is to use refcount_inc_not_zero() to detect when the object is in the middle of being freed and return NULL. This is almost always the right answer anyway, since we usually end up pitching the delayed_node if it didn't have fresh data in it. This also changes __btrfs_release_delayed_node() to remove the extra check for zero refcounts before radix tree deletion. btrfs_get_delayed_node() was the only path that was allowing refcounts to go from zero to one. Fixes: 6de5f18e7b0da ("btrfs: fix refcount_t usage when deleting btrfs_delayed_node") CC: # 4.12+ Signed-off-by: Chris Mason Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 45 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 5d73f79ded8b..056276101c63 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -87,6 +87,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( spin_lock(&root->inode_lock); node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + if (node) { if (btrfs_inode->delayed_node) { refcount_inc(&node->refs); /* can be accessed */ @@ -94,9 +95,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( spin_unlock(&root->inode_lock); return node; } - btrfs_inode->delayed_node = node; - /* can be accessed and cached in the inode */ - refcount_add(2, &node->refs); + + /* + * It's possible that we're racing into the middle of removing + * this node from the radix tree. In this case, the refcount + * was zero and it should never go back to one. Just return + * NULL like it was never in the radix at all; our release + * function is in the process of removing it. + * + * Some implementations of refcount_inc refuse to bump the + * refcount once it has hit zero. If we don't do this dance + * here, refcount_inc() may decide to just WARN_ONCE() instead + * of actually bumping the refcount. + * + * If this node is properly in the radix, we want to bump the + * refcount twice, once for the inode and once for this get + * operation. + */ + if (refcount_inc_not_zero(&node->refs)) { + refcount_inc(&node->refs); + btrfs_inode->delayed_node = node; + } else { + node = NULL; + } + spin_unlock(&root->inode_lock); return node; } @@ -254,17 +276,18 @@ static void __btrfs_release_delayed_node( mutex_unlock(&delayed_node->mutex); if (refcount_dec_and_test(&delayed_node->refs)) { - bool free = false; struct btrfs_root *root = delayed_node->root; + spin_lock(&root->inode_lock); - if (refcount_read(&delayed_node->refs) == 0) { - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); - free = true; - } + /* + * Once our refcount goes to zero, nobody is allowed to bump it + * back up. We can delete it now. + */ + ASSERT(refcount_read(&delayed_node->refs) == 0); + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); spin_unlock(&root->inode_lock); - if (free) - kmem_cache_free(delayed_node_cache, delayed_node); + kmem_cache_free(delayed_node_cache, delayed_node); } } -- cgit v1.2.3-59-g8ed1b