diff options
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- | fs/btrfs/extent-tree.c | 2096 |
1 files changed, 1263 insertions, 833 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a7bc66121330..2801c991814f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,9 @@ #include "delalloc-space.h" #include "block-group.h" #include "discard.h" +#include "rcu-string.h" +#include "zoned.h" +#include "dev-replace.h" #undef SCRAMBLE_DELAYED_REFS @@ -64,10 +67,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { u64 end = start + num_bytes - 1; - set_extent_bits(&fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE); - set_extent_bits(&fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE); + set_extent_bits(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE); return 0; } @@ -79,50 +80,14 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) start = cache->start; end = start + cache->length - 1; - clear_extent_bits(&fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE); - clear_extent_bits(&fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE); -} - -static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) -{ - if (ref->type == BTRFS_REF_METADATA) { - if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) - return BTRFS_BLOCK_GROUP_SYSTEM; - else - return BTRFS_BLOCK_GROUP_METADATA; - } - return BTRFS_BLOCK_GROUP_DATA; -} - -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; - u64 flags = generic_ref_to_space_flags(ref); - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - -static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; - u64 flags = generic_ref_to_space_flags(ref); - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + clear_extent_bits(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE); } /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { + struct btrfs_root *root = btrfs_extent_root(fs_info, start); int ret; struct btrfs_key key; struct btrfs_path *path; @@ -134,7 +99,7 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) key.objectid = start; key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); btrfs_free_path(path); return ret; } @@ -152,6 +117,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags) { + struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_path *path; @@ -189,7 +155,8 @@ search_again: else key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + extent_root = btrfs_extent_root(fs_info, bytenr); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out_free; @@ -207,7 +174,7 @@ search_again: if (ret == 0) { leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (item_size >= sizeof(*ei)) { ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -403,12 +370,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_SHARED_BLOCK_REF_KEY) { ASSERT(eb->fs_info); /* - * Every shared one has parent tree - * block, which must be aligned to - * nodesize. + * Every shared one has parent tree block, + * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->nodesize)) + IS_ALIGNED(offset, eb->fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { @@ -417,12 +383,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_SHARED_DATA_REF_KEY) { ASSERT(eb->fs_info); /* - * Every shared one has parent tree - * block, which must be aligned to - * nodesize. + * Every shared one has parent tree block, + * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->nodesize)) + IS_ALIGNED(offset, eb->fs_info->sectorsize)) return type; } } else { @@ -432,8 +397,9 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } btrfs_print_leaf((struct extent_buffer *)eb); - btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d", - eb->start, type); + btrfs_err(eb->fs_info, + "eb %llu iref 0x%lx invalid extent inline ref type %d", + eb->start, (unsigned long)iref, type); WARN_ON(1); return BTRFS_REF_TYPE_INVALID; @@ -480,7 +446,7 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; @@ -556,7 +522,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset, int refs_to_add) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; u32 size; @@ -630,8 +596,9 @@ fail: } static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop, int *last_ref) + int refs_to_drop) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -663,8 +630,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, num_refs -= refs_to_drop; if (num_refs == 0) { - ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); - *last_ref = 1; + ret = btrfs_del_item(trans, root, path); } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -722,7 +688,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 parent, u64 root_objectid) { - struct btrfs_root *root = trans->fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; @@ -746,6 +712,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 parent, u64 root_objectid) { + struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; @@ -758,8 +725,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, key.offset = root_objectid; } - ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root, - path, &key, 0); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); btrfs_release_path(path); return ret; } @@ -824,7 +790,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, u64 owner, u64 offset, int insert) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -848,6 +814,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, want = extent_ref_type(parent, owner); if (insert) { extra_size = btrfs_extent_inline_ref_size(want); + path->search_for_extension = 1; path->keep_locks = 1; } else extra_size = -1; @@ -901,7 +868,7 @@ again: } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { err = -EINVAL; btrfs_print_v0_err(fs_info); @@ -928,7 +895,13 @@ again: err = -ENOENT; while (1) { if (ptr >= end) { - WARN_ON(ptr > end); + if (ptr > end) { + err = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + } break; } iref = (struct btrfs_extent_inline_ref *)ptr; @@ -1000,6 +973,7 @@ again: out: if (insert) { path->keep_locks = 0; + path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } return err; @@ -1042,7 +1016,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, __run_delayed_extent_op(extent_op, leaf, ei); ptr = (unsigned long)ei + item_offset; - end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); + end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); if (ptr < end - size) memmove_extent_buffer(leaf, ptr + size, ptr, end - size - ptr); @@ -1103,8 +1077,7 @@ static noinline_for_stack void update_inline_extent_backref(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op, - int *last_ref) + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_item *ei; @@ -1152,9 +1125,8 @@ void update_inline_extent_backref(struct btrfs_path *path, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { - *last_ref = 1; size = btrfs_extent_inline_ref_size(type); - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ptr = (unsigned long)iref; end = (unsigned long)ei + item_size; if (ptr + size < end) @@ -1181,9 +1153,23 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, num_bytes, parent, root_objectid, owner, offset, 1); if (ret == 0) { - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - update_inline_extent_backref(path, iref, refs_to_add, - extent_op, NULL); + /* + * We're adding refs to a tree block we already own, this + * should not happen at all. + */ + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_crit(trans->fs_info, +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", + bytenr, num_bytes, root_objectid); + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + WARN_ON(1); + btrfs_crit(trans->fs_info, + "path->slots[0]=%d path->nodes[0]:", path->slots[0]); + btrfs_print_leaf(path->nodes[0]); + } + return -EUCLEAN; + } + update_inline_extent_backref(path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { setup_inline_extent_backref(trans->fs_info, path, iref, parent, root_objectid, owner, offset, @@ -1193,42 +1179,21 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - u64 bytenr, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add) -{ - int ret; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, path, bytenr, parent, - root_objectid); - } else { - ret = insert_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset, - refs_to_add); - } - return ret; -} - static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data, int *last_ref) + int refs_to_drop, int is_data) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); - if (iref) { - update_inline_extent_backref(path, iref, -refs_to_drop, NULL, - last_ref); - } else if (is_data) { - ret = remove_extent_data_ref(trans, path, refs_to_drop, - last_ref); - } else { - *last_ref = 1; - ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); - } + if (iref) + update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + else if (is_data) + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + else + ret = btrfs_del_item(trans, root, path); return ret; } @@ -1280,7 +1245,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (size) { ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += size; else if (ret != -EOPNOTSUPP) @@ -1297,13 +1262,53 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (bytes_left) { ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += bytes_left; } return ret; } +static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) +{ + struct btrfs_device *dev = stripe->dev; + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + u64 phys = stripe->physical; + u64 len = stripe->length; + u64 discarded = 0; + int ret = 0; + + /* Zone reset on a zoned filesystem */ + if (btrfs_can_zone_reset(dev, phys, len)) { + u64 src_disc; + + ret = btrfs_reset_device_zone(dev, phys, len, &discarded); + if (ret) + goto out; + + if (!btrfs_dev_replace_is_ongoing(dev_replace) || + dev != dev_replace->srcdev) + goto out; + + src_disc = discarded; + + /* Send to replace target as well */ + ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, + &discarded); + discarded += src_disc; + } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { + ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); + } else { + ret = 0; + *bytes = 0; + } + +out: + *bytes = discarded; + return ret; +} + int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes) { @@ -1311,80 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 discarded_bytes = 0; u64 end = bytenr + num_bytes; u64 cur = bytenr; - struct btrfs_bio *bbio = NULL; - /* - * Avoid races with device replace and make sure our bbio has devices - * associated to its stripes that don't go away while we are discarding. + * Avoid races with device replace and make sure the devices in the + * stripes don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); while (cur < end) { - struct btrfs_bio_stripe *stripe; + struct btrfs_discard_stripe *stripes; + unsigned int num_stripes; int i; num_bytes = end - cur; - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, - &num_bytes, &bbio, 0); - /* - * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or - * -EOPNOTSUPP. For any such error, @num_bytes is not updated, - * thus we can't continue anyway. - */ - if (ret < 0) - goto out; + stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); + if (IS_ERR(stripes)) { + ret = PTR_ERR(stripes); + if (ret == -EOPNOTSUPP) + ret = 0; + break; + } - stripe = bbio->stripes; - for (i = 0; i < bbio->num_stripes; i++, stripe++) { + for (i = 0; i < num_stripes; i++) { + struct btrfs_discard_stripe *stripe = stripes + i; u64 bytes; - struct request_queue *req_q; if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } - req_q = bdev_get_queue(stripe->dev->bdev); - if (!blk_queue_discard(req_q)) + + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, + &stripe->dev->dev_state)) continue; - ret = btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - stripe->length, - &bytes); - if (!ret) { - discarded_bytes += bytes; - } else if (ret != -EOPNOTSUPP) { + ret = do_discard_extent(stripe, &bytes); + if (ret) { /* - * Logic errors or -ENOMEM, or -EIO, but - * unlikely to happen. - * - * And since there are two loops, explicitly - * go to out to avoid confusion. + * Keep going if discard is not supported by the + * device. */ - btrfs_put_bbio(bbio); - goto out; + if (ret != -EOPNOTSUPP) + break; + ret = 0; + } else { + discarded_bytes += bytes; } - - /* - * Just in case we get back EOPNOTSUPP for some reason, - * just ignore the return value so we don't screw up - * people calling discard_extent. - */ - ret = 0; } - btrfs_put_bbio(bbio); + kfree(stripes); + if (ret) + break; cur += num_bytes; } -out: btrfs_bio_counter_dec(fs_info); - if (actual_bytes) *actual_bytes = discarded_bytes; - - - if (ret == -EOPNOTSUPP) - ret = 0; return ret; } @@ -1393,32 +1378,29 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - int old_ref_mod, new_ref_mod; int ret; ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) - ret = btrfs_add_delayed_tree_ref(trans, generic_ref, - NULL, &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); else - ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0); btrfs_ref_tree_mod(fs_info, generic_ref); - if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) - sub_pinned_bytes(fs_info, generic_ref); - return ret; } /* * __btrfs_inc_extent_ref - insert backreference for a given extent * + * The counterpart is in __btrfs_free_extent(), with examples and more details + * how it works. + * * @trans: Handle of transaction * * @node: The delayed ref node used to get the bytenr/length for @@ -1429,7 +1411,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * bytenr of the parent block. Since new extents are always * created with indirect references, this will only be the case * when relocating a shared extent. In that case, root_objectid - * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must + * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must * be 0 * * @root_objectid: The id of the root where this modification has originated, @@ -1469,8 +1451,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = READA_FORWARD; - path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, parent, root_objectid, owner, @@ -1494,11 +1474,16 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->reada = READA_FORWARD; - path->leave_spinning = 1; /* now insert the actual backref */ - ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid, - owner, offset, refs_to_add); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, path, bytenr, parent, + root_objectid); + } else { + ret = insert_extent_data_ref(trans, path, bytenr, parent, + root_objectid, owner, offset, + refs_to_add); + } if (ret) btrfs_abort_transaction(trans, ret); out: @@ -1574,6 +1559,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; struct btrfs_extent_item *ei; @@ -1581,12 +1567,12 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; - int metadata = !extent_op->is_data; + int metadata = 1; - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; - if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) metadata = 0; path = btrfs_alloc_path(); @@ -1603,10 +1589,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, key.offset = head->num_bytes; } + root = btrfs_extent_root(fs_info, key.objectid); again: - path->reada = READA_FORWARD; - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { err = ret; goto out; @@ -1638,7 +1623,7 @@ again: } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { err = -EINVAL; @@ -1703,10 +1688,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, { int ret = 0; - if (trans->aborted) { + if (TRANS_ABORTED(trans)) { if (insert_reserved) - btrfs_pin_extent(trans->fs_info, node->bytenr, - node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); return 0; } @@ -1721,8 +1705,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, else BUG(); if (ret && insert_reserved) - btrfs_pin_extent(trans->fs_info, node->bytenr, - node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); return ret; } @@ -1798,34 +1781,15 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, { int nr_items = 1; /* Dropping this ref head update. */ - if (head->total_ref_mod < 0) { - struct btrfs_space_info *space_info; - u64 flags; - - if (head->is_data) - flags = BTRFS_BLOCK_GROUP_DATA; - else if (head->is_system) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -head->num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - - /* - * We had csum deletions accounted for in our delayed refs rsv, - * we need to drop the csum leaves for this update from our - * delayed_refs_rsv. - */ - if (head->is_data) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= head->num_bytes; - spin_unlock(&delayed_refs->lock); - nr_items += btrfs_csum_bytes_to_leaves(fs_info, - head->num_bytes); - } + /* + * We had csum deletions accounted for in our delayed refs rsv, we need + * to drop the csum leaves for this update from our delayed_refs_rsv. + */ + if (head->total_ref_mod < 0 && head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); } btrfs_delayed_refs_rsv_release(fs_info, nr_items); @@ -1867,11 +1831,13 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); if (head->must_insert_reserved) { - btrfs_pin_extent(fs_info, head->bytenr, - head->num_bytes, 1); + btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info->csum_root, - head->bytenr, head->num_bytes); + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(fs_info, head->bytenr); + ret = btrfs_del_csums(trans, csum_root, head->bytenr, + head->num_bytes); } } @@ -1880,7 +1846,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); btrfs_put_delayed_ref_head(head); - return 0; + return ret; } static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( @@ -2135,41 +2101,6 @@ static u64 find_middle(struct rb_root *root) } #endif -static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) -{ - u64 num_bytes; - - num_bytes = heads * (sizeof(struct btrfs_extent_item) + - sizeof(struct btrfs_extent_inline_ref)); - if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) - num_bytes += heads * sizeof(struct btrfs_tree_block_info); - - /* - * We don't ever fill up leaves all the way so multiply by 2 just to be - * closer to what we're really going to want to use. - */ - return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); -} - -/* - * Takes the number of bytes to be csumm'ed and figures out how many leaves it - * would require to store the csums for that many bytes. - */ -u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) -{ - u64 csum_size; - u64 num_csums_per_leaf; - u64 num_csums; - - csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); - num_csums_per_leaf = div64_u64(csum_size, - (u64)btrfs_super_csum_size(fs_info->super_copy)); - num_csums = div64_u64(csum_bytes, fs_info->sectorsize); - num_csums += num_csums_per_leaf - 1; - num_csums = div64_u64(num_csums, num_csums_per_leaf); - return num_csums; -} - /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2191,7 +2122,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, int run_all = count == (unsigned long)-1; /* We'll clean this up in btrfs_cleanup_transaction */ - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) @@ -2199,7 +2130,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; if (count == 0) - count = atomic_read(&delayed_refs->num_entries) * 2; + count = delayed_refs->num_heads_ready; again: #ifdef SCRAMBLE_DELAYED_REFS @@ -2238,8 +2169,8 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 flags, - int level, int is_data) + struct extent_buffer *eb, u64 flags, + int level) { struct btrfs_delayed_extent_op *extent_op; int ret; @@ -2251,10 +2182,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; - extent_op->is_data = is_data ? true : false; extent_op->level = level; - ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; @@ -2290,6 +2220,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { + if (path->nowait) { + spin_unlock(&delayed_refs->lock); + btrfs_put_transaction(cur_trans); + return -EAGAIN; + } + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); @@ -2342,10 +2278,11 @@ static noinline int check_delayed_ref(struct btrfs_root *root, static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 objectid, u64 offset, u64 bytenr, + bool strict) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; struct btrfs_extent_data_ref *ref; struct btrfs_extent_inline_ref *iref; @@ -2376,7 +2313,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, goto out; ret = 1; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); + item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); /* If extent item has more than 1 inline ref then it's shared */ @@ -2384,9 +2321,13 @@ static noinline int check_committed_ref(struct btrfs_root *root, btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) goto out; - /* If extent created before last snapshot => it's definitely shared */ - if (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item)) + /* + * If extent created before last snapshot => it's shared unless the + * snapshot has been deleted. Use the heuristic if strict is false. + */ + if (!strict && + (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item))) goto out; iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2411,18 +2352,13 @@ out: } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr) + u64 bytenr, bool strict, struct btrfs_path *path) { - struct btrfs_path *path; int ret; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - do { ret = check_committed_ref(root, path, objectid, - offset, bytenr); + offset, bytenr, strict); if (ret && ret != -ENOENT) goto out; @@ -2430,8 +2366,8 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, } while (ret == -EAGAIN); out: - btrfs_free_path(path); - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + btrfs_release_path(path); + if (btrfs_is_data_reloc_root(root)) WARN_ON(ret > 0); return ret; } @@ -2463,7 +2399,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0) return 0; if (full_backref) @@ -2493,10 +2429,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); btrfs_init_generic_ref(&generic_ref, action, bytenr, num_bytes, parent); - generic_ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, - key.offset); - generic_ref.skip_qgroup = for_reloc; + key.offset, root->root_key.objectid, + for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &generic_ref); else @@ -2508,9 +2443,8 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = fs_info->nodesize; btrfs_init_generic_ref(&generic_ref, action, bytenr, num_bytes, parent); - generic_ref.real_root = root->root_key.objectid; - btrfs_init_tree_ref(&generic_ref, level - 1, ref_root); - generic_ref.skip_qgroup = for_reloc; + btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, + root->root_key.objectid, for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &generic_ref); else @@ -2536,19 +2470,6 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } -int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct btrfs_block_group *block_group; - int readonly = 0; - - block_group = btrfs_lookup_block_group(fs_info, bytenr); - if (!block_group || block_group->ro) - readonly = 1; - if (block_group) - btrfs_put_block_group(block_group); - return readonly; -} - static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -2566,29 +2487,27 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) return ret; } -static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) +static u64 first_logical_byte(struct btrfs_fs_info *fs_info) { - struct btrfs_block_group *cache; - u64 bytenr; + struct rb_node *leftmost; + u64 bytenr = 0; - spin_lock(&fs_info->block_group_cache_lock); - bytenr = fs_info->first_logical_byte; - spin_unlock(&fs_info->block_group_cache_lock); + read_lock(&fs_info->block_group_cache_lock); + /* Get the block group with the lowest logical start address. */ + leftmost = rb_first_cached(&fs_info->block_group_cache_tree); + if (leftmost) { + struct btrfs_block_group *bg; - if (bytenr < (u64)-1) - return bytenr; - - cache = btrfs_lookup_first_block_group(fs_info, search_start); - if (!cache) - return 0; - - bytenr = cache->start; - btrfs_put_block_group(cache); + bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); + bytenr = bg->start; + } + read_unlock(&fs_info->block_group_cache_lock); return bytenr; } -static int pin_down_extent(struct btrfs_block_group *cache, +static int pin_down_extent(struct btrfs_trans_handle *trans, + struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -2605,24 +2524,20 @@ static int pin_down_extent(struct btrfs_block_group *cache, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, - num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); - set_extent_dirty(fs_info->pinned_extents, bytenr, + set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; } -int btrfs_pin_extent(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_block_group *cache; - ASSERT(fs_info->running_transaction); - - cache = btrfs_lookup_block_group(fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(cache, bytenr, num_bytes, reserved); + pin_down_extent(trans, cache, bytenr, num_bytes, reserved); btrfs_put_block_group(cache); return 0; @@ -2631,28 +2546,29 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, /* * this function must be called within transaction */ -int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_block_group *cache; int ret; - cache = btrfs_lookup_block_group(fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); if (!cache) return -EINVAL; /* - * pull in the free space cache (if any) so that our pin - * removes the free space from the cache. We have load_only set - * to one because the slow code to read in the free extents does check - * the pinned extents. + * Fully cache the free space first so that our pin removes the free space + * from the cache. */ - btrfs_cache_block_group(cache, 1); + ret = btrfs_cache_block_group(cache, true); + if (ret) + goto out; - pin_down_extent(cache, bytenr, num_bytes, 0); + pin_down_extent(trans, cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, bytenr, num_bytes); +out: btrfs_put_block_group(cache); return ret; } @@ -2662,45 +2578,17 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, { int ret; struct btrfs_block_group *block_group; - struct btrfs_caching_control *caching_ctl; block_group = btrfs_lookup_block_group(fs_info, start); if (!block_group) return -EINVAL; - btrfs_cache_block_group(block_group, 0); - caching_ctl = btrfs_get_caching_control(block_group); - - if (!caching_ctl) { - /* Logic error */ - BUG_ON(!btrfs_block_group_done(block_group)); - ret = btrfs_remove_free_space(block_group, start, num_bytes); - } else { - mutex_lock(&caching_ctl->mutex); - - if (start >= caching_ctl->progress) { - ret = btrfs_add_excluded_extent(fs_info, start, - num_bytes); - } else if (start + num_bytes <= caching_ctl->progress) { - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - } else { - num_bytes = caching_ctl->progress - start; - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - if (ret) - goto out_lock; + ret = btrfs_cache_block_group(block_group, true); + if (ret) + goto out; - num_bytes = (start + num_bytes) - - caching_ctl->progress; - start = caching_ctl->progress; - ret = btrfs_add_excluded_extent(fs_info, start, - num_bytes); - } -out_lock: - mutex_unlock(&caching_ctl->mutex); - btrfs_put_caching_control(caching_ctl); - } + ret = btrfs_remove_free_space(block_group, start, num_bytes); +out: btrfs_put_block_group(block_group); return ret; } @@ -2743,36 +2631,6 @@ btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) atomic_inc(&bg->reservations); } -void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) -{ - struct btrfs_caching_control *next; - struct btrfs_caching_control *caching_ctl; - struct btrfs_block_group *cache; - - down_write(&fs_info->commit_root_sem); - - list_for_each_entry_safe(caching_ctl, next, - &fs_info->caching_block_groups, list) { - cache = caching_ctl->block_group; - if (btrfs_block_group_done(cache)) { - cache->last_byte_to_unpin = (u64)-1; - list_del_init(&caching_ctl->list); - btrfs_put_caching_control(caching_ctl); - } else { - cache->last_byte_to_unpin = caching_ctl->progress; - } - } - - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - fs_info->pinned_extents = &fs_info->freed_extents[1]; - else - fs_info->pinned_extents = &fs_info->freed_extents[0]; - - up_write(&fs_info->commit_root_sem); - - btrfs_update_global_block_rsv(fs_info); -} - /* * Returns the free cluster for the given space info and sets empty_cluster to * what it should be based on the mount options. @@ -2834,11 +2692,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len = cache->start + cache->length - start; len = min(len, end + 1 - start); - if (start < cache->last_byte_to_unpin) { - len = min(len, cache->last_byte_to_unpin - start); - if (return_free_space) - btrfs_add_free_space(cache, start, len); - } + if (return_free_space) + btrfs_add_free_space(cache, start, len); start += len; total_unpinned += len; @@ -2862,21 +2717,22 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, cache->pinned -= len; btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); space_info->max_extent_size = 0; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; + } else if (btrfs_is_zoned(fs_info)) { + /* Need reset before reusing in a zoned block group */ + space_info->bytes_zone_unusable += len; + readonly = true; } spin_unlock(&cache->lock); if (!readonly && return_free_space && global_rsv->space_info == space_info) { - u64 to_add = len; - spin_lock(&global_rsv->lock); if (!global_rsv->full) { - to_add = min(len, global_rsv->size - - global_rsv->reserved); + u64 to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; btrfs_space_info_update_bytes_may_use(fs_info, space_info, to_add); @@ -2885,11 +2741,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len -= to_add; } spin_unlock(&global_rsv->lock); - /* Add to any tickets we may have */ - if (len) - btrfs_try_granting_tickets(fs_info, - space_info); } + /* Add to any tickets we may have */ + if (!readonly && return_free_space && len) + btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } @@ -2908,12 +2763,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) u64 end; int ret; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - unpin = &fs_info->freed_extents[1]; - else - unpin = &fs_info->freed_extents[0]; + unpin = &trans->transaction->pinned_extents; - while (!trans->aborted) { + while (!TRANS_ABORTED(trans)) { struct extent_state *cached_state = NULL; mutex_lock(&fs_info->unused_bg_unpin_mutex); @@ -2950,14 +2802,14 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) u64 trimmed = 0; ret = -EROFS; - if (!trans->aborted) + if (!TRANS_ABORTED(trans)) ret = btrfs_discard_extent(fs_info, block_group->start, block_group->length, &trimmed); list_del_init(&block_group->bg_list); - btrfs_put_block_group_trimming(block_group); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); if (ret) { @@ -2971,6 +2823,94 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +static int do_free_extent_accounting(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, bool is_data) +{ + int ret; + + if (is_data) { + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(trans->fs_info, bytenr); + ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + ret = add_to_free_space_tree(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); + if (ret) + btrfs_abort_transaction(trans, ret); + + return ret; +} + +/* + * Drop one or more refs of @node. + * + * 1. Locate the extent refs. + * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. + * Locate it, then reduce the refs number or remove the ref line completely. + * + * 2. Update the refs count in EXTENT/METADATA_ITEM + * + * Inline backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 2 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * extent data backref root FS_TREE objectid 257 offset 0 count 1 + * + * This function gets called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 257 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get some like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 1 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * + * Keyed backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 754 gen 6 flags DATA + * [...] + * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28 + * extent data backref root FS_TREE objectid 866 offset 0 count 1 + * + * This function get called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 866 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get some like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 753 gen 6 flags DATA + * + * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. + */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -2980,7 +2920,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info = trans->fs_info; struct btrfs_key key; struct btrfs_path *path; - struct btrfs_root *extent_root = info->extent_root; + struct btrfs_root *extent_root; struct extent_buffer *leaf; struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -2993,18 +2933,25 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; - int last_ref = 0; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); + extent_root = btrfs_extent_root(info, bytenr); + ASSERT(extent_root); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = READA_FORWARD; - path->leave_spinning = 1; - is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - BUG_ON(!is_data && refs_to_drop != 1); + + if (!is_data && refs_to_drop != 1) { + btrfs_crit(info, +"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", + node->bytenr, refs_to_drop); + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } if (is_data) skinny_metadata = false; @@ -3013,6 +2960,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, parent, root_objectid, owner_objectid, owner_offset); if (ret == 0) { + /* + * Either the inline backref or the SHARED_DATA_REF/ + * SHARED_BLOCK_REF is found + * + * Here is a quick path to locate EXTENT/METADATA_ITEM. + * It's possible the EXTENT/METADATA_ITEM is near current slot. + */ extent_slot = path->slots[0]; while (extent_slot >= 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -3029,23 +2983,30 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, found_extent = 1; break; } + + /* Quick path didn't find the EXTEMT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; } if (!found_extent) { - BUG_ON(iref); - ret = remove_extent_backref(trans, path, NULL, - refs_to_drop, - is_data, &last_ref); + if (iref) { + btrfs_crit(info, +"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } + /* Must be SHARED_* item, remove the backref first */ + ret = remove_extent_backref(trans, extent_root, path, + NULL, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); - path->leave_spinning = 1; + /* Slow path to locate EXTENT/METADATA_ITEM */ key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; @@ -3108,7 +3069,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); + item_size = btrfs_item_size(leaf, extent_slot); if (unlikely(item_size < sizeof(*ei))) { ret = -EINVAL; btrfs_print_v0_err(info); @@ -3120,19 +3081,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + if (item_size < sizeof(*ei) + sizeof(*bi)) { + btrfs_crit(info, +"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu", + key.objectid, key.type, key.offset, + owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); } refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { - btrfs_err(info, - "trying to drop %d refs but we only have %Lu for bytenr %Lu", + btrfs_crit(info, + "trying to drop %d refs but we only have %llu for bytenr %llu", refs_to_drop, refs, bytenr); - ret = -EINVAL; - btrfs_abort_transaction(trans, ret); - goto out; + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; } refs -= refs_to_drop; @@ -3144,34 +3112,63 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - BUG_ON(!found_extent); + if (!found_extent) { + btrfs_crit(info, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { btrfs_set_extent_refs(leaf, ei, refs); btrfs_mark_buffer_dirty(leaf); } if (found_extent) { - ret = remove_extent_backref(trans, path, iref, - refs_to_drop, is_data, - &last_ref); + ret = remove_extent_backref(trans, extent_root, path, + iref, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } } else { + /* In this branch refs == 1 */ if (found_extent) { - BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(path, iref)); + if (is_data && refs_to_drop != + extent_data_ref_count(path, iref)) { + btrfs_crit(info, + "invalid refs_to_drop, current refs %u refs_to_drop %u", + extent_data_ref_count(path, iref), + refs_to_drop); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } if (iref) { - BUG_ON(path->slots[0] != extent_slot); + if (path->slots[0] != extent_slot) { + btrfs_crit(info, +"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", + key.objectid, key.type, + key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { - BUG_ON(path->slots[0] != extent_slot + 1); + /* + * No inline ref, we must be at SHARED_* item, + * And it's single ref, it must be: + * | extent_slot ||extent_slot + 1| + * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] + */ + if (path->slots[0] != extent_slot + 1) { + btrfs_crit(info, + "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } path->slots[0] = extent_slot; num_to_del = 2; } } - last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { @@ -3180,32 +3177,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - if (is_data) { - ret = btrfs_del_csums(trans, info->csum_root, bytenr, - num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - } - - ret = add_to_free_space_tree(trans, bytenr, num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - - ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); } btrfs_release_path(path); out: btrfs_free_path(path); return ret; +err_dump: + /* + * Leaf dump can take up a lot of log buffer, so we only do full leaf + * dump for debug build. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", + path->slots[0], extent_slot); + btrfs_print_leaf(path->nodes[0]); + } + + btrfs_free_path(path); + return -EUCLEAN; } /* @@ -3264,44 +3255,67 @@ out_delayed_unlock: } void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + u64 root_id, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_ref generic_ref = { 0 }; - int pin = 1; int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, buf->start, buf->len, parent); btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root->root_key.objectid); - - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - int old_ref_mod, new_ref_mod; + root_id, 0, false); + if (root_id != BTRFS_TREE_LOG_OBJECTID) { btrfs_ref_tree_mod(fs_info, &generic_ref); - ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ - pin = old_ref_mod >= 0 && new_ref_mod < 0; } if (last_ref && btrfs_header_generation(buf) == trans->transid) { struct btrfs_block_group *cache; + bool must_pin = false; - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + if (root_id != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); - if (!ret) + if (!ret) { + btrfs_redirty_list_add(trans->transaction, buf); goto out; + } } - pin = 0; cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(cache, buf->start, buf->len, 1); + pin_down_extent(trans, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); + goto out; + } + + /* + * If there are tree mod log users we may have recorded mod log + * operations for this node. If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. + * + * We are safe from races here because at this point no other + * node or root points to this extent buffer, so if after this + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. + */ + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + must_pin = true; + + if (must_pin || btrfs_is_zoned(fs_info)) { + btrfs_redirty_list_add(trans->transaction, buf); + pin_down_extent(trans, cache, buf->start, buf->len, 1); btrfs_put_block_group(cache); goto out; } @@ -3314,9 +3328,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); } out: - if (pin) - add_pinned_bytes(fs_info, &generic_ref); - if (last_ref) { /* * Deleting the buffer, clear the corrupt flag since it doesn't @@ -3330,7 +3341,6 @@ out: int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) @@ -3341,30 +3351,24 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree, just update pinning info and exit early. */ if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { + ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ - btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); - old_ref_mod = new_ref_mod = 0; + btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { - ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); } else { - ret = btrfs_add_delayed_data_ref(trans, ref, 0, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_data_ref(trans, ref, 0); } if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) + ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) btrfs_ref_tree_mod(fs_info, ref); - if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) - add_pinned_bytes(fs_info, ref); - return ret; } @@ -3395,6 +3399,7 @@ static struct btrfs_block_group *btrfs_lock_cluster( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, int delalloc) + __acquires(&cluster->refill_lock) { struct btrfs_block_group *used_bg = NULL; @@ -3438,13 +3443,20 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, +}; + /* * Structure used internally for find_free_extent() function. Wraps needed * parameters. */ struct find_free_extent_ctl { /* Basic allocation info */ + u64 ram_bytes; u64 num_bytes; + u64 min_alloc_size; u64 empty_size; u64 flags; int delalloc; @@ -3454,10 +3466,18 @@ struct find_free_extent_ctl { /* For clustered allocation */ u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; bool have_caching_bg; bool orig_have_caching_bg; + /* Allocation is called for tree-log */ + bool for_treelog; + + /* Allocation is called for data relocation */ + bool for_data_reloc; + /* RAID index, converted from flags */ int index; @@ -3489,6 +3509,12 @@ struct find_free_extent_ctl { /* Found result */ u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; }; @@ -3501,11 +3527,11 @@ struct find_free_extent_ctl { * Return 0 means we have found a location and set ffe_ctl->found_offset. */ static int find_free_extent_clustered(struct btrfs_block_group *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group **cluster_bg_ret) + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **cluster_bg_ret) { struct btrfs_block_group *cluster_bg; + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 aligned_cluster; u64 offset; int ret; @@ -3605,9 +3631,9 @@ refill_cluster: * Return -EAGAIN to inform caller that we need to re-search this block group */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl) + struct find_free_extent_ctl *ffe_ctl) { + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 offset; /* @@ -3663,40 +3689,382 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, return 0; } +static int do_allocation_clustered(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + int ret; + + /* We want to try and use the cluster allocator, so lets look there */ + if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { + ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); + if (ret >= 0 || ret == -EAGAIN) + return ret; + /* ret == -ENOENT case falls through */ + } + + return find_free_extent_unclustered(block_group, ffe_ctl); +} + +/* + * Tree-log block group locking + * ============================ + * + * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which + * indicates the starting address of a block group, which is reserved only + * for tree-log metadata. + * + * Lock nesting + * ============ + * + * space_info::lock + * block_group::lock + * fs_info::treelog_bg_lock + */ + +/* + * Simple allocator for sequential-only block group. It only allows sequential + * allocation. No need to play with trees. This function also reserves the + * bytes as in btrfs_add_reserved_bytes. + */ +static int do_allocation_zoned(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *space_info = block_group->space_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 start = block_group->start; + u64 num_bytes = ffe_ctl->num_bytes; + u64 avail; + u64 bytenr = block_group->start; + u64 log_bytenr; + u64 data_reloc_bytenr; + int ret = 0; + bool skip = false; + + ASSERT(btrfs_is_zoned(block_group->fs_info)); + + /* + * Do not allow non-tree-log blocks in the dedicated tree-log block + * group, and vice versa. + */ + spin_lock(&fs_info->treelog_bg_lock); + log_bytenr = fs_info->treelog_bg; + if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || + (!ffe_ctl->for_treelog && bytenr == log_bytenr))) + skip = true; + spin_unlock(&fs_info->treelog_bg_lock); + if (skip) + return 1; + + /* + * Do not allow non-relocation blocks in the dedicated relocation block + * group, and vice versa. + */ + spin_lock(&fs_info->relocation_bg_lock); + data_reloc_bytenr = fs_info->data_reloc_bg; + if (data_reloc_bytenr && + ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || + (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) + skip = true; + spin_unlock(&fs_info->relocation_bg_lock); + if (skip) + return 1; + + /* Check RO and no space case before trying to activate it */ + spin_lock(&block_group->lock); + if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { + ret = 1; + /* + * May need to clear fs_info->{treelog,data_reloc}_bg. + * Return the error after taking the locks. + */ + } + spin_unlock(&block_group->lock); + + if (!ret && !btrfs_zone_activate(block_group)) { + ret = 1; + /* + * May need to clear fs_info->{treelog,data_reloc}_bg. + * Return the error after taking the locks. + */ + } + + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + spin_lock(&fs_info->treelog_bg_lock); + spin_lock(&fs_info->relocation_bg_lock); + + if (ret) + goto out; + + ASSERT(!ffe_ctl->for_treelog || + block_group->start == fs_info->treelog_bg || + fs_info->treelog_bg == 0); + ASSERT(!ffe_ctl->for_data_reloc || + block_group->start == fs_info->data_reloc_bg || + fs_info->data_reloc_bg == 0); + + if (block_group->ro || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + ret = 1; + goto out; + } + + /* + * Do not allow currently using block group to be tree-log dedicated + * block group. + */ + if (ffe_ctl->for_treelog && !fs_info->treelog_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + + /* + * Do not allow currently used block group to be the data relocation + * dedicated block group. + */ + if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + + WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity); + avail = block_group->zone_capacity - block_group->alloc_offset; + if (avail < num_bytes) { + if (ffe_ctl->max_extent_size < avail) { + /* + * With sequential allocator, free space is always + * contiguous + */ + ffe_ctl->max_extent_size = avail; + ffe_ctl->total_free_space = avail; + } + ret = 1; + goto out; + } + + if (ffe_ctl->for_treelog && !fs_info->treelog_bg) + fs_info->treelog_bg = block_group->start; + + if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) + fs_info->data_reloc_bg = block_group->start; + + ffe_ctl->found_offset = start + block_group->alloc_offset; + block_group->alloc_offset += num_bytes; + spin_lock(&ctl->tree_lock); + ctl->free_space -= num_bytes; + spin_unlock(&ctl->tree_lock); + + /* + * We do not check if found_offset is aligned to stripesize. The + * address is anyway rewritten when using zone append writing. + */ + + ffe_ctl->search_start = ffe_ctl->found_offset; + +out: + if (ret && ffe_ctl->for_treelog) + fs_info->treelog_bg = 0; + if (ret && ffe_ctl->for_data_reloc && + fs_info->data_reloc_bg == block_group->start) { + /* + * Do not allow further allocations from this block group. + * Compared to increasing the ->ro, setting the + * ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + */ + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); + fs_info->data_reloc_bg = 0; + } + spin_unlock(&fs_info->relocation_bg_lock); + spin_unlock(&fs_info->treelog_bg_lock); + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + return ret; +} + +static int do_allocation(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return do_allocation_clustered(block_group, ffe_ctl, bg_ret); + case BTRFS_EXTENT_ALLOC_ZONED: + return do_allocation_zoned(block_group, ffe_ctl, bg_ret); + default: + BUG(); + } +} + +static void release_block_group(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + int delalloc) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + ffe_ctl->retry_clustered = false; + ffe_ctl->retry_unclustered = false; + break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; + default: + BUG(); + } + + BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != + ffe_ctl->index); + btrfs_release_block_group(block_group, delalloc); +} + +static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + + if (!ffe_ctl->use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } +} + +static void found_extent(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + found_extent_clustered(ffe_ctl, ins); + break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; + default: + BUG(); + } +} + +static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + /* If we can activate new zone, just allocate a chunk and use it */ + if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) + return 0; + + /* + * We already reached the max active zones. Try to finish one block + * group to make a room for a new block group. This is only possible + * for a data block group because btrfs_zone_finish() may need to wait + * for a running transaction which can cause a deadlock for metadata + * allocation. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + int ret = btrfs_zone_finish_one_bg(fs_info); + + if (ret == 1) + return 0; + else if (ret < 0) + return ret; + } + + /* + * If we have enough free space left in an already active block group + * and we can't activate any other zone now, do not allow allocating a + * new chunk and let find_free_extent() retry with a smaller size. + */ + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) + return -ENOSPC; + + /* + * Even min_alloc_size is not left in any block groups. Since we cannot + * activate a new block group, allocating it may not help. Let's tell a + * caller to try again and hope it progress something by writing some + * parts of the region. That is only possible for data block groups, + * where a part of the region can be written. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) + return -EAGAIN; + + /* + * We cannot activate a new block group and no enough space left in any + * block groups. So, allocating a new block group may not help. But, + * there is nothing to do anyway, so let's go with it. + */ + return 0; +} + +static int can_allocate_chunk(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return 0; + case BTRFS_EXTENT_ALLOC_ZONED: + return can_allocate_chunk_zoned(fs_info, ffe_ctl); + default: + BUG(); + } +} + +static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + /* + * If we can't allocate a new chunk we've already looped through + * at least once, move on to the NO_EMPTY_SIZE case. + */ + ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + return 0; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Give up here */ + return -ENOSPC; + default: + BUG(); + } +} + /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. * Return <0 means we failed to locate any free extent. */ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - struct btrfs_free_cluster *last_ptr, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, - int full_search, bool use_cluster) + bool full_search) { - struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *root = fs_info->chunk_root; int ret; if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) ffe_ctl->orig_have_caching_bg = true; - if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && - ffe_ctl->have_caching_bg) - return 1; - - if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) - return 1; - if (ins->objectid) { - if (!use_cluster && last_ptr) { - spin_lock(&last_ptr->lock); - last_ptr->window_start = ins->objectid; - spin_unlock(&last_ptr->lock); - } + found_extent(ffe_ctl, ins); return 0; } + if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) + return 1; + + ffe_ctl->index++; + if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) + return 1; + /* * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along @@ -3725,6 +4093,11 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; int exist = 0; + /*Check if allocation policy allows to create a new chunk */ + ret = can_allocate_chunk(fs_info, ffe_ctl); + if (ret) + return ret; + trans = current->journal_info; if (trans) exist = 1; @@ -3737,18 +4110,12 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); - - /* - * If we can't allocate a new chunk we've already looped - * through at least once, move on to the NO_EMPTY_SIZE - * case. - */ - if (ret == -ENOSPC) - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ - if (ret < 0 && ret != -ENOSPC) + if (ret == -ENOSPC) + ret = chunk_allocation_failed(ffe_ctl); + else if (ret < 0) btrfs_abort_transaction(trans, ret); else ret = 0; @@ -3759,6 +4126,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) + return -ENOSPC; + /* * Don't loop again if we already have no empty_size and * no empty_cluster. @@ -3774,6 +4144,85 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, + struct btrfs_key *ins) +{ + /* + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. + */ + if (space_info->max_extent_size) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + ffe_ctl->num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + ffe_ctl->use_cluster = false; + } + spin_unlock(&space_info->lock); + } + + ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, + &ffe_ctl->empty_cluster); + if (ffe_ctl->last_ptr) { + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + ffe_ctl->hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. + */ + ffe_ctl->hint_byte = last_ptr->window_start; + ffe_ctl->use_cluster = false; + } + spin_unlock(&last_ptr->lock); + } + + return 0; +} + +static int prepare_allocation(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, + struct btrfs_key *ins) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return prepare_allocation_clustered(fs_info, ffe_ctl, + space_info, ins); + case BTRFS_EXTENT_ALLOC_ZONED: + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } + if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } + return 0; + default: + BUG(); + } +} + /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: @@ -3799,93 +4248,63 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, * |- Push harder to find free extents * |- If not found, re-iterate all block groups */ -static noinline int find_free_extent(struct btrfs_fs_info *fs_info, - u64 ram_bytes, u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, - u64 flags, int delalloc) +static noinline int find_free_extent(struct btrfs_root *root, + struct btrfs_key *ins, + struct find_free_extent_ctl *ffe_ctl) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; - struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group *block_group = NULL; - struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; - bool use_cluster = true; bool full_search = false; - WARN_ON(num_bytes < fs_info->sectorsize); + WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize); - ffe_ctl.num_bytes = num_bytes; - ffe_ctl.empty_size = empty_size; - ffe_ctl.flags = flags; - ffe_ctl.search_start = 0; - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; - ffe_ctl.delalloc = delalloc; - ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); - ffe_ctl.have_caching_bg = false; - ffe_ctl.orig_have_caching_bg = false; - ffe_ctl.found_offset = 0; + ffe_ctl->search_start = 0; + /* For clustered allocation */ + ffe_ctl->empty_cluster = 0; + ffe_ctl->last_ptr = NULL; + ffe_ctl->use_cluster = true; + ffe_ctl->have_caching_bg = false; + ffe_ctl->orig_have_caching_bg = false; + ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); + ffe_ctl->loop = 0; + /* For clustered allocation */ + ffe_ctl->retry_clustered = false; + ffe_ctl->retry_unclustered = false; + ffe_ctl->cached = 0; + ffe_ctl->max_extent_size = 0; + ffe_ctl->total_free_space = 0; + ffe_ctl->found_offset = 0; + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + + if (btrfs_is_zoned(fs_info)) + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(fs_info, num_bytes, empty_size, flags); + trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, + ffe_ctl->flags); - space_info = btrfs_find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { - btrfs_err(fs_info, "No space info for %llu", flags); + btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); return -ENOSPC; } - /* - * If our free space is heavily fragmented we may not be able to make - * big contiguous allocations, so instead of doing the expensive search - * for free space, simply return ENOSPC with our max_extent_size so we - * can go ahead and search for a more manageable chunk. - * - * If our max_extent_size is large enough for our allocation simply - * disable clustering since we will likely not be able to find enough - * space to create a cluster and induce latency trying. - */ - if (unlikely(space_info->max_extent_size)) { - spin_lock(&space_info->lock); - if (space_info->max_extent_size && - num_bytes > space_info->max_extent_size) { - ins->offset = space_info->max_extent_size; - spin_unlock(&space_info->lock); - return -ENOSPC; - } else if (space_info->max_extent_size) { - use_cluster = false; - } - spin_unlock(&space_info->lock); - } - - last_ptr = fetch_cluster_info(fs_info, space_info, - &ffe_ctl.empty_cluster); - if (last_ptr) { - spin_lock(&last_ptr->lock); - if (last_ptr->block_group) - hint_byte = last_ptr->window_start; - if (last_ptr->fragmented) { - /* - * We still set window_start so we can keep track of the - * last place we found an allocation to try and save - * some time. - */ - hint_byte = last_ptr->window_start; - use_cluster = false; - } - spin_unlock(&last_ptr->lock); - } + ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins); + if (ret < 0) + return ret; - ffe_ctl.search_start = max(ffe_ctl.search_start, - first_logical_byte(fs_info, 0)); - ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); - if (ffe_ctl.search_start == hint_byte) { + ffe_ctl->search_start = max(ffe_ctl->search_start, + first_logical_byte(fs_info)); + ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); + if (ffe_ctl->search_start == ffe_ctl->hint_byte) { block_group = btrfs_lookup_block_group(fs_info, - ffe_ctl.search_start); + ffe_ctl->search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -3893,7 +4312,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, * However if we are re-searching with an ideal block group * picked out then we don't care that the block group is cached. */ - if (block_group && block_group_bits(block_group, flags) && + if (block_group && block_group_bits(block_group, ffe_ctl->flags) && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -3907,9 +4326,10 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { - ffe_ctl.index = btrfs_bg_flags_to_raid_index( - block_group->flags); - btrfs_lock_block_group(block_group, delalloc); + ffe_ctl->index = btrfs_bg_flags_to_raid_index( + block_group->flags); + btrfs_lock_block_group(block_group, + ffe_ctl->delalloc); goto have_block_group; } } else if (block_group) { @@ -3917,26 +4337,33 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, } } search: - ffe_ctl.have_caching_bg = false; - if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || - ffe_ctl.index == 0) + ffe_ctl->have_caching_bg = false; + if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || + ffe_ctl->index == 0) full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, - &space_info->block_groups[ffe_ctl.index], list) { + &space_info->block_groups[ffe_ctl->index], list) { + struct btrfs_block_group *bg_ret; + /* If the block group is read-only, we can skip it entirely. */ - if (unlikely(block_group->ro)) + if (unlikely(block_group->ro)) { + if (ffe_ctl->for_treelog) + btrfs_clear_treelog_bg(block_group); + if (ffe_ctl->for_data_reloc) + btrfs_clear_data_reloc_bg(block_group); continue; + } - btrfs_grab_block_group(block_group, delalloc); - ffe_ctl.search_start = block_group->start; + btrfs_grab_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->search_start = block_group->start; /* * this can happen if we end up cycling through all the * raid types, but we want to make sure we only allocate * for the proper type. */ - if (!block_group_bits(block_group, flags)) { + if (!block_group_bits(block_group, ffe_ctl->flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID56_MASK | @@ -3947,7 +4374,7 @@ search: * doesn't provide them, bail. This does allow us to * fill raid0 from raid1. */ - if ((flags & extra) && !(block_group->flags & extra)) + if ((ffe_ctl->flags & extra) && !(block_group->flags & extra)) goto loop; /* @@ -3955,15 +4382,15 @@ search: * It's possible that we have MIXED_GROUP flag but no * block group is mixed. Just skip such block group. */ - btrfs_release_block_group(block_group, delalloc); + btrfs_release_block_group(block_group, ffe_ctl->delalloc); continue; } have_block_group: - ffe_ctl.cached = btrfs_block_group_done(block_group); - if (unlikely(!ffe_ctl.cached)) { - ffe_ctl.have_caching_bg = true; - ret = btrfs_cache_block_group(block_group, 0); + ffe_ctl->cached = btrfs_block_group_done(block_group); + if (unlikely(!ffe_ctl->cached)) { + ffe_ctl->have_caching_bg = true; + ret = btrfs_cache_block_group(block_group, false); /* * If we get ENOMEM here or something else we want to @@ -3984,83 +4411,64 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; - /* - * Ok we want to try and use the cluster allocator, so - * lets look there - */ - if (last_ptr && use_cluster) { - struct btrfs_block_group *cluster_bg = NULL; - - ret = find_free_extent_clustered(block_group, last_ptr, - &ffe_ctl, &cluster_bg); - - if (ret == 0) { - if (cluster_bg && cluster_bg != block_group) { - btrfs_release_block_group(block_group, - delalloc); - block_group = cluster_bg; - } - goto checks; - } else if (ret == -EAGAIN) { - goto have_block_group; - } else if (ret > 0) { - goto loop; + bg_ret = NULL; + ret = do_allocation(block_group, ffe_ctl, &bg_ret); + if (ret == 0) { + if (bg_ret && bg_ret != block_group) { + btrfs_release_block_group(block_group, + ffe_ctl->delalloc); + block_group = bg_ret; } - /* ret == -ENOENT case falls through */ - } - - ret = find_free_extent_unclustered(block_group, last_ptr, - &ffe_ctl); - if (ret == -EAGAIN) + } else if (ret == -EAGAIN) { goto have_block_group; - else if (ret > 0) + } else if (ret > 0) { goto loop; - /* ret == 0 case falls through */ -checks: - ffe_ctl.search_start = round_up(ffe_ctl.found_offset, - fs_info->stripesize); + } + + /* Checks */ + ffe_ctl->search_start = round_up(ffe_ctl->found_offset, + fs_info->stripesize); /* move on to the next group */ - if (ffe_ctl.search_start + num_bytes > + if (ffe_ctl->search_start + ffe_ctl->num_bytes > block_group->start + block_group->length) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); + btrfs_add_free_space_unused(block_group, + ffe_ctl->found_offset, + ffe_ctl->num_bytes); goto loop; } - if (ffe_ctl.found_offset < ffe_ctl.search_start) - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - ffe_ctl.search_start - ffe_ctl.found_offset); + if (ffe_ctl->found_offset < ffe_ctl->search_start) + btrfs_add_free_space_unused(block_group, + ffe_ctl->found_offset, + ffe_ctl->search_start - ffe_ctl->found_offset); - ret = btrfs_add_reserved_bytes(block_group, ram_bytes, - num_bytes, delalloc); + ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, + ffe_ctl->num_bytes, + ffe_ctl->delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); + btrfs_add_free_space_unused(block_group, + ffe_ctl->found_offset, + ffe_ctl->num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ - ins->objectid = ffe_ctl.search_start; - ins->offset = num_bytes; + ins->objectid = ffe_ctl->search_start; + ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, - num_bytes); - btrfs_release_block_group(block_group, delalloc); + trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, + ffe_ctl->num_bytes); + btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; - BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != - ffe_ctl.index); - btrfs_release_block_group(block_group, delalloc); + release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); cond_resched(); } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, - full_search, use_cluster); + ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); if (ret > 0) goto search; @@ -4069,12 +4477,12 @@ loop: * Use ffe_ctl->total_free_space as fallback if we can't find * any contiguous hole. */ - if (!ffe_ctl.max_extent_size) - ffe_ctl.max_extent_size = ffe_ctl.total_free_space; + if (!ffe_ctl->max_extent_size) + ffe_ctl->max_extent_size = ffe_ctl->total_free_space; spin_lock(&space_info->lock); - space_info->max_extent_size = ffe_ctl.max_extent_size; + space_info->max_extent_size = ffe_ctl->max_extent_size; spin_unlock(&space_info->lock); - ins->offset = ffe_ctl.max_extent_size; + ins->offset = ffe_ctl->max_extent_size; } else if (ret == -ENOSPC) { ret = cache_block_group_error; } @@ -4132,15 +4540,28 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, struct btrfs_key *ins, int is_data, int delalloc) { struct btrfs_fs_info *fs_info = root->fs_info; + struct find_free_extent_ctl ffe_ctl = {}; bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; + bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, - hint_byte, ins, flags, delalloc); + + ffe_ctl.ram_bytes = ram_bytes; + ffe_ctl.num_bytes = num_bytes; + ffe_ctl.min_alloc_size = min_alloc_size; + ffe_ctl.empty_size = empty_size; + ffe_ctl.flags = flags; + ffe_ctl.delalloc = delalloc; + ffe_ctl.hint_byte = hint_byte; + ffe_ctl.for_treelog = for_treelog; + ffe_ctl.for_data_reloc = for_data_reloc; + + ret = find_free_extent(root, ins, &ffe_ctl); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); } else if (ret == -ENOSPC) { @@ -4158,8 +4579,8 @@ again: sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, - "allocation failed flags %llu, wanted %llu", - flags, num_bytes); + "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", + flags, num_bytes, for_treelog, for_data_reloc); if (sinfo) btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1); @@ -4189,28 +4610,53 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, + u64 len) { struct btrfs_block_group *cache; int ret = 0; - cache = btrfs_lookup_block_group(fs_info, start); + cache = btrfs_lookup_block_group(trans->fs_info, start); if (!cache) { - btrfs_err(fs_info, "unable to find block group for %llu", start); + btrfs_err(trans->fs_info, "unable to find block group for %llu", + start); return -ENOSPC; } - ret = pin_down_extent(cache, start, len, 1); + ret = pin_down_extent(trans, cache, start, len, 1); btrfs_put_block_group(cache); return ret; } +static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + + ret = remove_from_free_space_tree(trans, bytenr, num_bytes); + if (ret) + return ret; + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); + if (ret) { + ASSERT(!ret); + btrfs_err(fs_info, "update block group failed for %llu %llu", + bytenr, num_bytes); + return ret; + } + + trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); + return 0; +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_extent_inline_ref *iref; @@ -4230,9 +4676,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); + extent_root = btrfs_extent_root(fs_info, ins->objectid); + ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size); if (ret) { btrfs_free_path(path); return ret; @@ -4265,18 +4710,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - ins->objectid, ins->offset); - BUG(); - } - trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); - return ret; + return alloc_reserved_extent(trans, ins->objectid, ins->offset); } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, @@ -4284,6 +4718,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_key extent_key; @@ -4293,7 +4728,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 num_bytes; u64 flags = extent_op->flags_to_set; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); @@ -4303,21 +4737,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { extent_key.offset = ref->level; extent_key.type = BTRFS_METADATA_ITEM_KEY; - num_bytes = fs_info->nodesize; } else { extent_key.offset = node->num_bytes; extent_key.type = BTRFS_EXTENT_ITEM_KEY; size += sizeof(*block_info); - num_bytes = node->num_bytes; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - &extent_key, size); + extent_root = btrfs_extent_root(fs_info, extent_key.objectid); + ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key, + size); if (ret) { btrfs_free_path(path); return ret; @@ -4341,7 +4773,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); @@ -4354,22 +4785,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, extent_key.objectid, - num_bytes); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, extent_key.objectid, - fs_info->nodesize, 1); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - extent_key.objectid, extent_key.offset); - BUG(); - } - - trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, - fs_info->nodesize); - return ret; + return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -4378,17 +4794,16 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins) { struct btrfs_ref generic_ref = { 0 }; - int ret; BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, ins->objectid, ins->offset, 0); - btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); + btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, + offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); - ret = btrfs_add_delayed_data_ref(trans, &generic_ref, - ram_bytes, NULL, NULL); - return ret; + + return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); } /* @@ -4431,19 +4846,21 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, offset, ins, 1); if (ret) - btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1); + btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); btrfs_put_block_group(block_group); return ret; } static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, int level, u64 owner) + u64 bytenr, int level, u64 owner, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; + u64 lockdep_owner = owner; - buf = btrfs_find_create_tree_block(fs_info, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); if (IS_ERR(buf)) return buf; @@ -4460,12 +4877,35 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ERR_PTR(-EUCLEAN); } - btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); - btrfs_tree_lock(buf); + /* + * The reloc trees are just snapshots, so we need them to appear to be + * just like any other fs tree WRT lockdep. + * + * The exception however is in replace_path() in relocation, where we + * hold the lock on the original fs root and then search for the reloc + * root. At that point we need to make sure any reloc root buffers are + * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make + * lockdep happy. + */ + if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID && + !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) + lockdep_owner = BTRFS_FS_TREE_OBJECTID; + + /* btrfs_clean_tree_block() accesses generation field. */ + btrfs_set_header_generation(buf, trans->transid); + + /* + * This needs to stay, because we could allocate a freed block from an + * old tree into a new tree, so we need to make sure this new block is + * set to the appropriate level and owner. + */ + btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); + + __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); + clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); - btrfs_set_lock_blocking_write(buf); set_extent_buffer_uptodate(buf); memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); @@ -4493,7 +4933,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } - trans->dirty = true; /* this returns a buffer locked for blocking */ return buf; } @@ -4507,7 +4946,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; @@ -4523,7 +4963,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - level, root_objectid); + level, root_objectid, nest); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -4540,7 +4980,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, - root_objectid); + root_objectid, nest); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_free_reserved; @@ -4566,16 +5006,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_key = skinny_metadata ? false : true; extent_op->update_flags = true; - extent_op->is_data = false; extent_op->level = level; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, ins.objectid, ins.offset, parent); - generic_ref.real_root = root->root_key.objectid; - btrfs_init_tree_ref(&generic_ref, level, root_objectid); + btrfs_init_tree_ref(&generic_ref, level, root_objectid, + root->root_key.objectid, false); btrfs_ref_tree_mod(fs_info, &generic_ref); - ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, - extent_op, NULL, NULL); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); if (ret) goto out_free_delayed; } @@ -4584,6 +5022,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, out_free_delayed: btrfs_free_delayed_extent_op(extent_op); out_free_buf: + btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); @@ -4685,7 +5124,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - readahead_tree_block(fs_info, bytenr); + btrfs_readahead_node_child(eb, slot); nread++; } wc->reada_slot = slot; @@ -4750,9 +5189,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, eb->start, - eb->len, flag, - btrfs_header_level(eb), 0); + ret = btrfs_set_disk_extent_flags(trans, eb, flag, + btrfs_header_level(eb)); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -4845,16 +5283,13 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = find_extent_buffer(fs_info, bytenr); if (!next) { - next = btrfs_find_create_tree_block(fs_info, bytenr); + next = btrfs_find_create_tree_block(fs_info, bytenr, + root->root_key.objectid, level - 1); if (IS_ERR(next)) return PTR_ERR(next); - - btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, - level - 1); reada = 1; } btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], @@ -4905,8 +5340,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, generation, level - 1, - &first_key); + next = read_tree_block(fs_info, bytenr, root->root_key.objectid, + generation, level - 1, &first_key); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { @@ -4914,7 +5349,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, return -EIO; } btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); } level--; @@ -4926,7 +5360,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } path->nodes[level] = next; path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; wc->level = level; if (wc->level == 1) wc->reada_slot = 0; @@ -4994,7 +5428,8 @@ skip: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, fs_info->nodesize, parent); - btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid); + btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, + 0, false); ret = btrfs_free_extent(trans, &ref); if (ret) goto out_unlock; @@ -5054,8 +5489,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level]) { BUG_ON(level == 0); btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, @@ -5098,8 +5532,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level] && btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; } btrfs_clean_tree_block(eb); } @@ -5117,7 +5550,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, goto owner_mismatch; } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); + btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, + wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; @@ -5209,10 +5643,10 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * * If called with for_reloc == 0, may exit early with -EAGAIN */ -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc) +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { + const bool is_reloc_root = (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5224,6 +5658,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int ret; int level; bool root_dropped = false; + bool unfinished_drop = false; btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); @@ -5240,7 +5675,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out; } - trans = btrfs_start_transaction(tree_root, 0); + /* + * Use join to avoid potential EINTR from transaction start. See + * wait_reserve_ticket and the whole reservation callchain. + */ + if (for_reloc) + trans = btrfs_join_transaction(tree_root); + else + trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; @@ -5250,9 +5692,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, if (err) goto out_end_trans; - if (block_rsv) - trans->block_rsv = block_rsv; - /* * This will help us catch people modifying the fs tree while we're * dropping it. It is unsafe to mess with the fs tree while it's being @@ -5262,12 +5701,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, * already dropped. */ set_bit(BTRFS_ROOT_DELETING, &root->state); + unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); - btrfs_set_lock_blocking_write(path->nodes[level]); path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; memset(&wc->update_progress, 0, sizeof(wc->update_progress)); } else { @@ -5275,7 +5715,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, memcpy(&wc->update_progress, &key, sizeof(wc->update_progress)); - level = root_item->drop_level; + level = btrfs_root_drop_level(root_item); BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -5295,8 +5735,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, level = btrfs_header_level(root->node); while (1) { btrfs_tree_lock(path->nodes[level]); - btrfs_set_lock_blocking_write(path->nodes[level]); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, @@ -5308,7 +5747,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } BUG_ON(wc->refs[level] == 0); - if (level == root_item->drop_level) + if (level == btrfs_root_drop_level(root_item)) break; btrfs_tree_unlock(path->nodes[level]); @@ -5353,7 +5792,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } btrfs_cpu_key_to_disk(&root_item->drop_progress, &wc->drop_progress); - root_item->drop_level = wc->drop_level; + btrfs_set_root_drop_level(root_item, wc->drop_level); BUG_ON(wc->level == 0); if (btrfs_should_end_transaction(trans) || @@ -5367,6 +5806,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } + if (!is_reloc_root) + btrfs_set_last_root_drop_gen(fs_info, trans->transid); + btrfs_end_transaction_throttle(trans); if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { btrfs_debug(fs_info, @@ -5375,13 +5817,19 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_free; } - trans = btrfs_start_transaction(tree_root, 0); + /* + * Use join to avoid potential EINTR from transaction + * start. See wait_reserve_ticket and the whole + * reservation callchain. + */ + if (for_reloc) + trans = btrfs_join_transaction(tree_root); + else + trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; } - if (block_rsv) - trans->block_rsv = block_rsv; } } btrfs_release_path(path); @@ -5395,7 +5843,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (!is_reloc_root) { ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { @@ -5413,21 +5861,36 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } } - if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { + /* + * This subvolume is going to be completely dropped, and won't be + * recorded as dirty roots, thus pertrans meta rsv will not be freed at + * commit transaction time. So free it here manually. + */ + btrfs_qgroup_convert_reserved_meta(root, INT_MAX); + btrfs_qgroup_free_meta_all_pertrans(root); + + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); - } else { - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - btrfs_put_fs_root(root); - } + else + btrfs_put_root(root); root_dropped = true; out_end_trans: + if (!is_reloc_root) + btrfs_set_last_root_drop_gen(fs_info, trans->transid); + btrfs_end_transaction_throttle(trans); out_free: kfree(wc); btrfs_free_path(path); out: /* + * We were an unfinished drop root, check to see if there are any + * pending, and if not clear and wake up any waiters. + */ + if (!err && unfinished_drop) + btrfs_maybe_wake_unfinished_drop(fs_info); + + /* * So if we need to stop dropping the snapshot for whatever reason we * need to make sure to add it back to the dead root list so that we * keep trying to do the work later. This also cleans up roots if we @@ -5436,8 +5899,6 @@ out: */ if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); - if (err && err != -EAGAIN) - btrfs_handle_fs_error(fs_info, err, NULL); return err; } @@ -5472,17 +5933,17 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return -ENOMEM; } - btrfs_assert_tree_locked(parent); + btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); atomic_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); - btrfs_assert_tree_locked(node); + btrfs_assert_tree_write_locked(node); level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; wc->refs[parent_level] = 1; wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -5574,13 +6035,13 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) { - u64 start = SZ_1M, len = 0, end = 0; + u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; int ret; *trimmed = 0; /* Discard not supported = nothing to do. */ - if (!blk_queue_discard(bdev_get_queue(device->bdev))) + if (!bdev_max_discard_sectors(device->bdev)) return 0; /* Not writable = nothing to do. */ @@ -5605,8 +6066,21 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); - /* Ensure we skip the reserved area in the first 1M */ - start = max_t(u64, start, SZ_1M); + /* Check if there are any CHUNK_* bits left */ + if (start > device->total_bytes) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_in_rcu(fs_info, +"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", + start, end - start + 1, + rcu_str_deref(device->name), + device->total_bytes); + mutex_unlock(&fs_info->chunk_mutex); + ret = 0; + break; + } + + /* Ensure we skip the reserved space on each device. */ + start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); /* * If find_first_clear_extent_bit find a range that spans the @@ -5660,9 +6134,9 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_block_group *cache = NULL; struct btrfs_device *device; - struct list_head *devices; u64 group_trimmed; u64 range_end = U64_MAX; u64 start; @@ -5674,6 +6148,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) int dev_ret = 0; int ret = 0; + if (range->start == U64_MAX) + return -EINVAL; + /* * Check range overflow if range->len is set. * The default range->len is U64_MAX. @@ -5694,13 +6171,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (end - start >= range->minlen) { if (!btrfs_block_group_done(cache)) { - ret = btrfs_cache_block_group(cache, 0); - if (ret) { - bg_failed++; - bg_ret = ret; - continue; - } - ret = btrfs_wait_block_group_cache_done(cache); + ret = btrfs_cache_block_group(cache, true); if (ret) { bg_failed++; bg_ret = ret; @@ -5726,9 +6197,12 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) btrfs_warn(fs_info, "failed to trim %llu block group(s), last error %d", bg_failed, bg_ret); - mutex_lock(&fs_info->fs_devices->device_list_mutex); - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + continue; + ret = btrfs_trim_free_extents(device, &group_trimmed); if (ret) { dev_failed++; @@ -5738,7 +6212,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) trimmed += group_trimmed; } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); if (dev_failed) btrfs_warn(fs_info, @@ -5749,47 +6223,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) return bg_ret; return dev_ret; } - -/* - * btrfs_{start,end}_write_no_snapshotting() are similar to - * mnt_{want,drop}_write(), they are used to prevent some tasks from writing - * data into the page cache through nocow before the subvolume is snapshoted, - * but flush the data into disk after the snapshot creation, or to prevent - * operations while snapshotting is ongoing and that cause the snapshot to be - * inconsistent (writes followed by expanding truncates for example). - */ -void btrfs_end_write_no_snapshotting(struct btrfs_root *root) -{ - percpu_counter_dec(&root->subv_writers->counter); - cond_wake_up(&root->subv_writers->wait); -} - -int btrfs_start_write_no_snapshotting(struct btrfs_root *root) -{ - if (atomic_read(&root->will_be_snapshotted)) - return 0; - - percpu_counter_inc(&root->subv_writers->counter); - /* - * Make sure counter is updated before we check for snapshot creation. - */ - smp_mb(); - if (atomic_read(&root->will_be_snapshotted)) { - btrfs_end_write_no_snapshotting(root); - return 0; - } - return 1; -} - -void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshotting(root); - if (ret) - break; - wait_var_event(&root->will_be_snapshotted, - !atomic_read(&root->will_be_snapshotted)); - } -} |