diff options
Diffstat (limited to 'fs/btrfs/backref.c')
-rw-r--r-- | fs/btrfs/backref.c | 1531 |
1 files changed, 1314 insertions, 217 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e5d85311d5d5..18374a6d05bd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -13,6 +13,8 @@ #include "transaction.h" #include "delayed-ref.h" #include "locking.h" +#include "misc.h" +#include "tree-mod-log.h" /* Just an arbitrary number so we can be sure this happened */ #define BACKREF_FOUND_SHARED 6 @@ -136,6 +138,7 @@ struct share_check { u64 root_objectid; u64 inum; int share_count; + bool have_delayed_delete_refs; }; static inline int extent_is_shared(struct share_check *sc) @@ -286,8 +289,10 @@ static void prelim_release(struct preftree *preftree) struct prelim_ref *ref, *next_ref; rbtree_postorder_for_each_entry_safe(ref, next_ref, - &preftree->root.rb_root, rbnode) + &preftree->root.rb_root, rbnode) { + free_inode_elem_list(ref->inode_list); free_pref(ref); + } preftree->root = RB_ROOT_CACHED; preftree->count = 0; @@ -347,33 +352,10 @@ static int add_prelim_ref(const struct btrfs_fs_info *fs_info, return -ENOMEM; ref->root_id = root_id; - if (key) { + if (key) ref->key_for_search = *key; - /* - * We can often find data backrefs with an offset that is too - * large (>= LLONG_MAX, maximum allowed file offset) due to - * underflows when subtracting a file's offset with the data - * offset of its corresponding extent data item. This can - * happen for example in the clone ioctl. - * So if we detect such case we set the search key's offset to - * zero to make sure we will find the matching file extent item - * at add_all_parents(), otherwise we will miss it because the - * offset taken form the backref is much larger then the offset - * of the file extent item. This can make us scan a very large - * number of file extent items, but at least it will not make - * us miss any. - * This is an ugly workaround for a behaviour that should have - * never existed, but it does and a fix for the clone ioctl - * would touch a lot of places, cause backwards incompatibility - * and would not fix the problem for extents cloned with older - * kernels. - */ - if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY && - ref->key_for_search.offset >= LLONG_MAX) - ref->key_for_search.offset = 0; - } else { + else memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); - } ref->inode_list = NULL; ref->level = level; @@ -409,10 +391,36 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info, wanted_disk_byte, count, sc, gfp_mask); } +static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) +{ + struct rb_node **p = &preftrees->direct.root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct prelim_ref *ref = NULL; + struct prelim_ref target = {}; + int result; + + target.parent = bytenr; + + while (*p) { + parent = *p; + ref = rb_entry(parent, struct prelim_ref, rbnode); + result = prelim_ref_compare(ref, &target); + + if (result < 0) + p = &(*p)->rb_left; + else if (result > 0) + p = &(*p)->rb_right; + else + return 1; + } + return 0; +} + static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, struct prelim_ref *ref, + struct ulist *parents, + struct preftrees *preftrees, struct prelim_ref *ref, int level, u64 time_seq, const u64 *extent_item_pos, - u64 total_refs, bool ignore_offset) + bool ignore_offset) { int ret = 0; int slot; @@ -424,6 +432,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, u64 disk_byte; u64 wanted_disk_byte = ref->wanted_disk_byte; u64 count = 0; + u64 data_offset; if (level != 0) { eb = path->nodes[level]; @@ -434,18 +443,26 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, } /* - * We normally enter this function with the path already pointing to - * the first item to check. But sometimes, we may enter it with - * slot==nritems. In that case, go to the next leaf before we continue. + * 1. We normally enter this function with the path already pointing to + * the first item to check. But sometimes, we may enter it with + * slot == nritems. + * 2. We are searching for normal backref but bytenr of this leaf + * matches shared data backref + * 3. The leaf owner is not equal to the root we are searching + * + * For these cases, go to the next leaf before we continue. */ - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - if (time_seq == SEQ_LAST) + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb) || + is_shared_data_backref(preftrees, eb->start) || + ref->root_id != btrfs_header_owner(eb)) { + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); } - while (!ret && count < total_refs) { + while (!ret && count < ref->count) { eb = path->nodes[0]; slot = path->slots[0]; @@ -455,13 +472,31 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, key.type != BTRFS_EXTENT_DATA_KEY) break; + /* + * We are searching for normal backref but bytenr of this leaf + * matches shared data backref, OR + * the leaf owner is not equal to the root we are searching for + */ + if (slot == 0 && + (is_shared_data_backref(preftrees, eb->start) || + ref->root_id != btrfs_header_owner(eb))) { + if (time_seq == BTRFS_SEQ_LAST) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, time_seq); + continue; + } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + data_offset = btrfs_file_extent_offset(eb, fi); if (disk_byte == wanted_disk_byte) { eie = NULL; old = NULL; - count++; + if (ref->key_for_search.offset == key.offset - data_offset) + count++; + else + goto next; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -483,7 +518,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_item(root, path); else ret = btrfs_next_old_item(root, path, time_seq); @@ -502,59 +537,82 @@ next: */ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, + struct preftrees *preftrees, struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, u64 total_refs, - bool ignore_offset) + const u64 *extent_item_pos, bool ignore_offset) { struct btrfs_root *root; - struct btrfs_key root_key; struct extent_buffer *eb; int ret = 0; int root_level; int level = ref->level; - int index; + struct btrfs_key search_key = ref->key_for_search; - root_key.objectid = ref->root_id; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - - index = srcu_read_lock(&fs_info->subvol_srcu); - - root = btrfs_get_fs_root(fs_info, &root_key, false); + /* + * If we're search_commit_root we could possibly be holding locks on + * other tree nodes. This happens when qgroups does backref walks when + * adding new delayed refs. To deal with this we need to look in cache + * for the root, and if we don't find it then we need to search the + * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage + * here. + */ + if (path->search_commit_root) + root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id); + else + root = btrfs_get_fs_root(fs_info, ref->root_id, false); if (IS_ERR(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(root); + goto out_free; + } + + if (!path->search_commit_root && + test_bit(BTRFS_ROOT_DELETING, &root->state)) { + ret = -ENOENT; goto out; } if (btrfs_is_testing(fs_info)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -ENOENT; goto out; } if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == SEQ_LAST) + else if (time_seq == BTRFS_SEQ_LAST) root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); - if (root_level + 1 == level) { - srcu_read_unlock(&fs_info->subvol_srcu, index); + if (root_level + 1 == level) goto out; - } + /* + * We can often find data backrefs with an offset that is too large + * (>= LLONG_MAX, maximum allowed file offset) due to underflows when + * subtracting a file's offset with the data offset of its + * corresponding extent data item. This can happen for example in the + * clone ioctl. + * + * So if we detect such case we set the search key's offset to zero to + * make sure we will find the matching file extent item at + * add_all_parents(), otherwise we will miss it because the offset + * taken form the backref is much larger then the offset of the file + * extent item. This can make us scan a very large number of file + * extent items, but at least it will not make us miss any. + * + * This is an ugly workaround for a behaviour that should have never + * existed, but it does and a fix for the clone ioctl would touch a lot + * of places, cause backwards incompatibility and would not fix the + * problem for extents cloned with older kernels. + */ + if (search_key.type == BTRFS_EXTENT_DATA_KEY && + search_key.offset >= LLONG_MAX) + search_key.offset = 0; path->lowest_level = level; - if (time_seq == SEQ_LAST) - ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, - 0, 0); + if (time_seq == BTRFS_SEQ_LAST) + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, - time_seq); - - /* root node has been locked, we can release @subvol_srcu safely here */ - srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = btrfs_search_old_slot(root, &search_key, path, time_seq); btrfs_debug(fs_info, "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", @@ -574,9 +632,11 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, ref, level, time_seq, - extent_item_pos, total_refs, ignore_offset); + ret = add_all_parents(root, path, parents, preftrees, ref, level, + time_seq, extent_item_pos, ignore_offset); out: + btrfs_put_root(root); +out_free: path->lowest_level = 0; btrfs_release_path(path); return ret; @@ -590,6 +650,18 @@ unode_aux_to_inode_list(struct ulist_node *node) return (struct extent_inode_elem *)(uintptr_t)node->aux; } +static void free_leaf_list(struct ulist *ulist) +{ + struct ulist_node *node; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(ulist, &uiter))) + free_inode_elem_list(unode_aux_to_inode_list(node)); + + ulist_free(ulist); +} + /* * We maintain three separate rbtrees: one for direct refs, one for * indirect refs which have a key, and one for indirect refs which do not @@ -609,7 +681,7 @@ unode_aux_to_inode_list(struct ulist_node *node) static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, - const u64 *extent_item_pos, u64 total_refs, + const u64 *extent_item_pos, struct share_check *sc, bool ignore_offset) { int err; @@ -653,9 +725,9 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(fs_info, path, time_seq, ref, - parents, extent_item_pos, - total_refs, ignore_offset); + err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, + ref, parents, extent_item_pos, + ignore_offset); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. @@ -704,7 +776,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, cond_resched(); } out: - ulist_free(parents); + /* + * We may have inode lists attached to refs in the parents ulist, so we + * must free them before freeing the ulist and its refs. + */ + free_leaf_list(parents); return ret; } @@ -727,16 +803,18 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(ref->key_for_search.type); BUG_ON(!ref->wanted_disk_byte); - eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0, - ref->level - 1, NULL); + eb = read_tree_block(fs_info, ref->wanted_disk_byte, + ref->root_id, 0, ref->level - 1, NULL); if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_pref(ref); free_extent_buffer(eb); return -EIO; } + if (lock) btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) @@ -758,20 +836,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, */ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head, u64 seq, - struct preftrees *preftrees, u64 *total_refs, - struct share_check *sc) + struct preftrees *preftrees, struct share_check *sc) { struct btrfs_delayed_ref_node *node; - struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct btrfs_key key; - struct btrfs_key tmp_op_key; struct rb_node *n; int count; int ret = 0; - if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); - spin_lock(&head->lock); for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { node = rb_entry(n, struct btrfs_delayed_ref_node, @@ -793,15 +865,20 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, default: BUG(); } - *total_refs += count; switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ struct btrfs_delayed_tree_ref *ref; + struct btrfs_key *key_ptr = NULL; + + if (head->extent_op && head->extent_op->update_key) { + btrfs_disk_key_to_cpu(&key, &head->extent_op->key); + key_ptr = &key; + } ref = btrfs_delayed_node_to_tree_ref(node); ret = add_indirect_ref(fs_info, preftrees, ref->root, - &tmp_op_key, ref->level + 1, + key_ptr, ref->level + 1, node->bytenr, count, sc, GFP_ATOMIC); break; @@ -827,13 +904,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, key.offset = ref->offset; /* - * Found a inum that doesn't match our known inum, we - * know it's shared. + * If we have a share check context and a reference for + * another inode, we can't exit immediately. This is + * because even if this is a BTRFS_ADD_DELAYED_REF + * reference we may find next a BTRFS_DROP_DELAYED_REF + * which cancels out this ADD reference. + * + * If this is a DROP reference and there was no previous + * ADD reference, then we need to signal that when we + * process references from the extent tree (through + * add_inline_refs() and add_keyed_refs()), we should + * not exit early if we find a reference for another + * inode, because one of the delayed DROP references + * may cancel that reference in the extent tree. */ - if (sc && sc->inum && ref->objectid != sc->inum) { - ret = BACKREF_FOUND_SHARED; - goto out; - } + if (sc && count < 0) + sc->have_delayed_delete_refs = true; ret = add_indirect_ref(fs_info, preftrees, ref->root, &key, 0, node->bytenr, count, sc, @@ -863,7 +949,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, } if (!ret) ret = extent_is_shared(sc); -out: + spin_unlock(&head->lock); return ret; } @@ -876,7 +962,7 @@ out: static int add_inline_refs(const struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, int *info_level, struct preftrees *preftrees, - u64 *total_refs, struct share_check *sc) + struct share_check *sc) { int ret = 0; int slot; @@ -895,12 +981,11 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; slot = path->slots[0]; - item_size = btrfs_item_size_nr(leaf, slot); + item_size = btrfs_item_size(leaf, slot); BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); - *total_refs += btrfs_extent_refs(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); @@ -967,7 +1052,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -977,6 +1063,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ret = add_indirect_ref(fs_info, preftrees, root, &key, 0, bytenr, count, sc, GFP_NOFS); + break; } default: @@ -995,12 +1082,12 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int add_keyed_refs(struct btrfs_fs_info *fs_info, +static int add_keyed_refs(struct btrfs_root *extent_root, struct btrfs_path *path, u64 bytenr, int info_level, struct preftrees *preftrees, struct share_check *sc) { - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_fs_info *fs_info = extent_root->fs_info; int ret; int slot; struct extent_buffer *leaf; @@ -1066,7 +1153,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -1094,8 +1182,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * - * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave - * much like trans == NULL case, the difference only lies in it will not + * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and + * behave much like trans == NULL case, the difference only lies in it will not * commit root. * The special case is for qgroup to search roots in commit_transaction(). * @@ -1116,6 +1204,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct ulist *roots, const u64 *extent_item_pos, struct share_check *sc, bool ignore_offset) { + struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct btrfs_path *path; struct btrfs_delayed_ref_root *delayed_refs = NULL; @@ -1125,8 +1214,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct prelim_ref *ref; struct rb_node *node; struct extent_inode_elem *eie = NULL; - /* total of both direct AND indirect refs! */ - u64 total_refs = 0; struct preftrees preftrees = { .direct = PREFTREE_INIT, .indirect = PREFTREE_INIT, @@ -1148,31 +1235,29 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; - /* - * grab both a lock on the path and a lock on the delayed ref head. - * We need both to get a consistent picture of how the refs look - * at a specified point in time - */ again: head = NULL; - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret == 0); + if (ret == 0) { + /* This shouldn't happen, indicates a bug or fs corruption. */ + ASSERT(ret != 0); + ret = -EUCLEAN; + goto out; + } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != SEQ_LAST) { -#else - if (trans && time_seq != SEQ_LAST) { -#endif + time_seq != BTRFS_SEQ_LAST) { /* - * look if there are updates for this ref queued and lock the - * head + * We have a specific time_seq we care about and trans which + * means we have the path lock, we need to grab the ref head and + * lock it so we have a consistent view of the refs at the given + * time. */ delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); @@ -1195,7 +1280,7 @@ again: } spin_unlock(&delayed_refs->lock); ret = add_delayed_refs(fs_info, head, time_seq, - &preftrees, &total_refs, sc); + &preftrees, sc); mutex_unlock(&head->mutex); if (ret) goto out; @@ -1216,11 +1301,10 @@ again: (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { ret = add_inline_refs(fs_info, path, bytenr, - &info_level, &preftrees, - &total_refs, sc); + &info_level, &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(fs_info, path, bytenr, info_level, + ret = add_keyed_refs(root, path, bytenr, info_level, &preftrees, sc); if (ret) goto out; @@ -1236,7 +1320,7 @@ again: WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, total_refs, sc, ignore_offset); + extent_item_pos, sc, ignore_offset); if (ret) goto out; @@ -1281,28 +1365,33 @@ again: struct extent_buffer *eb; eb = read_tree_block(fs_info, ref->parent, 0, - ref->level, NULL); + 0, ref->level, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; } - if (!path->skip_locking) { + if (!path->skip_locking) btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie, ignore_offset); if (!path->skip_locking) - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); if (ret < 0) goto out; ref->inode_list = eie; + /* + * We transferred the list ownership to the ref, + * so set to NULL to avoid a double free in case + * an error happens after this. + */ + eie = NULL; } ret = ulist_add_merge_ptr(refs, ref->parent, ref->inode_list, @@ -1311,15 +1400,31 @@ again: goto out; if (!ret && extent_item_pos) { /* - * we've recorded that parent, so we must extend - * its inode list here + * We've recorded that parent, so we must extend + * its inode list here. + * + * However if there was corruption we may not + * have found an eie, return an error in this + * case. */ - BUG_ON(!eie); + ASSERT(eie); + if (!eie) { + ret = -EUCLEAN; + goto out; + } while (eie->next) eie = eie->next; eie->next = ref->inode_list; } eie = NULL; + /* + * We have transferred the inode list ownership from + * this ref to the ref we added to the 'refs' ulist. + * So set this ref's inode list to NULL to avoid + * use-after-free when our caller uses it or double + * frees in case an error happens before we return. + */ + ref->inode_list = NULL; } cond_resched(); } @@ -1336,24 +1441,6 @@ out: return ret; } -static void free_leaf_list(struct ulist *blocks) -{ - struct ulist_node *node = NULL; - struct extent_inode_elem *eie; - struct ulist_iterator uiter; - - ULIST_ITER_INIT(&uiter); - while ((node = ulist_next(blocks, &uiter))) { - if (!node->aux) - continue; - eie = unode_aux_to_inode_list(node); - free_inode_elem_list(eie); - node->aux = 0; - } - - ulist_free(blocks); -} - /* * Finds all leafs with a reference to the specified combination of bytenr and * offset. key_list_head will point to a list of corresponding keys (caller must @@ -1362,10 +1449,10 @@ static void free_leaf_list(struct ulist *blocks) * * returns 0 on success, <0 on error */ -static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset) +int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **leafs, + const u64 *extent_item_pos, bool ignore_offset) { int ret; @@ -1422,6 +1509,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); + *roots = NULL; return ret; } node = ulist_next(tmp, &uiter); @@ -1438,23 +1526,150 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **roots, - bool ignore_offset) + bool skip_commit_root_sem) { int ret; - if (!trans) + if (!trans && !skip_commit_root_sem) down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots, ignore_offset); - if (!trans) + time_seq, roots, false); + if (!trans && !skip_commit_root_sem) up_read(&fs_info->commit_root_sem); return ret; } -/** - * btrfs_check_shared - tell us whether an extent is shared +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache, + struct btrfs_root *root, + u64 bytenr, int level, bool *is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + + if (!cache->use_cache) + return false; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return false; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + entry = &cache->entries[level]; + + /* Unused cache entry or being used for some other extent buffer. */ + if (entry->bytenr != bytenr) + return false; + + /* + * We cached a false result, but the last snapshot generation of the + * root changed, so we now have a snapshot. Don't trust the result. + */ + if (!entry->is_shared && + entry->gen != btrfs_root_last_snapshot(&root->root_item)) + return false; + + /* + * If we cached a true result and the last generation used for dropping + * a root changed, we can not trust the result, because the dropped root + * could be a snapshot sharing this extent buffer. + */ + if (entry->is_shared && + entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + return false; + + *is_shared = entry->is_shared; + /* + * If the node at this level is shared, than all nodes below are also + * shared. Currently some of the nodes below may be marked as not shared + * because we have just switched from one leaf to another, and switched + * also other nodes above the leaf and below the current level, so mark + * them as shared. + */ + if (*is_shared) { + for (int i = 0; i < level; i++) { + cache->entries[i].is_shared = true; + cache->entries[i].gen = entry->gen; + } + } + + return true; +} + +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, + struct btrfs_root *root, + u64 bytenr, int level, bool is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + u64 gen; + + if (!cache->use_cache) + return; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + if (is_shared) + gen = btrfs_get_last_root_drop_gen(root->fs_info); + else + gen = btrfs_root_last_snapshot(&root->root_item); + + entry = &cache->entries[level]; + entry->bytenr = bytenr; + entry->is_shared = is_shared; + entry->gen = gen; + + /* + * If we found an extent buffer is shared, set the cache result for all + * extent buffers below it to true. As nodes in the path are COWed, + * their sharedness is moved to their children, and if a leaf is COWed, + * then the sharedness of a data extent becomes direct, the refcount of + * data extent is increased in the extent item at the extent tree. + */ + if (is_shared) { + for (int i = 0; i < level; i++) { + entry = &cache->entries[i]; + entry->is_shared = is_shared; + entry->gen = gen; + } + } +} + +/* + * Check if a data extent is shared or not. * - * btrfs_check_shared uses the backref walking code but will short + * @root: The root the inode belongs to. + * @inum: Number of the inode whose extent we are checking. + * @bytenr: Logical bytenr of the extent we are checking. + * @extent_gen: Generation of the extent (file extent item) or 0 if it is + * not known. + * @roots: List of roots this extent is shared among. + * @tmp: Temporary list used for iteration. + * @cache: A backref lookup result cache. + * + * btrfs_is_data_extent_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the * one passed in. This provides a significant performance benefit for * callers (such as fiemap) which want to know whether the extent is @@ -1465,20 +1680,24 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, - struct ulist *roots, struct ulist *tmp) +int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, + u64 extent_gen, + struct ulist *roots, struct ulist *tmp, + struct btrfs_backref_shared_cache *cache) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct ulist_iterator uiter; struct ulist_node *node; - struct seq_list elem = SEQ_LIST_INIT(elem); + struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { .root_objectid = root->root_key.objectid, .inum = inum, .share_count = 0, + .have_delayed_delete_refs = false, }; + int level; ulist_init(roots); ulist_init(tmp); @@ -1495,23 +1714,73 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, btrfs_get_tree_mod_seq(fs_info, &elem); } + /* -1 means we are in the bytenr of the data extent. */ + level = -1; ULIST_ITER_INIT(&uiter); + cache->use_cache = true; while (1) { + bool is_shared; + bool cached; + ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, roots, NULL, &shared, false); if (ret == BACKREF_FOUND_SHARED) { /* this is the only condition under which we return 1 */ ret = 1; + if (level >= 0) + store_backref_shared_cache(cache, root, bytenr, + level, true); break; } if (ret < 0 && ret != -ENOENT) break; ret = 0; + /* + * If our data extent is not shared through reflinks and it was + * created in a generation after the last one used to create a + * snapshot of the inode's root, then it can not be shared + * indirectly through subtrees, as that can only happen with + * snapshots. In this case bail out, no need to check for the + * sharedness of extent buffers. + */ + if (level == -1 && + extent_gen > btrfs_root_last_snapshot(&root->root_item)) + break; + + /* + * If our data extent was not directly shared (without multiple + * reference items), than it might have a single reference item + * with a count > 1 for the same offset, which means there are 2 + * (or more) file extent items that point to the data extent - + * this happens when a file extent item needs to be split and + * then one item gets moved to another leaf due to a b+tree leaf + * split when inserting some item. In this case the file extent + * items may be located in different leaves and therefore some + * of the leaves may be referenced through shared subtrees while + * others are not. Since our extent buffer cache only works for + * a single path (by far the most common case and simpler to + * deal with), we can not use it if we have multiple leaves + * (which implies multiple paths). + */ + if (level == -1 && tmp->nnodes > 1) + cache->use_cache = false; + + if (level >= 0) + store_backref_shared_cache(cache, root, bytenr, + level, false); node = ulist_next(tmp, &uiter); if (!node) break; bytenr = node->val; + level++; + cached = lookup_backref_shared_cache(cache, root, bytenr, level, + &is_shared); + if (cached) { + ret = (is_shared ? 1 : 0); + break; + } shared.share_count = 0; + shared.have_delayed_delete_refs = false; cond_resched(); } @@ -1620,13 +1889,11 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, s64 bytes_left = ((s64)size) - 1; struct extent_buffer *eb = eb_in; struct btrfs_key found_key; - int leave_spinning = path->leave_spinning; struct btrfs_inode_ref *iref; if (bytes_left >= 0) dest[bytes_left] = '\0'; - path->leave_spinning = 1; while (1) { bytes_left -= name_len; if (bytes_left >= 0) @@ -1634,7 +1901,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, name_off, name_len); if (eb != eb_in) { if (!path->skip_locking) - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); } ret = btrfs_find_item(fs_root, path, parent, 0, @@ -1654,8 +1921,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ if (eb != eb_in) { - if (!path->skip_locking) - btrfs_set_lock_blocking_read(eb); path->nodes[0] = NULL; path->locks[0] = 0; } @@ -1672,7 +1937,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, } btrfs_release_path(path); - path->leave_spinning = leave_spinning; if (ret) return ERR_PTR(ret); @@ -1689,6 +1953,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags_ret) { + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); int ret; u64 flags; u64 size = 0; @@ -1704,11 +1969,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, key.objectid = logical; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; - ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0); + ret = btrfs_previous_extent_item(extent_root, path, 0); if (ret) { if (ret > 0) ret = -ENOENT; @@ -1728,7 +1993,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, } eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, path->slots[0]); + item_size = btrfs_item_size(eb, path->slots[0]); BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); @@ -1903,7 +2168,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); + struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem); struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; @@ -1911,7 +2176,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, extent_item_objectid); if (!search_commit_root) { - trans = btrfs_attach_transaction(fs_info->extent_root); + trans = btrfs_attach_transaction(fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) @@ -1921,12 +2186,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, } if (trans) - btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + btrfs_get_tree_mod_seq(fs_info, &seq_elem); else down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - tree_mod_seq_elem.seq, &refs, + seq_elem.seq, &refs, &extent_item_pos, ignore_offset); if (ret) goto out; @@ -1934,7 +2199,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots, + seq_elem.seq, &roots, ignore_offset); if (ret) break; @@ -1957,7 +2222,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, free_leaf_list(refs); out: if (trans) { - btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + btrfs_put_tree_mod_seq(fs_info, &seq_elem); btrfs_end_transaction(trans); } else { up_read(&fs_info->commit_root_sem); @@ -1966,10 +2231,29 @@ out: return ret; } +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset) + void *ctx, bool ignore_offset) { int ret; u64 extent_item_pos; @@ -1987,17 +2271,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - iterate, ctx, ignore_offset); + build_ino_list, ctx, ignore_offset); return ret; } -typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off, - struct extent_buffer *eb, void *ctx); +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, struct inode_fs_paths *ipath); -static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) +static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath) { int ret = 0; int slot; @@ -2006,8 +2288,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, u32 name_len; u64 parent = 0; int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; - struct btrfs_item *item; struct btrfs_inode_ref *iref; struct btrfs_key found_key; @@ -2033,18 +2316,17 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, } btrfs_release_path(path); - item = btrfs_item_nr(slot); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { + for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! */ btrfs_debug(fs_root->fs_info, "following ref at offset %u for inode %llu in tree %llu", cur, found_key.objectid, fs_root->root_key.objectid); - ret = iterate(parent, name_len, - (unsigned long)(iref + 1), eb, ctx); + ret = inode_to_path(parent, name_len, + (unsigned long)(iref + 1), eb, ipath); if (ret) break; len = sizeof(*iref) + name_len; @@ -2058,15 +2340,15 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, return ret; } -static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) +static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath) { int ret; int slot; u64 offset = 0; u64 parent; int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; struct btrfs_inode_extref *extref; u32 item_size; @@ -2092,7 +2374,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, } btrfs_release_path(path); - item_size = btrfs_item_size_nr(eb, slot); + item_size = btrfs_item_size(eb, slot); ptr = btrfs_item_ptr_offset(eb, slot); cur_offset = 0; @@ -2102,8 +2384,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, extref = (struct btrfs_inode_extref *)(ptr + cur_offset); parent = btrfs_inode_extref_parent(eb, extref); name_len = btrfs_inode_extref_name_len(eb, extref); - ret = iterate(parent, name_len, - (unsigned long)&extref->name, eb, ctx); + ret = inode_to_path(parent, name_len, + (unsigned long)&extref->name, eb, ipath); if (ret) break; @@ -2120,34 +2402,13 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, return ret; } -static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, iterate_irefs_t *iterate, - void *ctx) -{ - int ret; - int found_refs = 0; - - ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx); - if (!ret) - ++found_refs; - else if (ret != -ENOENT) - return ret; - - ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx); - if (ret == -ENOENT && found_refs) - return 0; - - return ret; -} - /* * returns 0 if the path could be dumped (probably truncated) * returns <0 in case of an error */ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, - struct extent_buffer *eb, void *ctx) + struct extent_buffer *eb, struct inode_fs_paths *ipath) { - struct inode_fs_paths *ipath = ctx; char *fspath; char *fspath_min; int i = ipath->fspath->elem_cnt; @@ -2188,8 +2449,20 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, */ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) { - return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, - inode_to_path, ipath); + int ret; + int found_refs = 0; + + ret = iterate_inode_refs(inum, ipath); + if (!ret) + ++found_refs; + else if (ret != -ENOENT) + return ret; + + ret = iterate_inode_extrefs(inum, ipath); + if (ret == -ENOENT && found_refs) + return 0; + + return ret; } struct btrfs_data_container *init_data_container(u32 total_bytes) @@ -2252,3 +2525,827 @@ void free_ipath(struct inode_fs_paths *ipath) kvfree(ipath->fspath); kfree(ipath); } + +struct btrfs_backref_iter *btrfs_backref_iter_alloc( + struct btrfs_fs_info *fs_info, gfp_t gfp_flag) +{ + struct btrfs_backref_iter *ret; + + ret = kzalloc(sizeof(*ret), gfp_flag); + if (!ret) + return NULL; + + ret->path = btrfs_alloc_path(); + if (!ret->path) { + kfree(ret); + return NULL; + } + + /* Current backref iterator only supports iteration in commit root */ + ret->path->search_commit_root = 1; + ret->path->skip_locking = 1; + ret->fs_info = fs_info; + + return ret; +} + +int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) +{ + struct btrfs_fs_info *fs_info = iter->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); + struct btrfs_path *path = iter->path; + struct btrfs_extent_item *ei; + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = (u64)-1; + iter->bytenr = bytenr; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + if (ret == 0) { + ret = -EUCLEAN; + goto release; + } + if (path->slots[0] == 0) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + ret = -EUCLEAN; + goto release; + } + path->slots[0]--; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if ((key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) || key.objectid != bytenr) { + ret = -ENOENT; + goto release; + } + memcpy(&iter->cur_key, &key, sizeof(key)); + iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->end_ptr = (u32)(iter->item_ptr + + btrfs_item_size(path->nodes[0], path->slots[0])); + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + + /* + * Only support iteration on tree backref yet. + * + * This is an extra precaution for non skinny-metadata, where + * EXTENT_ITEM is also used for tree blocks, that we can only use + * extent flags to determine if it's a tree block. + */ + if (btrfs_extent_flags(path->nodes[0], ei) & BTRFS_EXTENT_FLAG_DATA) { + ret = -ENOTSUPP; + goto release; + } + iter->cur_ptr = (u32)(iter->item_ptr + sizeof(*ei)); + + /* If there is no inline backref, go search for keyed backref */ + if (iter->cur_ptr >= iter->end_ptr) { + ret = btrfs_next_item(extent_root, path); + + /* No inline nor keyed ref */ + if (ret > 0) { + ret = -ENOENT; + goto release; + } + if (ret < 0) + goto release; + + btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, + path->slots[0]); + if (iter->cur_key.objectid != bytenr || + (iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY && + iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY)) { + ret = -ENOENT; + goto release; + } + iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->item_ptr = iter->cur_ptr; + iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size( + path->nodes[0], path->slots[0])); + } + + return 0; +release: + btrfs_backref_iter_release(iter); + return ret; +} + +/* + * Go to the next backref item of current bytenr, can be either inlined or + * keyed. + * + * Caller needs to check whether it's inline ref or not by iter->cur_key. + * + * Return 0 if we get next backref without problem. + * Return >0 if there is no extra backref for this bytenr. + * Return <0 if there is something wrong happened. + */ +int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) +{ + struct extent_buffer *eb = btrfs_backref_get_eb(iter); + struct btrfs_root *extent_root; + struct btrfs_path *path = iter->path; + struct btrfs_extent_inline_ref *iref; + int ret; + u32 size; + + if (btrfs_backref_iter_is_inline_ref(iter)) { + /* We're still inside the inline refs */ + ASSERT(iter->cur_ptr < iter->end_ptr); + + if (btrfs_backref_has_tree_block_info(iter)) { + /* First tree block info */ + size = sizeof(struct btrfs_tree_block_info); + } else { + /* Use inline ref type to determine the size */ + int type; + + iref = (struct btrfs_extent_inline_ref *) + ((unsigned long)iter->cur_ptr); + type = btrfs_extent_inline_ref_type(eb, iref); + + size = btrfs_extent_inline_ref_size(type); + } + iter->cur_ptr += size; + if (iter->cur_ptr < iter->end_ptr) + return 0; + + /* All inline items iterated, fall through */ + } + + /* We're at keyed items, there is no inline item, go to the next one */ + extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr); + ret = btrfs_next_item(extent_root, iter->path); + if (ret) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, path->slots[0]); + if (iter->cur_key.objectid != iter->bytenr || + (iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY && + iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY)) + return 1; + iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->cur_ptr = iter->item_ptr; + iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0], + path->slots[0]); + return 0; +} + +void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, + struct btrfs_backref_cache *cache, int is_reloc) +{ + int i; + + cache->rb_root = RB_ROOT; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&cache->pending[i]); + INIT_LIST_HEAD(&cache->changed); + INIT_LIST_HEAD(&cache->detached); + INIT_LIST_HEAD(&cache->leaves); + INIT_LIST_HEAD(&cache->pending_edge); + INIT_LIST_HEAD(&cache->useless_node); + cache->fs_info = fs_info; + cache->is_reloc = is_reloc; +} + +struct btrfs_backref_node *btrfs_backref_alloc_node( + struct btrfs_backref_cache *cache, u64 bytenr, int level) +{ + struct btrfs_backref_node *node; + + ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL); + node = kzalloc(sizeof(*node), GFP_NOFS); + if (!node) + return node; + + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); + cache->nr_nodes++; + node->level = level; + node->bytenr = bytenr; + + return node; +} + +struct btrfs_backref_edge *btrfs_backref_alloc_edge( + struct btrfs_backref_cache *cache) +{ + struct btrfs_backref_edge *edge; + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (edge) + cache->nr_edges++; + return edge; +} + +/* + * Drop the backref node from cache, also cleaning up all its + * upper edges and any uncached nodes in the path. + * + * This cleanup happens bottom up, thus the node should either + * be the lowest node in the cache or a detached node. + */ +void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + struct btrfs_backref_node *upper; + struct btrfs_backref_edge *edge; + + if (!node) + return; + + BUG_ON(!node->lowest && !node->detached); + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, struct btrfs_backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + list_del(&edge->list[LOWER]); + list_del(&edge->list[UPPER]); + btrfs_backref_free_edge(cache, edge); + + /* + * Add the node to leaf node list if no other child block + * cached. + */ + if (list_empty(&upper->lower)) { + list_add_tail(&upper->lower, &cache->leaves); + upper->lowest = 1; + } + } + + btrfs_backref_drop_node(cache, node); +} + +/* + * Release all nodes/edges from current cache + */ +void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) +{ + struct btrfs_backref_node *node; + int i; + + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct btrfs_backref_node, list); + btrfs_backref_cleanup_node(cache, node); + } + + while (!list_empty(&cache->leaves)) { + node = list_entry(cache->leaves.next, + struct btrfs_backref_node, lower); + btrfs_backref_cleanup_node(cache, node); + } + + cache->last_trans = 0; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + ASSERT(list_empty(&cache->pending[i])); + ASSERT(list_empty(&cache->pending_edge)); + ASSERT(list_empty(&cache->useless_node)); + ASSERT(list_empty(&cache->changed)); + ASSERT(list_empty(&cache->detached)); + ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); + ASSERT(!cache->nr_nodes); + ASSERT(!cache->nr_edges); +} + +/* + * Handle direct tree backref + * + * Direct tree backref means, the backref item shows its parent bytenr + * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined). + * + * @ref_key: The converted backref key. + * For keyed backref, it's the item key. + * For inlined backref, objectid is the bytenr, + * type is btrfs_inline_ref_type, offset is + * btrfs_inline_ref_offset. + */ +static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, + struct btrfs_key *ref_key, + struct btrfs_backref_node *cur) +{ + struct btrfs_backref_edge *edge; + struct btrfs_backref_node *upper; + struct rb_node *rb_node; + + ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY); + + /* Only reloc root uses backref pointing to itself */ + if (ref_key->objectid == ref_key->offset) { + struct btrfs_root *root; + + cur->is_reloc_root = 1; + /* Only reloc backref cache cares about a specific root */ + if (cache->is_reloc) { + root = find_reloc_root(cache->fs_info, cur->bytenr); + if (!root) + return -ENOENT; + cur->root = root; + } else { + /* + * For generic purpose backref cache, reloc root node + * is useless. + */ + list_add(&cur->list, &cache->useless_node); + } + return 0; + } + + edge = btrfs_backref_alloc_edge(cache); + if (!edge) + return -ENOMEM; + + rb_node = rb_simple_search(&cache->rb_root, ref_key->offset); + if (!rb_node) { + /* Parent node not yet cached */ + upper = btrfs_backref_alloc_node(cache, ref_key->offset, + cur->level + 1); + if (!upper) { + btrfs_backref_free_edge(cache, edge); + return -ENOMEM; + } + + /* + * Backrefs for the upper level block isn't cached, add the + * block to pending list + */ + list_add_tail(&edge->list[UPPER], &cache->pending_edge); + } else { + /* Parent node already cached */ + upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node); + ASSERT(upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER); + return 0; +} + +/* + * Handle indirect tree backref + * + * Indirect tree backref means, we only know which tree the node belongs to. + * We still need to do a tree search to find out the parents. This is for + * TREE_BLOCK_REF backref (keyed or inlined). + * + * @ref_key: The same as @ref_key in handle_direct_tree_backref() + * @tree_key: The first key of this tree block. + * @path: A clean (released) path, to avoid allocating path every time + * the function get called. + */ +static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache, + struct btrfs_path *path, + struct btrfs_key *ref_key, + struct btrfs_key *tree_key, + struct btrfs_backref_node *cur) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_backref_node *upper; + struct btrfs_backref_node *lower; + struct btrfs_backref_edge *edge; + struct extent_buffer *eb; + struct btrfs_root *root; + struct rb_node *rb_node; + int level; + bool need_check = true; + int ret; + + root = btrfs_get_fs_root(fs_info, ref_key->offset, false); + if (IS_ERR(root)) + return PTR_ERR(root); + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + cur->cowonly = 1; + + if (btrfs_root_level(&root->root_item) == cur->level) { + /* Tree root */ + ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr); + /* + * For reloc backref cache, we may ignore reloc root. But for + * general purpose backref cache, we can't rely on + * btrfs_should_ignore_reloc_root() as it may conflict with + * current running relocation and lead to missing root. + * + * For general purpose backref cache, reloc root detection is + * completely relying on direct backref (key->offset is parent + * bytenr), thus only do such check for reloc cache. + */ + if (btrfs_should_ignore_reloc_root(root) && cache->is_reloc) { + btrfs_put_root(root); + list_add(&cur->list, &cache->useless_node); + } else { + cur->root = root; + } + return 0; + } + + level = cur->level + 1; + + /* Search the tree to find parent blocks referring to the block */ + path->search_commit_root = 1; + path->skip_locking = 1; + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + btrfs_put_root(root); + return ret; + } + if (ret > 0 && path->slots[level] > 0) + path->slots[level]--; + + eb = path->nodes[level]; + if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) { + btrfs_err(fs_info, +"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", + cur->bytenr, level - 1, root->root_key.objectid, + tree_key->objectid, tree_key->type, tree_key->offset); + btrfs_put_root(root); + ret = -ENOENT; + goto out; + } + lower = cur; + + /* Add all nodes and edges in the path */ + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) { + ASSERT(btrfs_root_bytenr(&root->root_item) == + lower->bytenr); + /* Same as previous should_ignore_reloc_root() call */ + if (btrfs_should_ignore_reloc_root(root) && + cache->is_reloc) { + btrfs_put_root(root); + list_add(&lower->list, &cache->useless_node); + } else { + lower->root = root; + } + break; + } + + edge = btrfs_backref_alloc_edge(cache); + if (!edge) { + btrfs_put_root(root); + ret = -ENOMEM; + goto out; + } + + eb = path->nodes[level]; + rb_node = rb_simple_search(&cache->rb_root, eb->start); + if (!rb_node) { + upper = btrfs_backref_alloc_node(cache, eb->start, + lower->level + 1); + if (!upper) { + btrfs_put_root(root); + btrfs_backref_free_edge(cache, edge); + ret = -ENOMEM; + goto out; + } + upper->owner = btrfs_header_owner(eb); + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + upper->cowonly = 1; + + /* + * If we know the block isn't shared we can avoid + * checking its backrefs. + */ + if (btrfs_block_can_be_shared(root, eb)) + upper->checked = 0; + else + upper->checked = 1; + + /* + * Add the block to pending list if we need to check its + * backrefs, we only do this once while walking up a + * tree as we will catch anything else later on. + */ + if (!upper->checked && need_check) { + need_check = false; + list_add_tail(&edge->list[UPPER], + &cache->pending_edge); + } else { + if (upper->checked) + need_check = true; + INIT_LIST_HEAD(&edge->list[UPPER]); + } + } else { + upper = rb_entry(rb_node, struct btrfs_backref_node, + rb_node); + ASSERT(upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + if (!upper->owner) + upper->owner = btrfs_header_owner(eb); + } + btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER); + + if (rb_node) { + btrfs_put_root(root); + break; + } + lower = upper; + upper = NULL; + } +out: + btrfs_release_path(path); + return ret; +} + +/* + * Add backref node @cur into @cache. + * + * NOTE: Even if the function returned 0, @cur is not yet cached as its upper + * links aren't yet bi-directional. Needs to finish such links. + * Use btrfs_backref_finish_upper_links() to finish such linkage. + * + * @path: Released path for indirect tree backref lookup + * @iter: Released backref iter for extent tree search + * @node_key: The first key of the tree block + */ +int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, + struct btrfs_path *path, + struct btrfs_backref_iter *iter, + struct btrfs_key *node_key, + struct btrfs_backref_node *cur) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_backref_edge *edge; + struct btrfs_backref_node *exist; + int ret; + + ret = btrfs_backref_iter_start(iter, cur->bytenr); + if (ret < 0) + return ret; + /* + * We skip the first btrfs_tree_block_info, as we don't use the key + * stored in it, but fetch it from the tree block + */ + if (btrfs_backref_has_tree_block_info(iter)) { + ret = btrfs_backref_iter_next(iter); + if (ret < 0) + goto out; + /* No extra backref? This means the tree block is corrupted */ + if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + WARN_ON(cur->checked); + if (!list_empty(&cur->upper)) { + /* + * The backref was added previously when processing backref of + * type BTRFS_TREE_BLOCK_REF_KEY + */ + ASSERT(list_is_singular(&cur->upper)); + edge = list_entry(cur->upper.next, struct btrfs_backref_edge, + list[LOWER]); + ASSERT(list_empty(&edge->list[UPPER])); + exist = edge->node[UPPER]; + /* + * Add the upper level block to pending list if we need check + * its backrefs + */ + if (!exist->checked) + list_add_tail(&edge->list[UPPER], &cache->pending_edge); + } else { + exist = NULL; + } + + for (; ret == 0; ret = btrfs_backref_iter_next(iter)) { + struct extent_buffer *eb; + struct btrfs_key key; + int type; + + cond_resched(); + eb = btrfs_backref_get_eb(iter); + + key.objectid = iter->bytenr; + if (btrfs_backref_iter_is_inline_ref(iter)) { + struct btrfs_extent_inline_ref *iref; + + /* Update key for inline backref */ + iref = (struct btrfs_extent_inline_ref *) + ((unsigned long)iter->cur_ptr); + type = btrfs_get_extent_inline_ref_type(eb, iref, + BTRFS_REF_TYPE_BLOCK); + if (type == BTRFS_REF_TYPE_INVALID) { + ret = -EUCLEAN; + goto out; + } + key.type = type; + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + } else { + key.type = iter->cur_key.type; + key.offset = iter->cur_key.offset; + } + + /* + * Parent node found and matches current inline ref, no need to + * rebuild this node for this inline ref + */ + if (exist && + ((key.type == BTRFS_TREE_BLOCK_REF_KEY && + exist->owner == key.offset) || + (key.type == BTRFS_SHARED_BLOCK_REF_KEY && + exist->bytenr == key.offset))) { + exist = NULL; + continue; + } + + /* SHARED_BLOCK_REF means key.offset is the parent bytenr */ + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { + ret = handle_direct_tree_backref(cache, &key, cur); + if (ret < 0) + goto out; + continue; + } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { + ret = -EINVAL; + btrfs_print_v0_err(fs_info); + btrfs_handle_fs_error(fs_info, ret, NULL); + goto out; + } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { + continue; + } + + /* + * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset + * means the root objectid. We need to search the tree to get + * its parent bytenr. + */ + ret = handle_indirect_tree_backref(cache, path, &key, node_key, + cur); + if (ret < 0) + goto out; + } + ret = 0; + cur->checked = 1; + WARN_ON(exist); +out: + btrfs_backref_iter_release(iter); + return ret; +} + +/* + * Finish the upwards linkage created by btrfs_backref_add_tree_node() + */ +int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *start) +{ + struct list_head *useless_node = &cache->useless_node; + struct btrfs_backref_edge *edge; + struct rb_node *rb_node; + LIST_HEAD(pending_edge); + + ASSERT(start->checked); + + /* Insert this node to cache if it's not COW-only */ + if (!start->cowonly) { + rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, + &start->rb_node); + if (rb_node) + btrfs_backref_panic(cache->fs_info, start->bytenr, + -EEXIST); + list_add_tail(&start->lower, &cache->leaves); + } + + /* + * Use breadth first search to iterate all related edges. + * + * The starting points are all the edges of this node + */ + list_for_each_entry(edge, &start->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &pending_edge); + + while (!list_empty(&pending_edge)) { + struct btrfs_backref_node *upper; + struct btrfs_backref_node *lower; + + edge = list_first_entry(&pending_edge, + struct btrfs_backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + upper = edge->node[UPPER]; + lower = edge->node[LOWER]; + + /* Parent is detached, no need to keep any edges */ + if (upper->detached) { + list_del(&edge->list[LOWER]); + btrfs_backref_free_edge(cache, edge); + + /* Lower node is orphan, queue for cleanup */ + if (list_empty(&lower->upper)) + list_add(&lower->list, useless_node); + continue; + } + + /* + * All new nodes added in current build_backref_tree() haven't + * been linked to the cache rb tree. + * So if we have upper->rb_node populated, this means a cache + * hit. We only need to link the edge, as @upper and all its + * parents have already been linked. + */ + if (!RB_EMPTY_NODE(&upper->rb_node)) { + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + continue; + } + + /* Sanity check, we shouldn't have any unchecked nodes */ + if (!upper->checked) { + ASSERT(0); + return -EUCLEAN; + } + + /* Sanity check, COW-only node has non-COW-only parent */ + if (start->cowonly != upper->cowonly) { + ASSERT(0); + return -EUCLEAN; + } + + /* Only cache non-COW-only (subvolume trees) tree blocks */ + if (!upper->cowonly) { + rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + if (rb_node) { + btrfs_backref_panic(cache->fs_info, + upper->bytenr, -EEXIST); + return -EUCLEAN; + } + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + + /* + * Also queue all the parent edges of this uncached node + * to finish the upper linkage + */ + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &pending_edge); + } + return 0; +} + +void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + struct btrfs_backref_node *lower; + struct btrfs_backref_node *upper; + struct btrfs_backref_edge *edge; + + while (!list_empty(&cache->useless_node)) { + lower = list_first_entry(&cache->useless_node, + struct btrfs_backref_node, list); + list_del_init(&lower->list); + } + while (!list_empty(&cache->pending_edge)) { + edge = list_first_entry(&cache->pending_edge, + struct btrfs_backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + upper = edge->node[UPPER]; + btrfs_backref_free_edge(cache, edge); + + /* + * Lower is no longer linked to any upper backref nodes and + * isn't in the cache, we can free it ourselves. + */ + if (list_empty(&lower->upper) && + RB_EMPTY_NODE(&lower->rb_node)) + list_add(&lower->list, &cache->useless_node); + + if (!RB_EMPTY_NODE(&upper->rb_node)) + continue; + + /* Add this guy's upper edges to the list to process */ + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], + &cache->pending_edge); + if (list_empty(&upper->upper)) + list_add(&upper->list, &cache->useless_node); + } + + while (!list_empty(&cache->useless_node)) { + lower = list_first_entry(&cache->useless_node, + struct btrfs_backref_node, list); + list_del_init(&lower->list); + if (lower == node) + node = NULL; + btrfs_backref_drop_node(cache, lower); + } + + btrfs_backref_cleanup_node(cache, node); + ASSERT(list_empty(&cache->useless_node) && + list_empty(&cache->pending_edge)); +} |