From ecdcf3c259e4c36ec6c81e7a807b4924be898b20 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 22 Oct 2020 18:40:46 +0300 Subject: btrfs: open code insert_orphan_item Just open code it in its sole caller and remove a level of indirection. Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 56cbc1706b6f..135cb40295c1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1564,18 +1564,6 @@ out: return ret; } -static int insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 ino) -{ - int ret; - - ret = btrfs_insert_orphan_item(trans, root, ino); - if (ret == -EEXIST) - ret = 0; - - return ret; -} - static int count_inode_extrefs(struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path) { @@ -1727,7 +1715,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (ret) goto out; } - ret = insert_orphan_item(trans, root, ino); + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; } out: -- cgit v1.2.3-59-g8ed1b From ac5887c8e013d6754d36e6d51dc03448ee0b0065 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:10 -0400 Subject: btrfs: locking: remove all the blocking helpers Now that we're using a rw_semaphore we no longer need to indicate if a lock is blocking or not, nor do we need to flip the entire path from blocking to spinning. Remove these helpers and all the places they are called. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 10 ++---- fs/btrfs/ctree.c | 91 +++++++----------------------------------------- fs/btrfs/delayed-inode.c | 7 ---- fs/btrfs/disk-io.c | 8 ++--- fs/btrfs/extent-tree.c | 19 ++++------ fs/btrfs/file.c | 3 +- fs/btrfs/inode.c | 1 - fs/btrfs/locking.c | 74 --------------------------------------- fs/btrfs/locking.h | 11 +----- fs/btrfs/qgroup.c | 9 ++--- fs/btrfs/ref-verify.c | 6 ++-- fs/btrfs/relocation.c | 4 --- fs/btrfs/transaction.c | 2 -- fs/btrfs/tree-defrag.c | 1 - fs/btrfs/tree-log.c | 3 -- 15 files changed, 30 insertions(+), 219 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 771a036867dc..deef23b36c5b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1341,14 +1341,12 @@ again: goto out; } - if (!path->skip_locking) { + if (!path->skip_locking) btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie, ignore_offset); if (!path->skip_locking) - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); if (ret < 0) goto out; @@ -1685,7 +1683,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, name_off, name_len); if (eb != eb_in) { if (!path->skip_locking) - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); } ret = btrfs_find_item(fs_root, path, parent, 0, @@ -1705,8 +1703,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ if (eb != eb_in) { - if (!path->skip_locking) - btrfs_set_lock_blocking_read(eb); path->nodes[0] = NULL; path->locks[0] = 0; } diff --git a/fs/btrfs/ctree.c 
b/fs/btrfs/ctree.c index 113da62dc17f..ac8ebdf4d886 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1278,14 +1278,11 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (!tm) return eb; - btrfs_set_path_blocking(path); - btrfs_set_lock_blocking_read(eb); - if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); if (!eb_rewin) { - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); return NULL; } @@ -1297,13 +1294,13 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, } else { eb_rewin = btrfs_clone_extent_buffer(eb); if (!eb_rewin) { - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); return NULL; } } - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), @@ -1373,9 +1370,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(fs_info, logical); } else { - btrfs_set_lock_blocking_read(eb_root); eb = btrfs_clone_extent_buffer(eb_root); - btrfs_tree_read_unlock_blocking(eb_root); + btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); } @@ -1483,10 +1479,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, search_start = buf->start & ~((u64)SZ_1G - 1); - if (parent) - btrfs_set_lock_blocking_write(parent); - btrfs_set_lock_blocking_write(buf); - /* * Before CoWing this block for later modification, check if it's * the subtree root and do the delayed subtree trace if needed. @@ -1604,8 +1596,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (parent_nritems <= 1) return 0; - btrfs_set_lock_blocking_write(parent); - for (i = start_slot; i <= end_slot; i++) { struct btrfs_key first_key; int close = 1; @@ -1663,7 +1653,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, search_start = last_block; btrfs_tree_lock(cur); - btrfs_set_lock_blocking_write(cur); err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, @@ -1835,8 +1824,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, mid = path->nodes[level]; - WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && - path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK); WARN_ON(btrfs_header_generation(mid) != trans->transid); orig_ptr = btrfs_node_blockptr(mid, orig_slot); @@ -1865,7 +1853,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } btrfs_tree_lock(child); - btrfs_set_lock_blocking_write(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, BTRFS_NESTING_COW); if (ret) { @@ -1904,7 +1891,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (left) { __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); - btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, BTRFS_NESTING_LEFT_COW); @@ -1920,7 +1906,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (right) { __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); - btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, BTRFS_NESTING_RIGHT_COW); @@ -2084,7 +2069,6 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 left_nr; __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); - 
btrfs_set_lock_blocking_write(left); left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -2139,7 +2123,6 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 right_nr; __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); - btrfs_set_lock_blocking_write(right); right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -2399,14 +2382,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, return 0; } - /* the pages were up to date, but we failed - * the generation number check. Do a full - * read for the generation number that is correct. - * We must do this without dropping locks so - * we can trust our generation number - */ - btrfs_set_path_blocking(p); - /* now we're allowed to do a blocking uptodate check */ ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); if (!ret) { @@ -2426,7 +2401,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * out which blocks to read. */ btrfs_unlock_up_safe(p, level + 1); - btrfs_set_path_blocking(p); if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); @@ -2480,7 +2454,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - btrfs_set_path_blocking(p); reada_for_balance(fs_info, p, level); sret = split_node(trans, root, p, level); @@ -2500,7 +2473,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - btrfs_set_path_blocking(p); reada_for_balance(fs_info, p, level); sret = balance_level(trans, root, p, level); @@ -2752,7 +2724,6 @@ again: goto again; } - btrfs_set_path_blocking(p); if (last_level) err = btrfs_cow_block(trans, root, b, NULL, 0, &b, @@ -2822,7 +2793,6 @@ cow_done: goto again; } - btrfs_set_path_blocking(p); err = split_leaf(trans, root, key, p, ins_len, ret == 0); @@ -2884,17 +2854,11 @@ cow_done: if (!p->skip_locking) { level = btrfs_header_level(b); if (level <= write_lock_level) { - if (!btrfs_try_tree_write_lock(b)) { - btrfs_set_path_blocking(p); - btrfs_tree_lock(b); - } + btrfs_tree_lock(b); p->locks[level] = BTRFS_WRITE_LOCK; } else { - if (!btrfs_tree_read_lock_atomic(b)) { - btrfs_set_path_blocking(p); - __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, - p->recurse); - } + __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, + p->recurse); p->locks[level] = BTRFS_READ_LOCK; } p->nodes[level] = b; @@ -2902,12 +2866,6 @@ cow_done: } ret = 1; done: - /* - * we don't really know what they plan on doing with the path - * from here on, so for now just mark it as blocking - */ - if (!p->leave_spinning) - btrfs_set_path_blocking(p); if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); return ret; @@ -2999,10 +2957,7 @@ again: } level = btrfs_header_level(b); - if (!btrfs_tree_read_lock_atomic(b)) { - btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); - } + btrfs_tree_read_lock(b); b = tree_mod_log_rewind(fs_info, p, b, time_seq); if (!b) { ret = -ENOMEM; @@ -3013,8 +2968,6 @@ again: } ret = 1; done: - if (!p->leave_spinning) - btrfs_set_path_blocking(p); if (ret < 0) btrfs_release_path(p); @@ -3441,7 +3394,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); atomic_inc(&c->refs); path->nodes[level] = c; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; path->slots[level] = 0; return 0; } @@ -3814,7 +3767,6 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 1; 
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); - btrfs_set_lock_blocking_write(right); free_space = btrfs_leaf_free_space(right); if (free_space < data_size) @@ -4053,7 +4005,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root return 1; __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); - btrfs_set_lock_blocking_write(left); free_space = btrfs_leaf_free_space(left); if (free_space < data_size) { @@ -4448,7 +4399,6 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, goto err; } - btrfs_set_path_blocking(path); ret = split_leaf(trans, root, &key, path, ins_len, 1); if (ret) goto err; @@ -4478,8 +4428,6 @@ static noinline int split_item(struct btrfs_path *path, leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); - btrfs_set_path_blocking(path); - item = btrfs_item_nr(path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); @@ -5055,7 +5003,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - btrfs_set_path_blocking(path); btrfs_clean_tree_block(leaf); btrfs_del_leaf(trans, root, path, leaf); } @@ -5077,7 +5024,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, slot = path->slots[1]; atomic_inc(&leaf->refs); - btrfs_set_path_blocking(path); wret = push_leaf_left(trans, root, path, 1, 1, 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) @@ -5248,7 +5194,6 @@ find_next_key: */ if (slot >= nritems) { path->slots[level] = slot; - btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, min_trans); if (sret == 0) { @@ -5265,7 +5210,6 @@ find_next_key: ret = 0; goto out; } - btrfs_set_path_blocking(path); cur = btrfs_read_node_slot(cur, slot); if (IS_ERR(cur)) { ret = PTR_ERR(cur); @@ -5282,7 +5226,6 @@ out: path->keep_locks = keep_locks; if (ret == 0) { btrfs_unlock_up_safe(path, path->lowest_level + 1); - btrfs_set_path_blocking(path); memcpy(min_key, &found_key, sizeof(found_key)); } return ret; @@ -5492,7 +5435,6 @@ again: goto again; } if (!ret) { - btrfs_set_path_blocking(path); __btrfs_tree_read_lock(next, BTRFS_NESTING_RIGHT, path->recurse); @@ -5527,13 +5469,8 @@ again: } if (!path->skip_locking) { - ret = btrfs_try_tree_read_lock(next); - if (!ret) { - btrfs_set_path_blocking(path); - __btrfs_tree_read_lock(next, - BTRFS_NESTING_RIGHT, - path->recurse); - } + __btrfs_tree_read_lock(next, BTRFS_NESTING_RIGHT, + path->recurse); next_rw_lock = BTRFS_READ_LOCK; } } @@ -5541,8 +5478,6 @@ again: done: unlock_up(path, 0, 1, 0, NULL); path->leave_spinning = old_spinning; - if (!old_spinning) - btrfs_set_path_blocking(path); return ret; } @@ -5564,7 +5499,6 @@ int btrfs_previous_item(struct btrfs_root *root, while (1) { if (path->slots[0] == 0) { - btrfs_set_path_blocking(path); ret = btrfs_prev_leaf(root, path); if (ret != 0) return ret; @@ -5606,7 +5540,6 @@ int btrfs_previous_extent_item(struct btrfs_root *root, while (1) { if (path->slots[0] == 0) { - btrfs_set_path_blocking(path); ret = btrfs_prev_leaf(root, path); if (ret != 0) return ret; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 5aba81e16113..4b1cb4e17c03 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -740,13 +740,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, goto out; } - /* - * we need allocate some memory space, but it might cause the task - * to sleep, so we set all locked 
nodes in the path to blocking locks - * first. - */ - btrfs_set_path_blocking(path); - keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS); if (!keys) { ret = -ENOMEM; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c70a52b44ceb..212806d59012 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -250,10 +250,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; - if (need_lock) { + if (need_lock) btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - } lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); @@ -282,7 +280,7 @@ out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (need_lock) - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); return ret; } @@ -1013,8 +1011,6 @@ void btrfs_clean_tree_block(struct extent_buffer *buf) percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -buf->len, fs_info->dirty_metadata_batch); - /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking_write(buf); clear_extent_buffer_dirty(buf); } } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a27caa47aa62..24f3bde8ce5d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4665,7 +4665,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - btrfs_set_lock_blocking_write(buf); set_extent_buffer_uptodate(buf); memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); @@ -5054,7 +5053,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, reada = 1; } btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], @@ -5114,7 +5112,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, return -EIO; } btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); } level--; @@ -5126,7 +5123,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } path->nodes[level] = next; path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; wc->level = level; if (wc->level == 1) wc->reada_slot = 0; @@ -5254,8 +5251,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level]) { BUG_ON(level == 0); btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, @@ -5298,8 +5294,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level] && btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; } btrfs_clean_tree_block(eb); } @@ -5467,9 +5462,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); - btrfs_set_lock_blocking_write(path->nodes[level]); path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; memset(&wc->update_progress, 0, sizeof(wc->update_progress)); } else { @@ -5497,8 +5491,7 @@ int 
btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) level = btrfs_header_level(root->node); while (1) { btrfs_tree_lock(path->nodes[level]); - btrfs_set_lock_blocking_write(path->nodes[level]); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, @@ -5685,7 +5678,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; wc->refs[parent_level] = 1; wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5abdc7332dfc..97b5b183272f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -991,8 +991,7 @@ delete_extent_item: * write lock. */ if (!ret && replace_extent && leafs_visited == 1 && - (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || - path->locks[0] == BTRFS_WRITE_LOCK) && + path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >= sizeof(struct btrfs_item) + extent_item_size) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d1915049d7e7..205ea4e6f1ec 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6789,7 +6789,6 @@ next: em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; - btrfs_set_path_blocking(path); if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 60e0f00b9b8f..5260660b655a 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -50,31 +50,6 @@ * */ -/* - * Mark already held read lock as blocking. Can be nested in write lock by the - * same thread. - * - * Use when there are potentially long operations ahead so other thread waiting - * on the lock will not actively spin but sleep instead. - * - * The rwlock is released and blocking reader counter is increased. - */ -void btrfs_set_lock_blocking_read(struct extent_buffer *eb) -{ -} - -/* - * Mark already held write lock as blocking. - * - * Use when there are potentially long operations ahead so other threads - * waiting on the lock will not actively spin but sleep instead. - * - * The rwlock is released and blocking writers is set. - */ -void btrfs_set_lock_blocking_write(struct extent_buffer *eb) -{ -} - /* * __btrfs_tree_read_lock - lock extent buffer for read * @eb: the eb to be locked @@ -130,17 +105,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false); } -/* - * Lock extent buffer for read, optimistically expecting that there are no - * contending blocking writers. If there are, don't wait. - * - * Return 1 if the rwlock has been taken, 0 otherwise - */ -int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) -{ - return btrfs_try_tree_read_lock(eb); -} - /* * Try-lock for read. * @@ -192,18 +156,6 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) up_read(&eb->lock); } -/* - * Release read lock, previously set to blocking by a pairing call to - * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same - * thread. - * - * State of rwlock is unchanged, last reader wakes waiting threads. 
- */ -void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) -{ - btrfs_tree_read_unlock(eb); -} - /* * __btrfs_tree_lock - lock eb for write * @eb: the eb to lock @@ -239,32 +191,6 @@ void btrfs_tree_unlock(struct extent_buffer *eb) up_write(&eb->lock); } -/* - * Set all locked nodes in the path to blocking locks. This should be done - * before scheduling - */ -void btrfs_set_path_blocking(struct btrfs_path *p) -{ - int i; - - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (!p->nodes[i] || !p->locks[i]) - continue; - /* - * If we currently have a spinning reader or writer lock this - * will bump the count of blocking holders and drop the - * spinlock. - */ - if (p->locks[i] == BTRFS_READ_LOCK) { - btrfs_set_lock_blocking_read(p->nodes[i]); - p->locks[i] = BTRFS_READ_LOCK_BLOCKING; - } else if (p->locks[i] == BTRFS_WRITE_LOCK) { - btrfs_set_lock_blocking_write(p->nodes[i]); - p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; - } - } -} - /* * This releases any locks held in the path starting at level and going all the * way up to the root. diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 7c27f142f7d2..f8f2fd835582 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -13,8 +13,6 @@ #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 -#define BTRFS_WRITE_LOCK_BLOCKING 3 -#define BTRFS_READ_LOCK_BLOCKING 4 /* * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at @@ -93,12 +91,8 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne bool recurse); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); -void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); -void btrfs_set_lock_blocking_read(struct extent_buffer *eb); -void btrfs_set_lock_blocking_write(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); -int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, bool recurse); @@ -116,15 +110,12 @@ static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { } #endif -void btrfs_set_path_blocking(struct btrfs_path *p); void btrfs_unlock_up_safe(struct btrfs_path *path, int level); static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) { - if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) + if (rw == BTRFS_WRITE_LOCK) btrfs_tree_unlock(eb); - else if (rw == BTRFS_READ_LOCK_BLOCKING) - btrfs_tree_read_unlock_blocking(eb); else if (rw == BTRFS_READ_LOCK) btrfs_tree_read_unlock(eb); else diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 87bd37b70738..9e97d3172a24 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1970,8 +1970,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, src_path->nodes[cur_level] = eb; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + src_path->locks[cur_level] = BTRFS_READ_LOCK; } src_path->slots[cur_level] = dst_path->slots[cur_level]; @@ -2111,8 +2110,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, dst_path->slots[cur_level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + 
dst_path->locks[cur_level] = BTRFS_READ_LOCK; need_cleanup = true; } @@ -2286,8 +2284,7 @@ walk_down: path->slots[level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + path->locks[level] = BTRFS_READ_LOCK; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, fs_info->nodesize, diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 14b0c0dcb08c..488bc3dd3c2b 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -575,10 +575,9 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, return -EIO; } btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); path->nodes[level-1] = eb; path->slots[level-1] = 0; - path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING; + path->locks[level-1] = BTRFS_READ_LOCK; } else { ret = process_leaf(root, path, bytenr, num_bytes); if (ret) @@ -1000,11 +999,10 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) return -ENOMEM; eb = btrfs_read_lock_root_node(fs_info->extent_root); - btrfs_set_lock_blocking_read(eb); level = btrfs_header_level(eb); path->nodes[level] = eb; path->slots[level] = 0; - path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + path->locks[level] = BTRFS_READ_LOCK; while (1) { /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3d4618a01ef3..0b3ccf464c3d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1196,7 +1196,6 @@ again: btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); eb = btrfs_lock_root_node(dest); - btrfs_set_lock_blocking_write(eb); level = btrfs_header_level(eb); if (level < lowest_level) { @@ -1210,7 +1209,6 @@ again: BTRFS_NESTING_COW); BUG_ON(ret); } - btrfs_set_lock_blocking_write(eb); if (next_key) { next_key->objectid = (u64)-1; @@ -1279,7 +1277,6 @@ again: BTRFS_NESTING_COW); BUG_ON(ret); } - btrfs_set_lock_blocking_write(eb); btrfs_tree_unlock(parent); free_extent_buffer(parent); @@ -2309,7 +2306,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, goto next; } btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8f70d7135497..931bef8cdd4c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1598,8 +1598,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - btrfs_set_lock_blocking_write(old); - ret = btrfs_copy_root(trans, root, old, &tmp, objectid); /* clean up in any case */ btrfs_tree_unlock(old); diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index d3f28b8f4ff9..7c45d960b53c 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -52,7 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, u32 nritems; root_node = btrfs_lock_root_node(root); - btrfs_set_lock_blocking_write(root_node); nritems = btrfs_header_nritems(root_node); root->defrag_max.objectid = 0; /* from above we know this is not a leaf */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 135cb40295c1..eb86c632535a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2722,7 +2722,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -2791,7 +2790,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { 
btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -2873,7 +2871,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); -- cgit v1.2.3-59-g8ed1b From 3fbaf25817f7013fad3ccf76279f0bd5719a5205 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Nov 2020 10:45:20 -0500 Subject: btrfs: pass the owner_root and level to alloc_extent_buffer Now that we've plumbed all of the callers to have the owner root and the level, plumb it down into alloc_extent_buffer(). Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 ++++--- fs/btrfs/disk-io.h | 3 ++- fs/btrfs/extent-tree.c | 5 +++-- fs/btrfs/extent_io.c | 12 ++++++++---- fs/btrfs/extent_io.h | 4 ++-- fs/btrfs/reada.c | 8 +++++--- fs/btrfs/relocation.c | 3 ++- fs/btrfs/tree-log.c | 4 +++- fs/btrfs/volumes.c | 3 ++- 9 files changed, 31 insertions(+), 18 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 763121380942..6abe3c8f025d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -955,11 +955,12 @@ static const struct address_space_operations btree_aops = { struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, - u64 bytenr) + u64 bytenr, u64 owner_root, + int level) { if (btrfs_is_testing(fs_info)) return alloc_test_extent_buffer(fs_info, bytenr); - return alloc_extent_buffer(fs_info, bytenr); + return alloc_extent_buffer(fs_info, bytenr, owner_root, level); } /* @@ -978,7 +979,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); if (IS_ERR(buf)) return buf; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 41588babf2ed..e75ea6092942 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -47,7 +47,8 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, int level, struct btrfs_key *first_key); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, - u64 bytenr); + u64 bytenr, u64 owner_root, + int level); void btrfs_clean_tree_block(struct extent_buffer *buf); int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 90f1ec0802bd..0d1b12e5b166 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4617,7 +4617,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(fs_info, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); if (IS_ERR(buf)) return buf; @@ -5018,7 +5018,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = find_extent_buffer(fs_info, bytenr); if (!next) { - next = btrfs_find_create_tree_block(fs_info, bytenr); + next = btrfs_find_create_tree_block(fs_info, bytenr, + root->root_key.objectid, level - 1); if (IS_ERR(next)) return PTR_ERR(next); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 
973254b1f216..bd0d4d195d76 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5180,7 +5180,7 @@ free_eb: #endif struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u64 owner_root, int level) { unsigned long len = fs_info->nodesize; int num_pages; @@ -6128,19 +6128,21 @@ int try_release_extent_buffer(struct page *page) * btrfs_readahead_tree_block - attempt to readahead a child block * @fs_info: the fs_info * @bytenr: bytenr to read + * @owner_root: objectid of the root that owns this eb * @gen: generation for the uptodate check, can be 0 + * @level: level for the eb * * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a * normal uptodate check of the eb, without checking the generation. If we have * to read the block we will not block on anything. */ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 gen) + u64 bytenr, u64 owner_root, u64 gen, int level) { struct extent_buffer *eb; int ret; - eb = btrfs_find_create_tree_block(fs_info, bytenr); + eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); if (IS_ERR(eb)) return; @@ -6168,5 +6170,7 @@ void btrfs_readahead_node_child(struct extent_buffer *node, int slot) { btrfs_readahead_tree_block(node->fs_info, btrfs_node_blockptr(node, slot), - btrfs_node_ptr_generation(node, slot)); + btrfs_header_owner(node), + btrfs_node_ptr_generation(node, slot), + btrfs_header_level(node) - 1); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a211e90292f8..c76697fc3120 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -182,7 +182,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u64 owner_root, int level); struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, @@ -199,7 +199,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 gen); + u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); static inline int num_extent_pages(const struct extent_buffer *eb) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index b1bea0d59e13..20fd4aa48a8c 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -656,12 +656,13 @@ static int reada_pick_zone(struct btrfs_device *dev) } static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - int mirror_num, struct extent_buffer **eb) + u64 owner_root, int level, int mirror_num, + struct extent_buffer **eb) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); if (IS_ERR(buf)) return 0; @@ -749,7 +750,8 @@ static int reada_start_machine_dev(struct btrfs_device *dev) logical = re->logical; atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb); + ret = reada_tree_block_flagged(fs_info, logical, re->owner_root, + re->level, mirror_num, &eb); if (ret) __readahead_hook(fs_info, re, NULL, ret); else if (eb) diff --git 
a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7ebada33502f..c5774a8e6ff7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2513,7 +2513,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Kick in readahead for tree blocks with missing keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) - btrfs_readahead_tree_block(fs_info, block->bytenr, 0); + btrfs_readahead_tree_block(fs_info, block->bytenr, 0, 0, + block->level); } /* Get first keys */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index eb86c632535a..b1f97be57bb3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2699,7 +2699,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); blocksize = fs_info->nodesize; - next = btrfs_find_create_tree_block(fs_info, bytenr); + next = btrfs_find_create_tree_block(fs_info, bytenr, + btrfs_header_owner(cur), + *level - 1); if (IS_ERR(next)) return PTR_ERR(next); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 814abc0b3ff3..cf9a48e27ab0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6903,7 +6903,8 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will * overallocate but we can keep it as-is, only the first page is used. */ - sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET); + sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, + root->root_key.objectid, 0); if (IS_ERR(sb)) return PTR_ERR(sb); set_extent_buffer_uptodate(sb); -- cgit v1.2.3-59-g8ed1b From 5893dfb98f257805b26e499a2d5d9190f2db7484 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 4 Nov 2020 11:07:32 +0000 Subject: btrfs: refactor btrfs_drop_extents() to make it easier to extend __btrfs_drop_extents() and its wrapper btrfs_drop_extents() take many arguments, which makes it hard to add new ones and requires changing every caller. I added a couple myself back in 2014, in commit 1acae57b161e ("Btrfs: faster file extent item replace operations"), and therefore know firsthand that adding arguments to these functions is cumbersome. Since I will need to add more arguments in a subsequent bug fix, this change is preparatory work: it adds a data structure that holds all the arguments, both input and output, that are passed to this function, with comments in the structure's definition explaining what each field is and how it relates to the other fields. Callers need only zero out the structure and set up the fields they need. This also removes the need to have both __btrfs_drop_extents() and btrfs_drop_extents(): we now have a single function, btrfs_drop_extents(), that takes a pointer to the new data structure (struct btrfs_drop_extents_args).
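For callers, the conversion is mechanical: zero the structure, fill in the fields that used to be positional arguments, and pass its address. A minimal sketch with generic variable names (the real call sites are updated in the diff below):

	struct btrfs_drop_extents_args drop_args = { 0 };

	drop_args.start = start;
	drop_args.end = end;
	drop_args.drop_cache = true;

	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);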
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 63 ++++++++++++++++++--- fs/btrfs/file.c | 154 +++++++++++++++++++++++++++------------------------- fs/btrfs/inode.c | 41 +++++++++----- fs/btrfs/reflink.c | 7 ++- fs/btrfs/tree-log.c | 28 +++++++--- 5 files changed, 186 insertions(+), 107 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 461d1d52aaba..3071b0eccd82 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1233,6 +1233,58 @@ struct btrfs_replace_extent_info { int insertions; }; +/* Arguments for btrfs_drop_extents() */ +struct btrfs_drop_extents_args { + /* Input parameters */ + + /* + * If NULL, btrfs_drop_extents() will allocate and free its own path. + * If 'replace_extent' is true, this must not be NULL. Also the path + * is always released except if 'replace_extent' is true and + * btrfs_drop_extents() sets 'extent_inserted' to true, in which case + * the path is kept locked. + */ + struct btrfs_path *path; + /* Start offset of the range to drop extents from */ + u64 start; + /* End (exclusive, last byte + 1) of the range to drop extents from */ + u64 end; + /* If true drop all the extent maps in the range */ + bool drop_cache; + /* + * If true it means we want to insert a new extent after dropping all + * the extents in the range. If this is true, the 'extent_item_size' + * parameter must be set as well and the 'extent_inserted' field will + * be set to true by btrfs_drop_extents() if it could insert the new + * extent. + * Note: when this is set to true the path must not be NULL. + */ + bool replace_extent; + /* + * Used if 'replace_extent' is true. Size of the file extent item to + * insert after dropping all existing extents in the range + */ + u32 extent_item_size; + + /* Output parameters */ + + /* + * Set to the minimum between the input parameter 'end' and the end + * (exclusive, last byte + 1) of the last dropped extent. This is always + * set even if btrfs_drop_extents() returns an error. + */ + u64 drop_end; + /* + * Only set if 'replace_extent' is true. Set to true if we were able + * to insert a replacement extent after dropping all extents in the + * range, otherwise set to false by btrfs_drop_extents(). + * Also, if btrfs_drop_extents() has set this to true it means it + * returned with the path locked, otherwise if it has set this to + * false it has returned with the path released. 
+ */ + bool extent_inserted; +}; + struct btrfs_file_private { void *filldir_buf; }; @@ -3119,16 +3171,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, int skip_pinned); extern const struct file_operations btrfs_file_operations; -int __btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache, - int replace_extent, - u32 extent_item_size, - int *key_inserted); int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, - u64 end, int drop_cache); + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args); int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, struct btrfs_replace_extent_info *extent_info, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97b5b183272f..1648b6bfa2e7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -678,13 +678,9 @@ next: * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. */ -int __btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache, - int replace_extent, - u32 extent_item_size, - int *key_inserted) +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; @@ -694,12 +690,12 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_key new_key; struct inode *vfs_inode = &inode->vfs_inode; u64 ino = btrfs_ino(inode); - u64 search_start = start; + u64 search_start = args->start; u64 disk_bytenr = 0; u64 num_bytes = 0; u64 extent_offset = 0; u64 extent_end = 0; - u64 last_end = start; + u64 last_end = args->start; int del_nr = 0; int del_slot = 0; int extent_type; @@ -709,11 +705,25 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int update_refs; int found = 0; int leafs_visited = 0; + struct btrfs_path *path = args->path; + + args->extent_inserted = false; + + /* Must always have a path if ->replace_extent is true */ + ASSERT(!(args->replace_extent && !args->path)); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + } - if (drop_cache) - btrfs_drop_extent_cache(inode, start, end - 1, 0); + if (args->drop_cache) + btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0); - if (start >= inode->disk_i_size && !replace_extent) + if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || @@ -724,7 +734,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, search_start, modify_tree); if (ret < 0) break; - if (ret > 0 && path->slots[0] > 0 && search_start == start) { + if (ret > 0 && path->slots[0] > 0 && search_start == args->start) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); if (key.objectid == ino && @@ -759,7 +769,7 @@ next_slot: path->slots[0]++; goto next_slot; } - if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end) break; fi = btrfs_item_ptr(leaf, 
path->slots[0], @@ -801,7 +811,7 @@ next_slot: } found = 1; - search_start = max(key.offset, start); + search_start = max(key.offset, args->start); if (recow || !modify_tree) { modify_tree = -1; btrfs_release_path(path); @@ -812,7 +822,7 @@ next_slot: * | - range to drop - | * | -------- extent -------- | */ - if (start > key.offset && end < extent_end) { + if (args->start > key.offset && args->end < extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; @@ -820,7 +830,7 @@ next_slot: } memcpy(&new_key, &key, sizeof(new_key)); - new_key.offset = start; + new_key.offset = args->start; ret = btrfs_duplicate_item(trans, root, path, &new_key); if (ret == -EAGAIN) { @@ -834,15 +844,15 @@ next_slot: fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_num_bytes(leaf, fi, - start - key.offset); + args->start - key.offset); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - extent_offset += start - key.offset; + extent_offset += args->start - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - start); + extent_end - args->start); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) { @@ -852,11 +862,11 @@ next_slot: btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, - start - extent_offset); + args->start - extent_offset); ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); /* -ENOMEM */ } - key.offset = start; + key.offset = args->start; } /* * From here on out we will have actually dropped something, so @@ -868,23 +878,24 @@ next_slot: * | ---- range to drop ----- | * | -------- extent -------- | */ - if (start <= key.offset && end < extent_end) { + if (args->start <= key.offset && args->end < extent_end) { if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; } memcpy(&new_key, &key, sizeof(new_key)); - new_key.offset = end; + new_key.offset = args->end; btrfs_set_item_key_safe(fs_info, path, &new_key); - extent_offset += end - key.offset; + extent_offset += args->end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - end); + extent_end - args->end); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(vfs_inode, end - key.offset); + inode_sub_bytes(vfs_inode, + args->end - key.offset); break; } @@ -893,7 +904,7 @@ next_slot: * | ---- range to drop ----- | * | -------- extent -------- | */ - if (start > key.offset && end >= extent_end) { + if (args->start > key.offset && args->end >= extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; @@ -901,11 +912,12 @@ next_slot: } btrfs_set_file_extent_num_bytes(leaf, fi, - start - key.offset); + args->start - key.offset); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(vfs_inode, extent_end - start); - if (end == extent_end) + inode_sub_bytes(vfs_inode, + extent_end - args->start); + if (args->end == extent_end) break; path->slots[0]++; @@ -916,7 +928,7 @@ next_slot: * | ---- range to drop ----- | * | ------ extent ------ | */ - if (start <= key.offset && end >= extent_end) { + if (args->start <= key.offset && args->end >= extent_end) { delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; @@ -946,7 +958,7 @@ delete_extent_item: extent_end - key.offset); } - if (end == extent_end) + if (args->end == extent_end) break; 
if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { @@ -976,7 +988,7 @@ delete_extent_item: * Set path->slots[0] to first slot, so that after the delete * if items are move off from our leaf to its immediate left or * right neighbor leafs, we end up with a correct and adjusted - * path->slots[0] for our insertion (if replace_extent != 0). + * path->slots[0] for our insertion (if args->replace_extent). */ path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); @@ -990,14 +1002,14 @@ delete_extent_item: * which case it unlocked our path, so check path->locks[0] matches a * write lock. */ - if (!ret && replace_extent && leafs_visited == 1 && + if (!ret && args->replace_extent && leafs_visited == 1 && path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >= - sizeof(struct btrfs_item) + extent_item_size) { + sizeof(struct btrfs_item) + args->extent_item_size) { key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; + key.offset = args->start; if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { struct btrfs_key slot_key; @@ -1005,30 +1017,18 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, &extent_item_size, 1); - *key_inserted = 1; + setup_items_for_insert(root, path, &key, + &args->extent_item_size, 1); + args->extent_inserted = true; } - if (!replace_extent || !(*key_inserted)) + if (!args->path) + btrfs_free_path(path); + else if (!args->extent_inserted) btrfs_release_path(path); - if (drop_end) - *drop_end = found ? min(end, last_end) : end; - return ret; -} - -int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, - u64 end, int drop_cache) -{ - struct btrfs_path *path; - int ret; +out: + args->drop_end = found ? 
min(args->end, last_end) : args->end; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start, - end, NULL, drop_cache, 0, 0, NULL); - btrfs_free_path(path); return ret; } @@ -2607,6 +2607,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); u64 ino_size = round_up(inode->i_size, fs_info->sectorsize); @@ -2615,7 +2616,6 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, struct btrfs_block_rsv *rsv; unsigned int rsv_count; u64 cur_offset; - u64 drop_end; u64 len = end - start; int ret = 0; @@ -2654,10 +2654,12 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, trans->block_rsv = rsv; cur_offset = start; + drop_args.path = path; + drop_args.end = end + 1; + drop_args.drop_cache = true; while (cur_offset < end) { - ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, - cur_offset, end + 1, &drop_end, - 1, 0, 0, NULL); + drop_args.start = cur_offset; + ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); if (ret != -ENOSPC) { /* * When cloning we want to avoid transaction aborts when @@ -2674,10 +2676,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, trans->block_rsv = &fs_info->trans_block_rsv; - if (!extent_info && cur_offset < drop_end && + if (!extent_info && cur_offset < drop_args.drop_end && cur_offset < ino_size) { ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); + cur_offset, drop_args.drop_end); if (ret) { /* * If we failed then we didn't insert our hole @@ -2688,7 +2690,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); break; } - } else if (!extent_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_args.drop_end) { /* * We are past the i_size here, but since we didn't * insert holes we need to clear the mapped area so we @@ -2696,7 +2698,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, * file extent is inserted here. */ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), - cur_offset, drop_end - cur_offset); + cur_offset, + drop_args.drop_end - cur_offset); if (ret) { /* * We couldn't clear our area, so we could @@ -2708,8 +2711,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } } - if (extent_info && drop_end > extent_info->file_offset) { - u64 replace_len = drop_end - extent_info->file_offset; + if (extent_info && + drop_args.drop_end > extent_info->file_offset) { + u64 replace_len = drop_args.drop_end - + extent_info->file_offset; ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, replace_len); @@ -2722,7 +2727,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, extent_info->file_offset += replace_len; } - cur_offset = drop_end; + cur_offset = drop_args.drop_end; ret = btrfs_update_inode(trans, root, inode); if (ret) @@ -2781,25 +2786,26 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, * will not record the existence of the hole region * [existing_hole_start, lockend]. 
*/ - if (drop_end <= end) - drop_end = end + 1; + if (drop_args.drop_end <= end) + drop_args.drop_end = end + 1; /* * Don't insert file hole extent item if it's for a range beyond eof * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). */ - if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) { + if (!extent_info && cur_offset < ino_size && + cur_offset < drop_args.drop_end) { ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); + cur_offset, drop_args.drop_end); if (ret) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); goto out_trans; } - } else if (!extent_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_args.drop_end) { /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), - cur_offset, drop_end - cur_offset); + cur_offset, drop_args.drop_end - cur_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d2ef8308ad0a..25764de68b92 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -202,7 +202,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * no overlapping inline items exist in the btree */ static int insert_inline_extent(struct btrfs_trans_handle *trans, - struct btrfs_path *path, int extent_inserted, + struct btrfs_path *path, bool extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, int compress_type, @@ -316,6 +316,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, int compress_type, struct page **compressed_pages) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; @@ -326,8 +327,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, u64 data_len = inline_len; int ret; struct btrfs_path *path; - int extent_inserted = 0; - u32 extent_item_size; if (compressed_size) data_len = compressed_size; @@ -353,16 +352,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, } trans->block_rsv = &inode->block_rsv; + drop_args.path = path; + drop_args.start = start; + drop_args.end = aligned_end; + drop_args.drop_cache = true; + drop_args.replace_extent = true; + if (compressed_size && compressed_pages) - extent_item_size = btrfs_file_extent_calc_inline_size( + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( compressed_size); else - extent_item_size = btrfs_file_extent_calc_inline_size( + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( inline_len); - ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end, - NULL, 1, 1, extent_item_size, - &extent_inserted); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -370,7 +373,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, path, extent_inserted, + ret = insert_inline_extent(trans, path, drop_args.extent_inserted, root, &inode->vfs_inode, start, inline_len, compressed_size, compress_type, compressed_pages); @@ -2568,7 +2571,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, u64 disk_bytenr = 
btrfs_stack_file_extent_disk_bytenr(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); - int extent_inserted = 0; + struct btrfs_drop_extents_args drop_args = { 0 }; int ret; path = btrfs_alloc_path(); @@ -2584,13 +2587,16 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, - file_pos + num_bytes, NULL, 0, - 1, sizeof(*stack_fi), &extent_inserted); + drop_args.path = path; + drop_args.start = file_pos; + drop_args.end = file_pos + num_bytes; + drop_args.replace_extent = true; + drop_args.extent_item_size = sizeof(*stack_fi); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; - if (!extent_inserted) { + if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; @@ -4748,6 +4754,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_trans_handle *trans; + struct btrfs_drop_extents_args drop_args = { 0 }; int ret; /* @@ -4770,7 +4777,11 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); + drop_args.start = offset; + drop_args.end = offset + len; + drop_args.drop_cache = true; + + ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 5d0bb7c3dc33..67728ea3ed47 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -163,6 +163,7 @@ static int clone_copy_inline_extent(struct inode *dst, const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; + struct btrfs_drop_extents_args drop_args = { 0 }; int ret; struct btrfs_key key; @@ -252,7 +253,11 @@ copy_inline_extent: trans = NULL; goto out; } - ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); + drop_args.path = path; + drop_args.start = drop_start; + drop_args.end = aligned_end; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); if (ret) goto out; ret = btrfs_insert_empty_item(trans, root, path, new_key, size); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b1f97be57bb3..89ff063cae24 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -576,6 +576,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = root->fs_info; int found_type; u64 extent_end; @@ -653,7 +654,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); + drop_args.start = start; + drop_args.end = extent_end; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); if (ret) goto out; @@ -2576,6 +2580,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, * those prealloc extents just after replaying them. 
*/ if (S_ISREG(mode)) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct inode *inode; u64 from; @@ -2586,8 +2591,12 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, } from = ALIGN(i_size_read(inode), root->fs_info->sectorsize); - ret = btrfs_drop_extents(wc->trans, root, inode, - from, (u64)-1, 1); + drop_args.start = from; + drop_args.end = (u64)-1; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(wc->trans, root, + BTRFS_I(inode), + &drop_args); if (!ret) { /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, @@ -4185,6 +4194,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_log_ctx *ctx) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; @@ -4193,19 +4203,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans, u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; - int extent_inserted = 0; ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, - em->start + em->len, NULL, 0, 1, - sizeof(*fi), &extent_inserted); + drop_args.path = path; + drop_args.start = em->start; + drop_args.end = em->start + em->len; + drop_args.replace_extent = true; + drop_args.extent_item_size = sizeof(*fi); + ret = btrfs_drop_extents(trans, log, inode, &drop_args); if (ret) return ret; - if (!extent_inserted) { + if (!drop_args.extent_inserted) { key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = em->start; -- cgit v1.2.3-59-g8ed1b From 2766ff61762c3fa19bf30bc0ff72ea5306229f09 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 4 Nov 2020 11:07:34 +0000 Subject: btrfs: update the number of bytes used by an inode atomically

There are several occasions where we do not update the inode's number of used bytes atomically, which can result in a concurrent stat(2) syscall reporting a value of used blocks that does not correspond to a valid value, that is, a value that matches neither what we had before the operation nor what we get after the operation completes. In extreme cases it can result in stat(2) reporting zero used blocks, which can cause problems for some userspace tools that may consider a file with a non-zero size and zero used blocks as completely sparse and skip reading data, as reported/discussed a long time ago in some threads like the following:

https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html

The cases where this can happen are the following:

-> Case 1

If we do a write (buffered or direct IO) against a file region for which there is already an allocated extent (or multiple extents), then we have a short time window where we can report a number of used blocks to stat(2) that does not take into account the file region being overwritten. This short time window happens when completing the ordered extent(s). This happens because when we drop the extents in the write range we decrement the inode's number of bytes and later on, when we insert the new extent(s), we increment the number of bytes in the inode, resulting in a short time window where a stat(2) syscall can get an incorrect number of used blocks. If we do writes that overwrite an entire file, then we have a short time window where we report 0 used blocks to stat(2).
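For illustration only, a minimal sketch of the problematic interleaving (the task labels are hypothetical; the two byte-count updates are the inode_sub_bytes()/inode_add_bytes() pairing whose non-atomic use this patch removes):

    /* Task A: ordered extent completion, before this patch */
    inode_sub_bytes(inode, dropped_bytes);     /* drop overlapping extents */
    /* <-- Task B: stat(2) samples the inode's byte count here and sees a
     *     value that is too small, possibly 0 for a full file overwrite */
    inode_add_bytes(inode, new_extent_bytes);  /* insert the new extent(s) */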
Example reproducer:

$ cat reproducer-1.sh
#!/bin/bash

MNT=/mnt/sdi
DEV=/dev/sdi

stat_loop()
{
    trap "wait; exit" SIGTERM
    local filepath=$1
    local expected=$2
    local got

    while :; do
        got=$(stat -c %b $filepath)
        if [ $got -ne $expected ]; then
            echo -n "ERROR: unexpected used blocks"
            echo " (got: $got expected: $expected)"
        fi
    done
}

mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT

xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)

# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes; it should
# not, because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!

for ((i = 0; i < 50000; i++)); do
    xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done

kill $loop_pid &> /dev/null
wait

umount $DEV

$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)

Note that since this is a short time window where the race can happen, the reproducer may not be able to always trigger the bug in one run, or it may trigger it multiple times.

-> Case 2

If we do a buffered write against a file region that does not have any allocated extents, like a hole or beyond EOF, then during ordered extent completion we have a short time window where a concurrent stat(2) syscall can report a number of used blocks that does not correspond to the value before or after the write operation, a value that is actually larger than the value after the write completes. This happens because once we start a buffered write into an unallocated file range we increment the inode's 'new_delalloc_bytes', to make sure any stat(2) call gets a correct used blocks value before delalloc is flushed and completes. However, at ordered extent completion, after we insert the new extent, we increment the inode's number of bytes used by the size of the new extent, and only later, when clearing the range in the inode's iotree, do we decrement the inode's 'new_delalloc_bytes' counter by the size of the extent. So this results in a short time window where a concurrent stat(2) syscall can report a number of used blocks that accounts for the new extent twice.

Example reproducer:

$ cat reproducer-2.sh
#!/bin/bash

MNT=/mnt/sdi
DEV=/dev/sdi

stat_loop()
{
    trap "wait; exit" SIGTERM
    local filepath=$1
    local expected=$2
    local got

    while :; do
        got=$(stat -c %b $filepath)
        if [ $got -ne $expected ]; then
            echo -n "ERROR: unexpected used blocks"
            echo " (got: $got expected: $expected)"
        fi
    done
}

mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT

touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
    offset=$(($i * $write_size))
    xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
    blocks_used=$(stat -c %b $MNT/foobar)

    # Fsync the file to trigger writeback and keep calling stat(2) on it
    # to see if the number of blocks used changes.
    stat_loop $MNT/foobar $blocks_used &
    loop_pid=$!
    xfs_io -c "fsync" $MNT/foobar

    kill $loop_pid &> /dev/null
    wait $loop_pid
done

umount $DEV

$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the reproducer may not be able to always trigger the bug in one run, or it may trigger it multiple times.

-> Case 3

Another case where such problems happen is during other operations that replace extents in a file range with other extents. Those operations are extent cloning, deduplication and fallocate's zero range operation. The cause of the problem is similar to the first case. When we drop the extents from a range, we decrement the inode's number of bytes, and later on, after inserting the new extents, we increment it. Since this is not done atomically, a concurrent stat(2) call can see and return a number of used blocks that is smaller than it should be and matches neither the number of used blocks before nor after the clone/deduplication/zero operation. Like for the first case, when doing a clone, deduplication or zero range operation against an entire file, we end up having a time window where we can report 0 used blocks to a stat(2) call.

Example reproducer:

$ cat reproducer-3.sh
#!/bin/bash

MNT=/mnt/sdi
DEV=/dev/sdi

mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT

extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))

# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
    > /dev/null
# File bar has far fewer extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null

expected=$(stat -c %b $MNT/foo)

# Now deduplicate bar into foo. While the deduplication is in progress,
# the number of used blocks/file size reported by stat should not change.
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
    used=$(stat -c %b $MNT/foo)
    if [ $used -ne $expected ]; then
        echo "Unexpected blocks used: $used (expected: $expected)"
    fi
done

umount $DEV

$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)

Note that since this is a short time window where the race can happen, the reproducer may not be able to always trigger the bug in one run, or it may trigger it multiple times.

So fix this by:

1) Making btrfs_drop_extents() not decrement the VFS inode's number of bytes, and instead return the number of bytes;

2) Making any code that drops extents and adds new extents update the inode's number of bytes atomically, while holding the btrfs inode's spinlock, which is also used by the stat(2) callback to get the inode's number of bytes;

3) For ranges in the inode's iotree that are marked as 'delalloc new', corresponding to previously unallocated ranges, increment the inode's number of bytes when clearing the 'delalloc new' bit from the range, in the same critical section that decrements the inode's 'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.

An alternative would be to have btrfs_getattr() wait for any IO (ordered extents in progress) and lock the whole range (0 to (u64)-1) while it computes the number of blocks used. But that would mean blocking stat(2), which is a heavily used syscall that is expected to be fast, waiting for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
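The heart of approach 2) is a small helper added by this patch (its full context appears in the inode.c hunk further below): both the subtraction for dropped extents and the addition for new ones happen inside the same inode->lock critical section that the stat(2) callback takes, so no intermediate state is ever visible:

    void btrfs_update_inode_bytes(struct btrfs_inode *inode,
                                  const u64 add_bytes,
                                  const u64 del_bytes)
    {
            if (add_bytes == del_bytes)
                    return;

            spin_lock(&inode->lock);
            if (del_bytes > 0)
                    inode_sub_bytes(&inode->vfs_inode, del_bytes);
            if (add_bytes > 0)
                    inode_add_bytes(&inode->vfs_inode, add_bytes);
            spin_unlock(&inode->lock);
    }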
CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +- fs/btrfs/ctree.h | 8 +++ fs/btrfs/extent-io-tree.h | 16 +++++- fs/btrfs/extent_io.c | 10 ++-- fs/btrfs/file.c | 43 +++++++++----- fs/btrfs/inode.c | 140 +++++++++++++++++++++++++++++++++++++--------- fs/btrfs/reflink.c | 2 +- fs/btrfs/tree-log.c | 4 +- 8 files changed, 176 insertions(+), 50 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8494f62f8aa4..b4c09a12659c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -50,7 +50,8 @@ struct btrfs_inode { /* * Lock for counters and all fields used to determine if the inode is in * the log or not (last_trans, last_sub_trans, last_log_commit, - * logged_trans). + * logged_trans), to access/update new_delalloc_bytes and to update the + * VFS' inode number of bytes used. */ spinlock_t lock; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3071b0eccd82..228c5df4b17f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1274,6 +1274,11 @@ struct btrfs_drop_extents_args { * set even if btrfs_drop_extents() returns an error. */ u64 drop_end; + /* + * The number of allocated bytes found in the range. This can be smaller + * than the range's length when there are holes in the range. + */ + u64 bytes_found; /* * Only set if 'replace_extent' is true. Set to true if we were able * to insert a replacement extent after dropping all extents in the @@ -3142,6 +3147,9 @@ extern const struct iomap_dio_ops btrfs_dio_ops; int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); +void btrfs_update_inode_bytes(struct btrfs_inode *inode, + const u64 add_bytes, + const u64 del_bytes); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index cab4273ff8d3..5334b3772f18 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -21,10 +21,24 @@ struct io_failure_record; #define EXTENT_NORESERVE (1U << 11) #define EXTENT_QGROUP_RESERVED (1U << 12) #define EXTENT_CLEAR_DATA_RESV (1U << 13) +/* + * Must be cleared only during ordered extent completion or on error paths if we + * did not manage to submit bios and create the ordered extents for the range. + * Should not be cleared during page release and page invalidation (if there is + * an ordered extent in flight), that is left for the ordered extent completion. + */ #define EXTENT_DELALLOC_NEW (1U << 14) +/* + * When an ordered extent successfully completes for a region marked as a new + * delalloc range, use this flag when clearing a new delalloc range to indicate + * that the VFS' inode number of bytes should be incremented and the inode's new + * delalloc bytes decremented, in an atomic way to prevent races with stat(2). 
+ */ +#define EXTENT_ADD_INODE_BYTES (1U << 15) #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \ + EXTENT_ADD_INODE_BYTES) /* * Redefined bits above which are used only in the device allocation tree, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ecca6d6ec90a..3bbb3bdd395b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4423,12 +4423,14 @@ static int try_release_extent_state(struct extent_io_tree *tree, ret = 0; } else { /* - * at this point we can safely clear everything except the - * locked bit and the nodatasum bit + * At this point we can safely clear everything except the + * locked bit, the nodatasum bit and the delalloc new bit. + * The delalloc new bit will be cleared by ordered extent + * completion. */ ret = __clear_extent_bit(tree, start, end, - ~(EXTENT_LOCKED | EXTENT_NODATASUM), - 0, 0, NULL, mask, NULL); + ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW), + 0, 0, NULL, mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1648b6bfa2e7..8a9056b6e2ad 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -677,6 +677,12 @@ next: * If an extent intersects the range but is not entirely inside the range * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. + * + * Note: the VFS' inode number of bytes is not updated, it's up to the caller + * to deal with that. We set the field 'bytes_found' of the arguments structure + * with the number of allocated bytes found in the target range, so that the + * caller can update the inode's number of bytes in an atomic way when + * replacing extents in a range to avoid races with stat(2). 
*/ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, @@ -688,7 +694,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; - struct inode *vfs_inode = &inode->vfs_inode; u64 ino = btrfs_ino(inode); u64 search_start = args->start; u64 disk_bytenr = 0; @@ -707,6 +712,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, int leafs_visited = 0; struct btrfs_path *path = args->path; + args->bytes_found = 0; args->extent_inserted = false; /* Must always have a path if ->replace_extent is true */ @@ -894,8 +900,7 @@ next_slot: extent_end - args->end); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(vfs_inode, - args->end - key.offset); + args->bytes_found += args->end - key.offset; break; } @@ -915,8 +920,7 @@ next_slot: args->start - key.offset); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(vfs_inode, - extent_end - args->start); + args->bytes_found += extent_end - args->start; if (args->end == extent_end) break; @@ -940,8 +944,7 @@ delete_extent_item: if (update_refs && extent_type == BTRFS_FILE_EXTENT_INLINE) { - inode_sub_bytes(vfs_inode, - extent_end - key.offset); + args->bytes_found += extent_end - key.offset; extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { @@ -954,8 +957,7 @@ delete_extent_item: key.offset - extent_offset); ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); /* -ENOMEM */ - inode_sub_bytes(vfs_inode, - extent_end - key.offset); + args->bytes_found += extent_end - key.offset; } if (args->end == extent_end) @@ -2517,7 +2519,8 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, struct btrfs_replace_extent_info *extent_info, - const u64 replace_len) + const u64 replace_len, + const u64 bytes_to_drop) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2532,8 +2535,10 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, return 0; if (extent_info->disk_offset == 0 && - btrfs_fs_incompat(fs_info, NO_HOLES)) + btrfs_fs_incompat(fs_info, NO_HOLES)) { + btrfs_update_inode_bytes(BTRFS_I(inode), 0, bytes_to_drop); return 0; + } key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_EXTENT_DATA_KEY; @@ -2562,10 +2567,12 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, return ret; /* If it's a hole, nothing more needs to be done. 
*/ - if (extent_info->disk_offset == 0) + if (extent_info->disk_offset == 0) { + btrfs_update_inode_bytes(BTRFS_I(inode), 0, bytes_to_drop); return 0; + } - inode_add_bytes(inode, replace_len); + btrfs_update_inode_bytes(BTRFS_I(inode), replace_len, bytes_to_drop); if (extent_info->is_new_extent && extent_info->insertions == 0) { key.objectid = extent_info->disk_offset; @@ -2660,6 +2667,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, while (cur_offset < end) { drop_args.start = cur_offset; ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + /* If we are punching a hole decrement the inode's byte count */ + if (!extent_info) + btrfs_update_inode_bytes(BTRFS_I(inode), 0, + drop_args.bytes_found); if (ret != -ENOSPC) { /* * When cloning we want to avoid transaction aborts when @@ -2717,7 +2728,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, extent_info->file_offset; ret = btrfs_insert_replace_extent(trans, inode, path, - extent_info, replace_len); + extent_info, replace_len, + drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); break; @@ -2814,7 +2826,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } if (extent_info) { ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, - extent_info->data_len); + extent_info->data_len, + drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 25764de68b92..2db11ab4ecbf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -223,8 +223,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, if (compressed_size && compressed_pages) cur_size = compressed_size; - inode_add_bytes(inode, size); - if (!extent_inserted) { struct btrfs_key key; size_t datasize; @@ -299,8 +297,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, * could end up racing with unlink. 
*/ BTRFS_I(inode)->disk_i_size = inode->i_size; - ret = btrfs_update_inode(trans, root, inode); - fail: return ret; } @@ -385,6 +381,16 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, goto out; } + btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found); + ret = btrfs_update_inode(trans, root, &inode->vfs_inode); + if (ret && ret != -ENOSPC) { + btrfs_abort_transaction(trans, ret); + goto out; + } else if (ret == -ENOSPC) { + ret = 1; + goto out; + } + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: @@ -2144,6 +2150,8 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; + if (*bits & EXTENT_ADD_INODE_BYTES) + inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } } @@ -2561,9 +2569,11 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 file_pos, struct btrfs_file_extent_item *stack_fi, + const bool update_inode_bytes, u64 qgroup_reserved) { struct btrfs_root *root = inode->root; + const u64 sectorsize = root->fs_info->sectorsize; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; @@ -2615,7 +2625,24 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - inode_add_bytes(&inode->vfs_inode, num_bytes); + /* + * If we dropped an inline extent here, we know the range it covered + * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the + * number of bytes only for the range containing the inline extent. + * The remainder of the range will be processed when clearing the + * EXTENT_DELALLOC bit through the ordered extent completion. + */ + if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { + u64 inline_size = round_down(drop_args.bytes_found, sectorsize); + + inline_size = drop_args.bytes_found - inline_size; + btrfs_update_inode_bytes(inode, sectorsize, inline_size); + drop_args.bytes_found -= inline_size; + num_bytes -= sectorsize; + } + + if (update_inode_bytes) + btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; @@ -2653,6 +2680,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_file_extent_item stack_fi; u64 logical_len; + bool update_inode_bytes; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); @@ -2668,9 +2696,18 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ + /* + * For delalloc, when completing an ordered extent we update the inode's + * bytes when clearing the range in the inode's io tree, so pass false + * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), + * except if the ordered extent was truncated.
+ */ + update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || + test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); + return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), oe->file_offset, &stack_fi, - oe->qgroup_rsv); + update_inode_bytes, oe->qgroup_rsv); } /* @@ -2692,10 +2729,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) u64 logical_len = ordered_extent->num_bytes; bool freespace_inode; bool truncated = false; - bool range_locked = false; - bool clear_new_delalloc_bytes = false; bool clear_reserved_extent = true; - unsigned int clear_bits; + unsigned int clear_bits = EXTENT_DEFRAG; start = ordered_extent->file_offset; end = start + ordered_extent->num_bytes - 1; @@ -2703,7 +2738,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) - clear_new_delalloc_bytes = true; + clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); @@ -2742,7 +2777,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - range_locked = true; + clear_bits |= EXTENT_LOCKED; lock_extent_bits(io_tree, start, end, &cached_state); if (freespace_inode) @@ -2789,6 +2824,17 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + /* + * If this is a new delalloc range, clear its new delalloc flag to + * update the inode's number of bytes. This needs to be done first + * before updating the inode item. + */ + if ((clear_bits & EXTENT_DELALLOC_NEW) && + !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, + EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, + 0, 0, &cached_state); + btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ @@ -2797,11 +2843,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } ret = 0; out: - clear_bits = EXTENT_DEFRAG; - if (range_locked) - clear_bits |= EXTENT_LOCKED; - if (clear_new_delalloc_bytes) - clear_bits |= EXTENT_DELALLOC_NEW; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, (clear_bits & EXTENT_LOCKED) ? 
1 : 0, 0, &cached_state); @@ -4790,10 +4831,12 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)), offset, 0, 0, len, 0, len, 0, 0, 0); - if (ret) + if (ret) { btrfs_abort_transaction(trans, ret); - else + } else { + btrfs_update_inode_bytes(BTRFS_I(inode), 0, drop_args.bytes_found); btrfs_update_inode(trans, root, inode); + } btrfs_end_transaction(trans); return ret; } @@ -8117,6 +8160,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, u64 start; u64 end; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; + bool found_ordered = false; + bool completed_ordered = false; /* * we have the page locked, so new writeback can't start, @@ -8138,15 +8183,17 @@ again: start = page_start; ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { + found_ordered = true; end = min(page_end, ordered->file_offset + ordered->num_bytes - 1); /* - * IO on this page will never be started, so we need - * to account for any ordered extents now + * IO on this page will never be started, so we need to account + * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW + * here; that must be left to the ordered extent completion. */ if (!inode_evicting) clear_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); /* @@ -8168,8 +8215,10 @@ again: if (btrfs_dec_test_ordered_pending(inode, &ordered, start, - end - start + 1, 1)) + end - start + 1, 1)) { btrfs_finish_ordered_io(ordered); + completed_ordered = true; + } } btrfs_put_ordered_extent(ordered); if (!inode_evicting) { @@ -8198,10 +8247,23 @@ again: */ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { + bool delete = true; + + /* + * If there's an ordered extent for this range and we have not + * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set + * in the range for the ordered extent completion. We must also + * not delete the range, otherwise we would lose that bit (and + * any other bits set in the range). Make sure EXTENT_UPTODATE + * is cleared if we don't delete, otherwise it can lead to + * corruptions if the i_size is extended later.
+ */ + if (found_ordered && !completed_ordered) + delete = false; clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, - &cached_state); + EXTENT_DELALLOC | EXTENT_UPTODATE | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, + delete, &cached_state); __btrfs_releasepage(page, GFP_NOFS); } @@ -8750,6 +8812,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { u64 delalloc_bytes; + u64 inode_bytes; struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; u32 bi_flags = BTRFS_I(inode)->flags; @@ -8776,8 +8839,9 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat, spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; + inode_bytes = inode_get_bytes(inode); spin_unlock(&BTRFS_I(inode)->lock); - stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + stat->blocks = (ALIGN(inode_bytes, blocksize) + ALIGN(delalloc_bytes, blocksize)) >> 9; return 0; } @@ -9586,7 +9650,8 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( if (trans) { ret = insert_reserved_file_extent(trans, BTRFS_I(inode), - file_offset, &stack_fi, ret); + file_offset, &stack_fi, + true, ret); if (ret) return ERR_PTR(ret); return trans; @@ -10202,6 +10267,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, } #endif +/* + * Update the number of bytes used in the VFS' inode. When we replace extents in + * a range (clone, dedupe, fallocate's zero range), we must update the number of + * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls + * always get a correct value. + */ +void btrfs_update_inode_bytes(struct btrfs_inode *inode, + const u64 add_bytes, + const u64 del_bytes) +{ + if (add_bytes == del_bytes) + return; + + spin_lock(&inode->lock); + if (del_bytes > 0) + inode_sub_bytes(&inode->vfs_inode, del_bytes); + if (add_bytes > 0) + inode_add_bytes(&inode->vfs_inode, add_bytes); + spin_unlock(&inode->lock); +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 67728ea3ed47..4bbc5f52b752 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -268,7 +268,7 @@ copy_inline_extent: btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); - inode_add_bytes(dst, datal); + btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); out: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 89ff063cae24..932a74a236eb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -832,8 +832,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (ret) goto out; - inode_add_bytes(inode, nbytes); update_inode: + btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, root, inode); out: if (inode) @@ -2598,6 +2598,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, BTRFS_I(inode), &drop_args); if (!ret) { + inode_sub_bytes(inode, + drop_args.bytes_found); /* Update the inode's nbytes. 
*/ ret = btrfs_update_inode(wc->trans, root, inode); -- cgit v1.2.3-59-g8ed1b From 507433985caf668ca6205570c84520913455966c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:48:55 +0200 Subject: btrfs: make btrfs_truncate_inode_items take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 43 +++++++++++++++++++++++-------------------- fs/btrfs/tree-log.c | 5 ++--- 4 files changed, 27 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 560cbc46e31d..ccc2339c6568 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3072,7 +3072,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *inode, u64 new_size, + struct btrfs_inode *inode, u64 new_size, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 58bd2d3e54db..d8010df20b58 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -273,7 +273,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, * We skip the throttling logic for free space cache inodes, so we don't * need to check for -EAGAIN. */ - ret = btrfs_truncate_inode_items(trans, root, inode, + ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 0, BTRFS_EXTENT_DATA_KEY); if (ret) goto fail; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1d44e63160b4..3dd0b472501e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4299,7 +4299,7 @@ out: */ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *inode, + struct btrfs_inode *inode, u64 new_size, u32 min_type) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4320,7 +4320,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int pending_del_slot = 0; int extent_type = -1; int ret; - u64 ino = btrfs_ino(BTRFS_I(inode)); + u64 ino = btrfs_ino(inode); u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; @@ -4334,7 +4334,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * off from time to time. This means all inodes in subvolume roots, * reloc roots, and data reloc roots. */ - if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && + if (!btrfs_is_free_space_inode(inode) && test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) be_nice = true; @@ -4344,7 +4344,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path->reada = READA_BACK; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + lock_extent_bits(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* @@ -4352,7 +4352,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * new size is not block aligned since we will be keeping the * last block of the extent just the way it is. */ - btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size, + btrfs_drop_extent_cache(inode, ALIGN(new_size, fs_info->sectorsize), (u64)-1, 0); } @@ -4363,8 +4363,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * it is used to drop the logged items. So we shouldn't kill the delayed * items. 
*/ - if (min_type == 0 && root == BTRFS_I(inode)->root) - btrfs_kill_delayed_inode_items(BTRFS_I(inode)); + if (min_type == 0 && root == inode->root) + btrfs_kill_delayed_inode_items(inode); key.objectid = ino; key.offset = (u64)-1; @@ -4420,14 +4420,13 @@ search_again: btrfs_file_extent_num_bytes(leaf, fi); trace_btrfs_truncate_show_fi_regular( - BTRFS_I(inode), leaf, fi, - found_key.offset); + inode, leaf, fi, found_key.offset); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_ram_bytes(leaf, fi); trace_btrfs_truncate_show_fi_inline( - BTRFS_I(inode), leaf, fi, path->slots[0], + inode, leaf, fi, path->slots[0], found_key.offset); } item_end--; @@ -4466,7 +4465,8 @@ search_again: if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && extent_start != 0) - inode_sub_bytes(inode, num_dec); + inode_sub_bytes(&inode->vfs_inode, + num_dec); btrfs_mark_buffer_dirty(leaf); } else { extent_num_bytes = @@ -4481,7 +4481,8 @@ search_again: found_extent = 1; if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - inode_sub_bytes(inode, num_dec); + inode_sub_bytes(&inode->vfs_inode, + num_dec); } } clear_len = num_dec; @@ -4516,7 +4517,8 @@ search_again: } if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - inode_sub_bytes(inode, item_end + 1 - new_size); + inode_sub_bytes(&inode->vfs_inode, + item_end + 1 - new_size); } delete: /* @@ -4524,8 +4526,8 @@ delete: * multiple fsyncs, and in this case we don't want to clear the * file extent range because it's just the log. */ - if (root == BTRFS_I(inode)->root) { - ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + if (root == inode->root) { + ret = btrfs_inode_clear_file_extent_range(inode, clear_start, clear_len); if (ret) { btrfs_abort_transaction(trans, ret); @@ -4633,9 +4635,9 @@ out: ASSERT(last_size >= new_size); if (!ret && last_size > new_size) last_size = new_size; - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), last_size); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, - (u64)-1, &cached_state); + btrfs_inode_safe_disk_i_size_write(inode, last_size); + unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1, + &cached_state); } btrfs_free_path(path); @@ -5268,7 +5270,8 @@ void btrfs_evict_inode(struct inode *inode) trans->block_rsv = rsv; - ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), + 0, 0); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -8512,7 +8515,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) trans->block_rsv = rsv; while (1) { - ret = btrfs_truncate_inode_items(trans, root, inode, + ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), inode->i_size, BTRFS_EXTENT_DATA_KEY); trans->block_rsv = &fs_info->trans_block_rsv; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 932a74a236eb..360732a93980 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4378,8 +4378,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, do { ret = btrfs_truncate_inode_items(trans, root->log_root, - &inode->vfs_inode, - truncate_offset, + inode, truncate_offset, BTRFS_EXTENT_DATA_KEY); } while (ret == -EAGAIN); if (ret) @@ -5306,7 +5305,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &inode->runtime_flags); while(1) { ret = btrfs_truncate_inode_items(trans, - log, &inode->vfs_inode, 0, 0); + log, inode, 0, 0); if (ret != -EAGAIN) break; } -- cgit v1.2.3-59-g8ed1b From 
9a56fcd15a9c6b580a21c439deab60bb4cd2cfd9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Nov 2020 16:48:59 +0200 Subject: btrfs: make btrfs_update_inode take btrfs_inode Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/ctree.h | 3 +-- fs/btrfs/file.c | 8 +++---- fs/btrfs/free-space-cache.c | 6 ++--- fs/btrfs/inode-map.c | 2 +- fs/btrfs/inode.c | 57 +++++++++++++++++++++++---------------------- fs/btrfs/ioctl.c | 6 ++--- fs/btrfs/reflink.c | 2 +- fs/btrfs/tree-log.c | 12 +++++----- fs/btrfs/xattr.c | 4 ++-- 10 files changed, 51 insertions(+), 51 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 35be6dbca5e8..ccc3271c20ca 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2405,7 +2405,7 @@ again: * time. */ BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) { /* * So theoretically we could recover from this, simply set the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ccc2339c6568..ea443d3215d0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3112,8 +3112,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 end); int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); + struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 096be647c234..39beab4c6d74 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2741,7 +2741,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, cur_offset = drop_args.drop_end; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) break; @@ -2978,7 +2978,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); inode->i_mtime = inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -3005,7 +3005,7 @@ out_only_mutex: } else { int ret2; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); if (!ret) ret = ret2; @@ -3074,7 +3074,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode, inode->i_ctime = current_time(inode); i_size_write(inode, end); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); return ret ? 
ret : ret2; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d8010df20b58..572c75d2169b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -278,7 +278,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, if (ret) goto fail; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); fail: if (locked) @@ -1193,7 +1193,7 @@ out: "failed to write free space cache for block group %llu error %d", block_group->start, ret); } - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); if (block_group) { /* the dirty list is protected by the dirty_bgs_lock */ @@ -1383,7 +1383,7 @@ out: invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); if (must_iput) iput(inode); return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 76d2e43817ea..8cf39402b227 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -463,7 +463,7 @@ again: } BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); goto out_put; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1b2bb73d1d99..ed0249bdaf46 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -382,7 +382,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, } btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, &inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -3657,7 +3657,8 @@ failed: * copy everything in the in-memory inode into the btree. 
*/ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) + struct btrfs_root *root, + struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -3669,18 +3670,18 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * The data relocation inode should also be directly updated * without delay */ - if (!btrfs_is_free_space_inode(BTRFS_I(inode)) + if (!btrfs_is_free_space_inode(inode) && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); - ret = btrfs_delayed_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) - btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); + btrfs_set_inode_last_trans(trans, inode); return ret; } - return btrfs_update_inode_item(trans, root, BTRFS_I(inode)); + return btrfs_update_inode_item(trans, root, inode); } noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, @@ -3689,7 +3690,7 @@ noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, { int ret; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret == -ENOSPC) return btrfs_update_inode_item(trans, root, BTRFS_I(inode)); return ret; @@ -3799,7 +3800,7 @@ err: inode_inc_iversion(&dir->vfs_inode); inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, &dir->vfs_inode); + ret = btrfs_update_inode(trans, root, dir); out: return ret; } @@ -3813,7 +3814,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, &inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); } return ret; } @@ -4836,7 +4837,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, btrfs_abort_transaction(trans, ret); } else { btrfs_update_inode_bytes(BTRFS_I(inode), 0, drop_args.bytes_found); - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); } btrfs_end_transaction(trans); return ret; @@ -4998,7 +4999,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, newsize); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); pagecache_isize_extended(inode, oldsize, newsize); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { @@ -5910,7 +5911,7 @@ static int btrfs_dirty_inode(struct inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret && ret == -ENOSPC) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); @@ -5918,7 +5919,7 @@ static int btrfs_dirty_inode(struct inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); } btrfs_end_transaction(trans); if (BTRFS_I(inode)->delayed_node) @@ -6306,7 +6307,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, parent_inode->vfs_inode.i_mtime = now; parent_inode->vfs_inode.i_ctime = now; } - ret = 
btrfs_update_inode(trans, root, &parent_inode->vfs_inode); + ret = btrfs_update_inode(trans, root, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; @@ -6397,7 +6398,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); d_instantiate_new(dentry, inode); out_unlock: @@ -6456,7 +6457,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto out_unlock; @@ -6528,7 +6529,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } else { struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto fail; if (inode->i_nlink == 1) { @@ -6596,7 +6597,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; btrfs_i_size_write(BTRFS_I(inode), 0); - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto out_fail; @@ -8521,7 +8522,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) break; @@ -8567,7 +8568,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret2 && !ret) ret = ret2; @@ -8613,7 +8614,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, "error inheriting subvolume %llu properties: %d", new_root->root_key.objectid, err); - err = btrfs_update_inode(trans, new_root, inode); + err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); iput(inode); return err; @@ -8969,7 +8970,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) - ret = btrfs_update_inode(trans, root, old_inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -8985,7 +8986,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_dentry->d_name.name, new_dentry->d_name.len); if (!ret) - ret = btrfs_update_inode(trans, dest, new_inode); + ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9105,7 +9106,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out: unlock_new_inode(inode); if (ret) @@ -9239,7 +9240,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) - ret = btrfs_update_inode(trans, root, old_inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9599,7 +9600,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode_set_bytes(inode, name_len); btrfs_i_size_write(BTRFS_I(inode), name_len); - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, 
BTRFS_I(inode)); /* * Last step, add directory indexes for our symlink inode. This is the * last step to avoid extra cleanup of these indexes if an error happens @@ -9796,7 +9797,7 @@ next: btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); @@ -9892,7 +9893,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (ret) goto out; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) goto out; ret = btrfs_orphan_add(trans, BTRFS_I(inode)); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e7ccba83e8a5..a5dc7cc5d705 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -336,7 +336,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out_end_trans: btrfs_end_transaction(trans); @@ -479,7 +479,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); btrfs_end_transaction(trans); @@ -733,7 +733,7 @@ static noinline int create_subvol(struct inode *dir, } btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); - ret = btrfs_update_inode(trans, root, dir); + ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 952b8f33619c..be16acd87588 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -34,7 +34,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 360732a93980..955c9a36cfeb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -834,7 +834,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, update_inode: btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out: if (inode) iput(inode); @@ -1533,7 +1533,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); } ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; @@ -1708,7 +1708,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (nlink != inode->i_nlink) { set_nlink(inode, nlink); - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); } BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1814,7 +1814,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, set_nlink(inode, 1); else inc_nlink(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); } else if (ret == 
-EEXIST) { ret = 0; } else { @@ -1967,7 +1967,7 @@ out: btrfs_release_path(path); if (!ret && update_size) { btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); - ret = btrfs_update_inode(trans, root, dir); + ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); } kfree(name); iput(dir); @@ -2602,7 +2602,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, drop_args.bytes_found); /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, - root, inode); + root, BTRFS_I(inode)); } iput(inode); if (ret) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 95d9aebff2c4..f32fe9e39a74 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -239,7 +239,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); BUG_ON(ret); out: btrfs_end_transaction(trans); @@ -390,7 +390,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, if (!ret) { inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); BUG_ON(ret); } -- cgit v1.2.3-59-g8ed1b From f2f121ab500d0457cc9c6f54269d21ffdf5bd304 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 13 Nov 2020 11:21:49 +0000 Subject: btrfs: skip unnecessary searches for xattrs when logging an inode

Every time we log an inode we search the fs/subvol tree for xattrs and, if we have any, log them into the log tree. However it is very common to have inodes without any xattrs, so doing the search wastes time, but more importantly it adds contention on the fs/subvol tree locks, either making the logging code block and wait for tree locks or making other concurrent operations block and wait.

The most typical use cases where xattrs are used are when capabilities or ACLs are defined for an inode, or when SELinux is enabled.

This change makes the logging code detect when an inode does not have xattrs and skip the xattrs search the next time the inode is logged, unless the inode is evicted and loaded again or an xattr is added to the inode. Therefore we skip the search for xattrs on inodes that never have xattrs and are fsynced with some frequency.
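In essence, condensed from the tree-log.c hunk shown after the benchmark below (the "..." elides the unchanged parameters and search loop), the logging path now brackets the xattr search with a runtime flag:

    static int btrfs_log_all_xattrs(...)
    {
            bool found_xattrs = false;

            if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
                    return 0;   /* known to have no xattrs, skip the search */

            /* ... search the subvolume tree and copy any xattr items into
             * the log tree, setting found_xattrs on the first hit ... */

            if (!found_xattrs)
                    set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
            return 0;
    }

while btrfs_setxattr() clears BTRFS_INODE_NO_XATTRS whenever an xattr is added, so the shortcut never goes stale.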
The following script that calls dbench was used to measure the impact of this change on a VM with 8 CPUs, 16GiB of RAM, using a raw NVMe device directly (no intermediary filesystem on the host) and using a non-debug kernel (default configuration on Debian distributions):

$ cat test.sh
#!/bin/bash

DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd"

mkfs.btrfs -f -m single -d single $DEV
mount $MOUNT_OPTIONS $DEV $MNT

dbench -D $MNT -t 200 40

umount $MNT

The results before this change:

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX    5761605     0.172   312.057
 Close        4232452     0.002    10.927
 Rename        243937     1.406   277.344
 Unlink       1163456     0.631   298.402
 Deltree          160    11.581   221.107
 Mkdir             80     0.003     0.005
 Qpathinfo    5221410     0.065   122.309
 Qfileinfo     915432     0.001     3.333
 Qfsinfo       957555     0.003     3.992
 Sfileinfo     469244     0.023    20.494
 Find         2018865     0.448   123.659
 WriteX       2874851     0.049   118.529
 ReadX        9030579     0.004    21.654
 LockX          18754     0.003     4.423
 UnlockX        18754     0.002     0.331
 Flush         403792    10.944   359.494

Throughput 908.444 MB/sec  40 clients  40 procs  max_latency=359.500 ms

The results after this change:

 Operation      Count    AvgLat    MaxLat
 ----------------------------------------
 NTCreateX    6442521     0.159   230.693
 Close        4732357     0.002    10.972
 Rename        272809     1.293   227.398
 Unlink       1301059     0.563   218.500
 Deltree          160     7.796    54.887
 Mkdir             80     0.008     0.478
 Qpathinfo    5839452     0.047   124.330
 Qfileinfo    1023199     0.001     4.996
 Qfsinfo      1070760     0.003     5.709
 Sfileinfo     524790     0.033    21.765
 Find         2257658     0.314   125.611
 WriteX       3211520     0.040   232.135
 ReadX       10098969     0.004    25.340
 LockX          20974     0.003     1.569
 UnlockX        20974     0.002     3.475
 Flush         451553    10.287   331.037

Throughput 1011.77 MB/sec  40 clients  40 procs  max_latency=331.045 ms

+10.8% throughput, -8.2% max latency

Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 7 +++++++ fs/btrfs/tree-log.c | 8 ++++++++ fs/btrfs/xattr.c | 4 +++- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b4c09a12659c..555cbcef6585 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -35,6 +35,13 @@ enum { BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, + /* + * Set when logging an inode, to signal that the inode does not have + * xattrs, so subsequent fsyncs can avoid searching for xattrs to log. + * This bit must be cleared whenever an xattr is added + * to an inode.
+ */ + BTRFS_INODE_NO_XATTRS, }; /* in memory btrfs inode */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 955c9a36cfeb..4b8f190c39bb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4573,6 +4573,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, const u64 ino = btrfs_ino(inode); int ins_nr = 0; int start_slot = 0; + bool found_xattrs = false; + + if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) + return 0; key.objectid = ino; key.type = BTRFS_XATTR_ITEM_KEY; @@ -4611,6 +4615,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, start_slot = slot; ins_nr++; path->slots[0]++; + found_xattrs = true; cond_resched(); } if (ins_nr > 0) { @@ -4620,6 +4625,9 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, return ret; } + if (!found_xattrs) + set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); + return 0; } diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index f32fe9e39a74..af6246f36a9e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -213,9 +213,11 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, } out: btrfs_free_path(path); - if (!ret) + if (!ret) { set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags); + } return ret; } -- cgit v1.2.3-59-g8ed1b From bc5b5b1e5111005363094da1d5f5ffb0e83165f1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 13 Nov 2020 11:23:30 +0000 Subject: btrfs: stop incrementing log batch when joining log transaction When joining a log transaction we acquire the root's log mutex, then increment the root's log batch and log writers counters while holding the mutex. However we don't need to increment the log batch there, because we are holding the mutex and incremented the log writers counter as well, so any other task trying to sync log will wait for the current task to finish its logging and still achieve the desired log batching. Since the log batch counter is an atomic counter and is incremented twice at the very beginning of the fsync callback (btrfs_sync_file()), once before flushing delalloc and once again after waiting for writeback to complete, eliminating its increment when joining the log transaction may provide some performance gains in case we have multiple concurrent tasks doing fsyncs against different files in the same subvolume, as it reduces contention on the atomic (locking the cacheline and bouncing it). When testing fio with 32 jobs, on a 8 cores VM, doing fsyncs against different files of the same subvolume, on top of a zram device, I could consistently see gains (higher throughput) between 1% to 2%, which is a very low value and possibly hard to be observed with a real device (I couldn't observe consistent gains with my low/mid end NVMe device). So this change is mostly motivated to just simplify the logic, as updating the log batch counter is only relevant when an fsync starts and while not holding the root's log mutex. 
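A rough userspace model of the batching protocol described above (pthread/stdatomic stand-ins with invented names like model_root; the kernel uses atomic_t counters and its own per-root mutex):

  #include <pthread.h>
  #include <stdatomic.h>

  struct model_root {
          pthread_mutex_t log_mutex;
          atomic_int log_batch;    /* bumped by fsync before/after flushing */
          atomic_int log_writers;  /* tasks currently adding to the log */
  };

  /* Joining a log transaction: the mutex plus log_writers already give
   * the batching guarantee, so log_batch need not be touched here. */
  static void join_log_trans(struct model_root *root)
  {
          pthread_mutex_lock(&root->log_mutex);
          atomic_fetch_add(&root->log_writers, 1);
          pthread_mutex_unlock(&root->log_mutex);
  }

  /* fsync path: increments log_batch once before flushing delalloc and
   * once after waiting for writeback, which is where batching matters
   * because the root's log_mutex is not held at those points. */
  static void fsync_prepare(struct model_root *root)
  {
          atomic_fetch_add(&root->log_batch, 1);
          /* ... flush delalloc, wait for writeback ... */
          atomic_fetch_add(&root->log_batch, 1);
  }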
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4b8f190c39bb..80cf496226c2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -172,7 +172,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans, root->log_start_pid = current->pid; } - atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); if (ctx && !ctx->logging_new_name) { int index = root->log_transid % 2; -- cgit v1.2.3-59-g8ed1b From 5297199a8bca12b8b96afcbf2341605efb6005de Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 26 Nov 2020 15:10:39 +0200 Subject: btrfs: remove inode number cache feature It's been deprecated since commit b547a88ea577 ("btrfs: start deprecation of mount option inode_cache") which enumerates the reasons. A filesystem that uses the feature (mount -o inode_cache) tracks the inode numbers in bitmaps, that data stay on the filesystem after this patch. The size is roughly 5MiB for 1M inodes [1], which is considered small enough to be left there. Removal of the change can be implemented in btrfs-progs if needed. [1] https://lore.kernel.org/linux-btrfs/20201127145836.GZ6430@twin.jikos.cz/ Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 15 +- fs/btrfs/disk-io.c | 23 -- fs/btrfs/free-space-cache.c | 177 --------------- fs/btrfs/free-space-cache.h | 12 - fs/btrfs/inode-map.c | 527 -------------------------------------------- fs/btrfs/inode-map.h | 13 -- fs/btrfs/inode.c | 11 - fs/btrfs/ioctl.c | 1 - fs/btrfs/relocation.c | 1 - fs/btrfs/super.c | 13 +- fs/btrfs/transaction.c | 19 -- fs/btrfs/tree-log.c | 1 - 13 files changed, 6 insertions(+), 809 deletions(-) delete mode 100644 fs/btrfs/inode-map.c delete mode 100644 fs/btrfs/inode-map.h (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 0497fdc37f90..9f1b1a88e317 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ - file-item.o inode-item.o inode-map.o disk-io.o \ + file-item.o inode-item.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7fe74f6a2486..a0c588e5fc89 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1077,15 +1077,6 @@ struct btrfs_root { spinlock_t accounting_lock; struct btrfs_block_rsv *block_rsv; - /* free ino cache stuff */ - struct btrfs_free_space_ctl *free_ino_ctl; - enum btrfs_caching_type ino_cache_state; - spinlock_t ino_cache_lock; - wait_queue_head_t ino_cache_wait; - struct btrfs_free_space_ctl *free_ino_pinned; - u64 ino_cache_progress; - struct inode *ino_cache_inode; - struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; @@ -1359,7 +1350,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) -#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) +/* bit 17 is free */ #define BTRFS_MOUNT_USEBACKUPROOT (1 << 18) #define 
BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) @@ -1406,9 +1397,7 @@ do { \ * transaction commit) */ -#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0) -#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1) -#define BTRFS_PENDING_COMMIT (2) +#define BTRFS_PENDING_COMMIT (0) #define btrfs_test_pending(info, opt) \ test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3b456c3331a2..2d8bcd075c77 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" @@ -1336,14 +1335,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) int ret; unsigned int nofs_flag; - root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), - GFP_NOFS); - if (!root->free_ino_pinned || !root->free_ino_ctl) { - ret = -ENOMEM; - goto fail; - } - /* * We might be called under a transaction (e.g. indirect backref * resolution) which could deadlock if it triggers memory reclaim @@ -1360,10 +1351,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) btrfs_check_and_init_root_item(&root->root_item); } - btrfs_init_free_ino_ctl(root); - spin_lock_init(&root->ino_cache_lock); - init_waitqueue_head(&root->ino_cache_wait); - /* * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M @@ -2033,8 +2020,6 @@ void btrfs_put_root(struct btrfs_root *root) free_anon_bdev(root->anon_dev); btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); #ifdef CONFIG_BTRFS_DEBUG spin_lock(&root->fs_info->fs_roots_radix_lock); list_del_init(&root->leak_list); @@ -3988,14 +3973,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, } } - if (root->free_ino_pinned) - __btrfs_remove_free_space_cache(root->free_ino_pinned); - if (root->free_ino_ctl) - __btrfs_remove_free_space_cache(root->free_ino_ctl); - if (root->ino_cache_inode) { - iput(root->ino_cache_inode); - root->ino_cache_inode = NULL; - } if (drop_ref) btrfs_put_root(root); } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 572c75d2169b..7d1ceeeb3067 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -16,7 +16,6 @@ #include "transaction.h" #include "disk-io.h" #include "extent_io.h" -#include "inode-map.h" #include "volumes.h" #include "space-info.h" #include "delalloc-space.h" @@ -37,10 +36,6 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); -static int btrfs_wait_cache_io_root(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_io_ctl *io_ctl, - struct btrfs_path *path); static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes, bool for_alloc); @@ -1222,14 +1217,6 @@ out: } -static int btrfs_wait_cache_io_root(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_io_ctl *io_ctl, - struct btrfs_path *path) -{ - return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0); -} - int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, 
struct btrfs_block_group *block_group, struct btrfs_path *path) @@ -3807,170 +3794,6 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, return ret; } -/* - * Find the left-most item in the cache tree, and then return the - * smallest inode number in the item. - * - * Note: the returned inode number may not be the smallest one in - * the tree, if the left-most item is a bitmap. - */ -u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) -{ - struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; - struct btrfs_free_space *entry = NULL; - u64 ino = 0; - - spin_lock(&ctl->tree_lock); - - if (RB_EMPTY_ROOT(&ctl->free_space_offset)) - goto out; - - entry = rb_entry(rb_first(&ctl->free_space_offset), - struct btrfs_free_space, offset_index); - - if (!entry->bitmap) { - ino = entry->offset; - - unlink_free_space(ctl, entry); - entry->offset++; - entry->bytes--; - if (!entry->bytes) - kmem_cache_free(btrfs_free_space_cachep, entry); - else - link_free_space(ctl, entry); - } else { - u64 offset = 0; - u64 count = 1; - int ret; - - ret = search_bitmap(ctl, entry, &offset, &count, true); - /* Logic error; Should be empty if it can't find anything */ - ASSERT(!ret); - - ino = offset; - bitmap_clear_bits(ctl, entry, offset, 1); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - } -out: - spin_unlock(&ctl->tree_lock); - - return ino; -} - -struct inode *lookup_free_ino_inode(struct btrfs_root *root, - struct btrfs_path *path) -{ - struct inode *inode = NULL; - - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_inode) - inode = igrab(root->ino_cache_inode); - spin_unlock(&root->ino_cache_lock); - if (inode) - return inode; - - inode = __lookup_free_space_inode(root, path, 0); - if (IS_ERR(inode)) - return inode; - - spin_lock(&root->ino_cache_lock); - if (!btrfs_fs_closing(root->fs_info)) - root->ino_cache_inode = igrab(inode); - spin_unlock(&root->ino_cache_lock); - - return inode; -} - -int create_free_ino_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path) -{ - return __create_free_space_inode(root, trans, path, - BTRFS_FREE_INO_OBJECTID, 0); -} - -int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_path *path; - struct inode *inode; - int ret = 0; - u64 root_gen = btrfs_root_generation(&root->root_item); - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; - - /* - * If we're unmounting then just return, since this does a search on the - * normal root and not the commit root and we could deadlock. 
- */ - if (btrfs_fs_closing(fs_info)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return 0; - - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode)) - goto out; - - if (root_gen != BTRFS_I(inode)->generation) - goto out_put; - - ret = __load_free_space_cache(root, inode, ctl, path, 0); - - if (ret < 0) - btrfs_err(fs_info, - "failed to load free ino cache for root %llu", - root->root_key.objectid); -out_put: - iput(inode); -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_write_out_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - int ret; - struct btrfs_io_ctl io_ctl; - bool release_metadata = true; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; - - memset(&io_ctl, 0, sizeof(io_ctl)); - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans); - if (!ret) { - /* - * At this point writepages() didn't error out, so our metadata - * reservation is released when the writeback finishes, at - * inode.c:btrfs_finish_ordered_io(), regardless of it finishing - * with or without an error. - */ - release_metadata = false; - ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path); - } - - if (ret) { - if (release_metadata) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - inode->i_size, true); - btrfs_debug(fs_info, - "failed to write free ino cache for root %llu error %d", - root->root_key.objectid, ret); - } - - return ret; -} - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * Use this if you need to make a bitmap or extent entry specifically, it diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index bf8d127d2407..66a87242690e 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -97,17 +97,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, int btrfs_write_out_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); -struct inode *lookup_free_ino_inode(struct btrfs_root *root, - struct btrfs_path *path); -int create_free_ino_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path); -int load_free_ino_cache(struct btrfs_fs_info *fs_info, - struct btrfs_root *root); -int btrfs_write_out_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode); void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); @@ -127,7 +116,6 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group); u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size); -u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); void btrfs_dump_free_space(struct btrfs_block_group *block_group, u64 bytes); int btrfs_find_space_cluster(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c deleted file mode 100644 index 2bc342fc5d7e..000000000000 --- a/fs/btrfs/inode-map.c +++ /dev/null @@ -1,527 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Oracle. All rights reserved. 
- */ - -#include -#include - -#include "ctree.h" -#include "disk-io.h" -#include "free-space-cache.h" -#include "inode-map.h" -#include "transaction.h" -#include "delalloc-space.h" - -static void fail_caching_thread(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - - btrfs_warn(fs_info, "failed to start inode caching task"); - btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE, - "disabling inode map caching"); - spin_lock(&root->ino_cache_lock); - root->ino_cache_state = BTRFS_CACHE_ERROR; - spin_unlock(&root->ino_cache_lock); - wake_up(&root->ino_cache_wait); -} - -static int caching_kthread(void *data) -{ - struct btrfs_root *root = data; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_key key; - struct btrfs_path *path; - struct extent_buffer *leaf; - u64 last = (u64)-1; - int slot; - int ret; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; - - path = btrfs_alloc_path(); - if (!path) { - fail_caching_thread(root); - return -ENOMEM; - } - - /* Since the commit root is read-only, we can safely skip locking. */ - path->skip_locking = 1; - path->search_commit_root = 1; - path->reada = READA_FORWARD; - - key.objectid = BTRFS_FIRST_FREE_OBJECTID; - key.offset = 0; - key.type = BTRFS_INODE_ITEM_KEY; -again: - /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->commit_root_sem); - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - if (btrfs_fs_closing(fs_info)) - goto out; - - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - - if (need_resched() || - btrfs_transaction_in_commit(fs_info)) { - leaf = path->nodes[0]; - - if (WARN_ON(btrfs_header_nritems(leaf) == 0)) - break; - - /* - * Save the key so we can advances forward - * in the next search. 
- */ - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(path); - root->ino_cache_progress = last; - up_read(&fs_info->commit_root_sem); - schedule_timeout(1); - goto again; - } else - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, slot); - - if (key.type != BTRFS_INODE_ITEM_KEY) - goto next; - - if (key.objectid >= root->highest_objectid) - break; - - if (last != (u64)-1 && last + 1 != key.objectid) { - __btrfs_add_free_space(fs_info, ctl, last + 1, - key.objectid - last - 1, 0); - wake_up(&root->ino_cache_wait); - } - - last = key.objectid; -next: - path->slots[0]++; - } - - if (last < root->highest_objectid - 1) { - __btrfs_add_free_space(fs_info, ctl, last + 1, - root->highest_objectid - last - 1, 0); - } - - spin_lock(&root->ino_cache_lock); - root->ino_cache_state = BTRFS_CACHE_FINISHED; - spin_unlock(&root->ino_cache_lock); - - root->ino_cache_progress = (u64)-1; - btrfs_unpin_free_ino(root); -out: - wake_up(&root->ino_cache_wait); - up_read(&fs_info->commit_root_sem); - - btrfs_free_path(path); - - return ret; -} - -static void start_caching(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct task_struct *tsk; - int ret; - u64 objectid; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return; - - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_state != BTRFS_CACHE_NO) { - spin_unlock(&root->ino_cache_lock); - return; - } - - root->ino_cache_state = BTRFS_CACHE_STARTED; - spin_unlock(&root->ino_cache_lock); - - ret = load_free_ino_cache(fs_info, root); - if (ret == 1) { - spin_lock(&root->ino_cache_lock); - root->ino_cache_state = BTRFS_CACHE_FINISHED; - spin_unlock(&root->ino_cache_lock); - wake_up(&root->ino_cache_wait); - return; - } - - /* - * It can be quite time-consuming to fill the cache by searching - * through the extent tree, and this can keep ino allocation path - * waiting. Therefore at start we quickly find out the highest - * inode number and we know we can use inode numbers which fall in - * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID]. 
- */ - ret = btrfs_find_free_objectid(root, &objectid); - if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { - __btrfs_add_free_space(fs_info, ctl, objectid, - BTRFS_LAST_FREE_OBJECTID - objectid + 1, - 0); - wake_up(&root->ino_cache_wait); - } - - tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", - root->root_key.objectid); - if (IS_ERR(tsk)) - fail_caching_thread(root); -} - -int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) -{ - if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) - return btrfs_find_free_objectid(root, objectid); - -again: - *objectid = btrfs_find_ino_for_alloc(root); - - if (*objectid != 0) - return 0; - - start_caching(root); - - wait_event(root->ino_cache_wait, - root->ino_cache_state == BTRFS_CACHE_FINISHED || - root->ino_cache_state == BTRFS_CACHE_ERROR || - root->free_ino_ctl->free_space > 0); - - if (root->ino_cache_state == BTRFS_CACHE_FINISHED && - root->free_ino_ctl->free_space == 0) - return -ENOSPC; - else if (root->ino_cache_state == BTRFS_CACHE_ERROR) - return btrfs_find_free_objectid(root, objectid); - else - goto again; -} - -void btrfs_return_ino(struct btrfs_root *root, u64 objectid) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return; -again: - if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0); - } else { - down_write(&fs_info->commit_root_sem); - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { - spin_unlock(&root->ino_cache_lock); - up_write(&fs_info->commit_root_sem); - goto again; - } - spin_unlock(&root->ino_cache_lock); - - start_caching(root); - - __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0); - - up_write(&fs_info->commit_root_sem); - } -} - -/* - * When a transaction is committed, we'll move those inode numbers which are - * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and - * others will just be dropped, because the commit root we were searching has - * changed. - * - * Must be called with root->fs_info->commit_root_sem held - */ -void btrfs_unpin_free_ino(struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; - spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock; - struct btrfs_free_space *info; - struct rb_node *n; - u64 count; - - if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) - return; - - while (1) { - spin_lock(rbroot_lock); - n = rb_first(rbroot); - if (!n) { - spin_unlock(rbroot_lock); - break; - } - - info = rb_entry(n, struct btrfs_free_space, offset_index); - BUG_ON(info->bitmap); /* Logic error */ - - if (info->offset > root->ino_cache_progress) - count = 0; - else - count = min(root->ino_cache_progress - info->offset + 1, - info->bytes); - - rb_erase(&info->offset_index, rbroot); - spin_unlock(rbroot_lock); - if (count) - __btrfs_add_free_space(root->fs_info, ctl, - info->offset, count, 0); - kmem_cache_free(btrfs_free_space_cachep, info); - } -} - -#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space)) -#define INODES_PER_BITMAP (PAGE_SIZE * 8) - -/* - * The goal is to keep the memory used by the free_ino tree won't - * exceed the memory if we use bitmaps only. 
- */ -static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_free_space *info; - struct rb_node *n; - int max_ino; - int max_bitmaps; - - n = rb_last(&ctl->free_space_offset); - if (!n) { - ctl->extents_thresh = INIT_THRESHOLD; - return; - } - info = rb_entry(n, struct btrfs_free_space, offset_index); - - /* - * Find the maximum inode number in the filesystem. Note we - * ignore the fact that this can be a bitmap, because we are - * not doing precise calculation. - */ - max_ino = info->bytes - 1; - - max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP; - if (max_bitmaps <= ctl->total_bitmaps) { - ctl->extents_thresh = 0; - return; - } - - ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) * - PAGE_SIZE / sizeof(*info); -} - -/* - * We don't fall back to bitmap, if we are below the extents threshold - * or this chunk of inode numbers is a big one. - */ -static bool use_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - if (ctl->free_extents < ctl->extents_thresh || - info->bytes > INODES_PER_BITMAP / 10) - return false; - - return true; -} - -static const struct btrfs_free_space_op free_ino_op = { - .recalc_thresholds = recalculate_thresholds, - .use_bitmap = use_bitmap, -}; - -static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl) -{ -} - -static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - /* - * We always use extents for two reasons: - * - * - The pinned tree is only used during the process of caching - * work. - * - Make code simpler. See btrfs_unpin_free_ino(). - */ - return false; -} - -static const struct btrfs_free_space_op pinned_free_ino_op = { - .recalc_thresholds = pinned_recalc_thresholds, - .use_bitmap = pinned_use_bitmap, -}; - -void btrfs_init_free_ino_ctl(struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - - spin_lock_init(&ctl->tree_lock); - ctl->unit = 1; - ctl->start = 0; - ctl->private = NULL; - ctl->op = &free_ino_op; - INIT_LIST_HEAD(&ctl->trimming_ranges); - mutex_init(&ctl->cache_writeout_mutex); - - /* - * Initially we allow to use 16K of ram to cache chunks of - * inode numbers before we resort to bitmaps. This is somewhat - * arbitrary, but it will be adjusted in runtime. 
- */ - ctl->extents_thresh = INIT_THRESHOLD; - - spin_lock_init(&pinned->tree_lock); - pinned->unit = 1; - pinned->start = 0; - pinned->private = NULL; - pinned->extents_thresh = 0; - pinned->op = &pinned_free_ino_op; -} - -int btrfs_save_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_path *path; - struct inode *inode; - struct btrfs_block_rsv *rsv; - struct extent_changeset *data_reserved = NULL; - u64 num_bytes; - u64 alloc_hint = 0; - int ret; - int prealloc; - bool retry = false; - - /* only fs tree and subvol/snap needs ino cache */ - if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID && - (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID || - root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) - return 0; - - /* Don't save inode cache if we are deleting this root */ - if (btrfs_root_refs(&root->root_item) == 0) - return 0; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - rsv = trans->block_rsv; - trans->block_rsv = &fs_info->trans_block_rsv; - - num_bytes = trans->bytes_reserved; - /* - * 1 item for inode item insertion if need - * 4 items for inode item update (in the worst case) - * 1 items for slack space if we need do truncation - * 1 item for free space object - * 3 items for pre-allocation - */ - trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10); - ret = btrfs_block_rsv_add(root, trans->block_rsv, - trans->bytes_reserved, - BTRFS_RESERVE_NO_FLUSH); - if (ret) - goto out; - trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid, - trans->bytes_reserved, 1); -again: - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) { - ret = PTR_ERR(inode); - goto out_release; - } - - if (IS_ERR(inode)) { - BUG_ON(retry); /* Logic error */ - retry = true; - - ret = create_free_ino_inode(root, trans, path); - if (ret) - goto out_release; - goto again; - } - - BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_put; - } - - if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(trans, NULL, inode); - if (ret) { - if (ret != -ENOSPC) - btrfs_abort_transaction(trans, ret); - goto out_put; - } - } - - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_state != BTRFS_CACHE_FINISHED) { - ret = -1; - spin_unlock(&root->ino_cache_lock); - goto out_put; - } - spin_unlock(&root->ino_cache_lock); - - spin_lock(&ctl->tree_lock); - prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; - prealloc = ALIGN(prealloc, PAGE_SIZE); - prealloc += ctl->total_bitmaps * PAGE_SIZE; - spin_unlock(&ctl->tree_lock); - - /* Just to make sure we have enough space */ - prealloc += 8 * PAGE_SIZE; - - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0, - prealloc); - if (ret) - goto out_put; - - ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, - prealloc, prealloc, &alloc_hint); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); - btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true); - goto out_put; - } - - ret = btrfs_write_out_ino_cache(root, trans, path, inode); - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); -out_put: - iput(inode); -out_release: - trace_btrfs_space_reservation(fs_info, "ino_cache", 
trans->transid, - trans->bytes_reserved, 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, - trans->bytes_reserved, NULL); -out: - trans->block_rsv = rsv; - trans->bytes_reserved = num_bytes; - - btrfs_free_path(path); - extent_changeset_free(data_reserved); - return ret; -} diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h deleted file mode 100644 index 629baf9aefb1..000000000000 --- a/fs/btrfs/inode-map.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef BTRFS_INODE_MAP_H -#define BTRFS_INODE_MAP_H - -void btrfs_init_free_ino_ctl(struct btrfs_root *root); -void btrfs_unpin_free_ino(struct btrfs_root *root); -void btrfs_return_ino(struct btrfs_root *root, u64 objectid); -int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid); -int btrfs_save_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans); - -#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2947a00d442..0ce42d52d53e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -45,7 +45,6 @@ #include "compression.h" #include "locking.h" #include "free-space-cache.h" -#include "inode-map.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" @@ -4214,12 +4213,6 @@ out_up_write: d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); - - /* the last ref */ - if (dest->ino_cache_inode) { - iput(dest->ino_cache_inode); - dest->ino_cache_inode = NULL; - } } return ret; @@ -5295,10 +5288,6 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans); } - if (!(root == fs_info->tree_root || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) - btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode))); - free_rsv: btrfs_free_block_rsv(fs_info, rsv); no_delete: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0b5e1df24906..703212ff50a5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -34,7 +34,6 @@ #include "print-tree.h" #include "volumes.h" #include "locking.h" -#include "inode-map.h" #include "backref.h" #include "rcu-string.h" #include "send.h" diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 6e7e5fe2e277..19b7db8b2117 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -18,7 +18,6 @@ #include "btrfs_inode.h" #include "async-thread.h" #include "free-space-cache.h" -#include "inode-map.h" #include "qgroup.h" #include "print-tree.h" #include "delalloc-space.h" diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7b2970c90ebe..455d924b8d62 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -338,7 +338,6 @@ enum { Opt_device, Opt_fatal_errors, Opt_flushoncommit, Opt_noflushoncommit, - Opt_inode_cache, Opt_noinode_cache, Opt_max_inline, Opt_barrier, Opt_nobarrier, Opt_datacow, Opt_nodatacow, @@ -371,6 +370,7 @@ enum { /* Deprecated options */ Opt_recovery, + Opt_inode_cache, Opt_noinode_cache, /* Debugging options */ Opt_check_integrity, @@ -880,14 +880,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } break; case Opt_inode_cache: - btrfs_warn(info, - "the 'inode_cache' option is deprecated and will have no effect from 5.11"); - btrfs_set_pending_and_info(info, INODE_MAP_CACHE, - "enabling inode map caching"); - break; case Opt_noinode_cache: - btrfs_clear_pending_and_info(info, INODE_MAP_CACHE, - "disabling inode map caching"); + btrfs_warn(info, + "the 'inode_cache' option is deprecated and has no effect since 5.11"); break; case Opt_clear_cache: btrfs_set_and_info(info, CLEAR_CACHE, @@ -1506,8 +1501,6 @@ static int 
btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",enospc_debug"); if (btrfs_test_opt(info, AUTO_DEFRAG)) seq_puts(seq, ",autodefrag"); - if (btrfs_test_opt(info, INODE_MAP_CACHE)) - seq_puts(seq, ",inode_cache"); if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 238e5a2c0696..6b7156712486 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -16,7 +16,6 @@ #include "transaction.h" #include "locking.h" #include "tree-log.h" -#include "inode-map.h" #include "volumes.h" #include "dev-replace.h" #include "qgroup.h" @@ -163,8 +162,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); - if (is_fstree(root->root_key.objectid)) - btrfs_unpin_free_ino(root); extent_io_tree_release(&root->dirty_log_pages); btrfs_qgroup_clean_swapped_blocks(root); } @@ -1342,8 +1339,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); - btrfs_save_ino_cache(root, trans); - /* see comments in should_cow_block() */ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_mb__after_atomic(); @@ -2436,10 +2431,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); btrfs_kill_all_delayed_nodes(root); - if (root->ino_cache_inode) { - iput(root->ino_cache_inode); - root->ino_cache_inode = NULL; - } if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) @@ -2460,16 +2451,6 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) if (!prev) return; - bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE; - if (prev & bit) - btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE); - prev &= ~bit; - - bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE; - if (prev & bit) - btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE); - prev &= ~bit; - bit = 1 << BTRFS_PENDING_COMMIT; if (prev & bit) btrfs_debug(fs_info, "pending commit done"); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 80cf496226c2..485267ee5bc6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -17,7 +17,6 @@ #include "backref.h" #include "compression.h" #include "qgroup.h" -#include "inode-map.h" #include "block-group.h" #include "space-info.h" -- cgit v1.2.3-59-g8ed1b From de53d892e5c51dfa0a158e812575a75a6c991f39 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:23 +0000 Subject: btrfs: fix race causing unnecessary inode logging during link and rename When we are doing a rename or a link operation for an inode that was logged in the previous transaction and that transaction is still committing, we have a time window where we incorrectly consider that the inode was logged previously in the current transaction and therefore decide to log it to update it in the log. 
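The essence of the fix, worked through step by step below, is to compare against the generation of the running transaction instead. A minimal sketch, with invented model_* types standing in for the btrfs structures:

  #include <stdbool.h>
  #include <stdint.h>

  struct model_inode  { uint64_t logged_trans; };  /* stands in for btrfs_inode */
  struct model_handle { uint64_t transid; };       /* stands in for btrfs_trans_handle */

  /*
   * last_trans_committed lags while the previous transaction is still
   * committing, so comparing against it can wrongly flag an inode logged
   * in that transaction. Comparing against the current transid cannot:
   * the inode needs logging for the new name only if it was logged in
   * this very transaction.
   */
  static bool needs_log_new_name(const struct model_inode *inode,
                                 const struct model_handle *trans)
  {
          return inode->logged_trans >= trans->transid;
  }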
The following steps give an example of how this happens during a link operation: 1) Inode X is logged in transaction 1000, so its logged_trans field is set to 1000; 2) Task A starts to commit transaction 1000; 3) The state of transaction 1000 is changed to TRANS_STATE_UNBLOCKED; 4) Task B starts a link operation for inode X, and as a consequence it starts transaction 1001; 5) Task A is still committing transaction 1000, therefore the value stored in fs_info->last_trans_committed is still 999; 6) Task B calls btrfs_log_new_name(); it reads a value of 999 from fs_info->last_trans_committed and, because the logged_trans field of inode X has a value of 1000, the function does not return immediately. Instead it proceeds to log the inode, which should not happen because the inode was logged in the previous transaction (1000) and not in the current one (1001). This is not a functional problem, just wasted time and space logging an inode that does not need to be logged, contributing to higher latency for link and rename operations. So fix this by comparing the inode's logged_trans field with the generation of the current transaction instead of comparing with the value stored in fs_info->last_trans_committed. This case is often hit when running dbench for a long enough duration, as it does lots of rename operations. This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit Performance results are mentioned in the change log of the last patch. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 485267ee5bc6..9ab05ebd8854 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6442,7 +6442,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, struct dentry *parent) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_log_ctx ctx; /* @@ -6456,8 +6455,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * if this inode hasn't been logged and directory we're renaming it * from hasn't been logged, we don't need to log it */ - if (inode->logged_trans <= fs_info->last_trans_committed && - (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) + if (inode->logged_trans < trans->transid && - (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) + (!old_dir || old_dir->logged_trans < trans->transid)) return; btrfs_init_log_ctx(&ctx, &inode->vfs_inode); -- cgit v1.2.3-59-g8ed1b From 5f96bfb7633c55b578c6b32f32624061f25010db Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:24 +0000 Subject: btrfs: fix race that results in logging old extents during a fast fsync When logging the extents of an inode during a fast fsync, we have a time window where we can log extents that are from the previous transaction and already persisted. This only makes us waste time unnecessarily.
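Before walking through the steps below, the gist of the fix can be sketched the same way (invented model_* names; the actual change compares em->generation with trans->transid):

  #include <stdbool.h>
  #include <stdint.h>

  struct model_extent_map { uint64_t generation; };
  struct model_handle     { uint64_t transid; };

  /* Extents whose generation predates the running transaction were
   * persisted by an earlier commit and need not be logged again
   * (prealloc extents beyond EOF are handled separately). */
  static bool must_log_extent(const struct model_extent_map *em,
                              const struct model_handle *trans)
  {
          return em->generation >= trans->transid;
  }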
The following sequence of steps shows how this can happen: 1) We are at transaction 1000; 2) An ordered extent E from inode I completes, that is, it has gone through btrfs_finish_ordered_io(), and the extent map's generation was set to 1000, the generation of the current transaction, when the extent was unpinned; 3) The commit for transaction 1000 is started by task A; 4) The task committing transaction 1000 sets the transaction state to unblocked, writes the dirty extent buffers and the super blocks, then unlocks tree_log_mutex; 5) Some change is made to inode I, resulting in the creation of a new transaction with a generation of 1001; 6) The transaction 1000 commit starts unpinning extents. At this point fs_info->last_trans_committed still has a value of 999; 7) Task B starts an fsync on inode I, and when it gets to btrfs_log_changed_extents() it sees the extent map for extent E in the list of modified extents. It sees the extent map has a generation of 1000 and fs_info->last_trans_committed has a value of 999, so it proceeds to log the respective file extent item and all the checksums covering its range. So we end up wasting time since the extent was already persisted and is reachable through the trees pointed to by the super block committed by transaction 1000. So just fix this by comparing the extent map's generation against the generation of the transaction handle - if it is smaller than the id in the handle, we know the extent was already persisted and we do not need to log it. This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit Performance results are mentioned in the change log of the last patch. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9ab05ebd8854..6ea7df1beee2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4415,14 +4415,12 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; - u64 test_gen; int ret = 0; int num = 0; INIT_LIST_HEAD(&extents); write_lock(&tree->lock); - test_gen = root->fs_info->last_trans_committed; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { list_del_init(&em->list); @@ -4438,7 +4436,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, goto process; } - if (em->generation <= test_gen) + if (em->generation < trans->transid) continue; /* We log prealloc extents beyond eof later.
*/ -- cgit v1.2.3-59-g8ed1b From 4d6221d7d83141d58ece6560e9cfd4cc92eab044 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:25 +0000 Subject: btrfs: fix race that causes unnecessary logging of ancestor inodes When logging an inode and checking if we need to log ancestors that are new, if the previous transaction is still committing we have a time window where we can unnecessarily log ancestor inodes that were created in the previous transaction. The race is described by the following steps: 1) We are at transaction 1000; 2) Directory inode X is created and its generation is set to 1000; 3) The commit for transaction 1000 is started by task A; 4) The task committing transaction 1000 sets the transaction state to unblocked, writes the dirty extent buffers and the super blocks, then unlocks tree_log_mutex; 5) Inode Y, a regular file, is created under directory inode X; this results in starting a new transaction with a generation of 1001; 6) The transaction 1000 commit is unpinning extents. At this point fs_info->last_trans_committed still has a value of 999; 7) Task B calls fsync on inode Y and gets a handle for transaction 1001; 8) Task B ends up at log_all_new_ancestors() and then, because inode Y has only one hard link, ends up at log_new_ancestors_fast(). There it reads a value of 999 from fs_info->last_trans_committed, and sees that the parent inode X has a generation of 1000, so we end up logging inode X: if (inode->generation > fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, LOG_INODE_EXISTS, ctx); (...) which is not necessary since it was created in the previous transaction, with a generation of 1000, and that transaction has already committed its super blocks - it's still unpinning extents so it has not yet updated fs_info->last_trans_committed from 999 to 1000. So this just causes us to spend more time logging and allocating and writing more tree blocks for the log tree. So fix this by comparing an inode's generation with the generation of the transaction our transaction handle refers to - if the inode's generation matches the generation of the current transaction then we know it is a new inode we need to log, otherwise don't log it. This case is often hit when running dbench for a long enough duration. This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit Performance results are mentioned in the change log of the last patch.
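A minimal sketch of the corrected predicate (invented names; the patch itself adjusts log_new_ancestors() and log_new_ancestors_fast()):

  #include <stdbool.h>
  #include <stdint.h>

  struct model_inode { uint64_t generation; };

  /* An ancestor only needs LOG_INODE_EXISTS logging if it was created in
   * the running transaction; inodes from an already-committing previous
   * transaction are reachable through that transaction's super blocks. */
  static bool ancestor_needs_logging(const struct model_inode *inode,
                                     uint64_t transid)
  {
          return inode->generation >= transid;
  }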
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6ea7df1beee2..f21bef62e349 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5834,7 +5834,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, while (true) { struct btrfs_fs_info *fs_info = root->fs_info; - const u64 last_committed = fs_info->last_trans_committed; struct extent_buffer *leaf = path->nodes[0]; int slot = path->slots[0]; struct btrfs_key search_key; @@ -5853,7 +5852,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation > last_committed) + if (BTRFS_I(inode)->generation >= trans->transid) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); @@ -5894,7 +5893,6 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct dentry *old_parent = NULL; struct super_block *sb = inode->vfs_inode.i_sb; int ret = 0; @@ -5908,7 +5906,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (root != inode->root) break; - if (inode->generation > fs_info->last_trans_committed) { + if (inode->generation >= trans->transid) { ret = btrfs_log_inode(trans, root, inode, LOG_INODE_EXISTS, ctx); if (ret) -- cgit v1.2.3-59-g8ed1b From 47d3db41e190ca4a9c6e4a848052f4c5ca633db1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:26 +0000 Subject: btrfs: fix race that makes inode logging fallback to transaction commit When logging an inode and the previous transaction is still committing, we have a time window where we can end up incorrectly think an inode has its last_unlink_trans field with a value greater than the last transaction committed, which results in the logging to fallback to a full transaction commit, which is usually much more expensive than doing a log commit. The race is described by the following steps: 1) We are at transaction 1000; 2) We modify an inode X (a directory) using transaction 1000 and set its last_unlink_trans field to 1000, because for example we removed one of its subdirectories; 3) We create a new inode Y with a dentry in inode X using transaction 1000, so its generation field is set to 1000; 4) The commit for transaction 1000 is started by task A; 5) The task committing transaction 1000 sets the transaction state to unblocked, writes the dirty extent buffers and the super blocks, then unlocks tree_log_mutex; 6) Some task starts a new transaction with a generation of 1001; 7) We do some modification to inode Y (using transaction 1001); 8) The transaction 1000 commit starts unpinning extents. At this point fs_info->last_trans_committed still has a value of 999; 9) Task B starts an fsync on inode Y, and gets a handle for transaction 1001. 
When it gets to check_parent_dirs_for_sync() it checks the ancestor dentries because the following check does not evaluate to true: if (S_ISREG(inode->vfs_inode.i_mode) && inode->generation <= last_committed && inode->last_unlink_trans <= last_committed) goto out; The generation value for inode Y is 1000 and last_committed, which has the value read from fs_info->last_trans_committed, has a value of 999, so that check evaluates to false and we proceed to check the ancestor inodes. Once we get to the first ancestor, inode X, we call btrfs_must_commit_transaction() on it, which evaluates to true: static bool btrfs_must_commit_transaction(...) { struct btrfs_fs_info *fs_info = inode->root->fs_info; bool ret = false; mutex_lock(&inode->log_mutex); if (inode->last_unlink_trans > fs_info->last_trans_committed) { /* * Make sure any commits to the log are forced to be full * commits. */ btrfs_set_log_full_commit(trans); ret = true; } (...) Because inode X's last_unlink_trans has a value of 1000 and fs_info->last_trans_committed still has a value of 999, it returns true to check_parent_dirs_for_sync(), making it return 1, which is propagated up to btrfs_sync_file(), causing it to fall back to a full transaction commit of transaction 1001. We should not have fallen back to commit transaction 1001, since inode X had last_unlink_trans set to 1000 and the super blocks for transaction 1000 were already written. So while not resulting in a functional problem, it leads to a lot more work and higher latencies for an fsync, since committing a transaction is usually more expensive than committing a log (if other filesystem changes happened under that transaction). A similar problem happens when logging directories, for the same reason: btrfs_must_commit_transaction() returns true on an inode whose last_unlink_trans has the generation of the previous transaction while that transaction is still committing, unpinning its freed extents. So fix this by comparing last_unlink_trans with the id of the current transaction instead of fs_info->last_trans_committed. This case is often hit when running dbench for a long enough duration, as it does lots of rename and rmdir operations (both update the last_unlink_trans field of an inode) and fsyncs of files and directories. This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit Performance results are mentioned in the change log of the last patch.
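Again the corrected predicate can be sketched minimally (invented names; the patch changes btrfs_must_commit_transaction() and check_parent_dirs_for_sync()):

  #include <stdbool.h>
  #include <stdint.h>

  struct model_inode { uint64_t last_unlink_trans; };

  /* Fall back to a full transaction commit only when the unlink was
   * recorded by the running transaction; unlinks from a previous,
   * already-committing transaction are covered by its super blocks. */
  static bool must_commit_transaction(const struct model_inode *inode,
                                      uint64_t transid)
  {
          return inode->last_unlink_trans >= transid;
  }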
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f21bef62e349..312704b20158 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5448,11 +5448,10 @@ out_unlock: static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; bool ret = false; mutex_lock(&inode->log_mutex); - if (inode->last_unlink_trans > fs_info->last_trans_committed) { + if (inode->last_unlink_trans >= trans->transid) { /* * Make sure any commits to the log are forced to be full * commits. @@ -5474,8 +5473,7 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, - struct super_block *sb, - u64 last_committed) + struct super_block *sb) { int ret = 0; struct dentry *old_parent = NULL; @@ -5487,8 +5485,8 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * and other fun in this file. */ if (S_ISREG(inode->vfs_inode.i_mode) && - inode->generation <= last_committed && - inode->last_unlink_trans <= last_committed) + inode->generation < trans->transid && + inode->last_unlink_trans < trans->transid) goto out; if (!S_ISDIR(inode->vfs_inode.i_mode)) { @@ -6023,7 +6021,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct super_block *sb; int ret = 0; - u64 last_committed = fs_info->last_trans_committed; bool log_dentries = false; sb = inode->vfs_inode.i_sb; @@ -6048,8 +6045,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = check_parent_dirs_for_sync(trans, inode, parent, sb, - last_committed); + ret = check_parent_dirs_for_sync(trans, inode, parent, sb); if (ret) goto end_no_trans; @@ -6079,8 +6075,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * and other fun in this file. */ if (S_ISREG(inode->vfs_inode.i_mode) && - inode->generation <= last_committed && - inode->last_unlink_trans <= last_committed) { + inode->generation < trans->transid && + inode->last_unlink_trans < trans->transid) { ret = 0; goto end_trans; } @@ -6129,7 +6125,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * but the file inode does not have a matching BTRFS_INODE_REF_KEY item * and has a link count of 2. */ - if (inode->last_unlink_trans > last_committed) { + if (inode->last_unlink_trans >= trans->transid) { ret = btrfs_log_all_parents(trans, inode, ctx); if (ret) goto end_trans; -- cgit v1.2.3-59-g8ed1b From 639bd575b7c7fa326abadd2ef3e374a5a24eb40b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 25 Nov 2020 12:19:27 +0000 Subject: btrfs: fix race leading to unnecessary transaction commit when logging inode When logging an inode we may often have to fallback to a full transaction commit, either because a new block group was allocated, there is some case we can not deal with without a transaction commit or some error like an ENOMEM happened. However after we fallback to a transaction commit, we have a time window where we can make the next attempt to log any inode commit the next transaction unnecessarily, adding additional overhead and increasing latency. 
A sequence of steps that leads to this issue is the following: 1) The current open transaction has a generation of 1000; 2) A new block group is allocated, and as a consequence we must make sure any attempts to commit a log fallback to a transaction commit, so btrfs_set_log_full_commit() is called from btrfs_make_block_group(). This sets fs_info->last_trans_log_full_commit to 1000; 3) Task A is holding a handle on transaction 1000 and tries to log inode X. Once it gets to start_log_trans(), it calls btrfs_need_log_full_commit() which returns true, since fs_info->last_trans_log_full_commit has a value of 1000. So we end up returning EAGAIN and propagating it up to btrfs_sync_file(), where we commit transaction 1000; 4) The transaction commit task (task A) sets the transaction state to unblocked (TRANS_STATE_UNBLOCKED); 5) Some other task, task B, starts a new transaction with a generation of 1001; 6) Some stuff is done with transaction 1001, some btree blocks COWed, etc; 7) Transaction 1000 has not fully committed yet, we are still writing all the extent buffers it created; 8) Some new task, task C, starts an fsync of inode Y, gets a handle for transaction 1001, and it gets to btrfs_log_inode_parent() which does the following check: if (fs_info->last_trans_log_full_commit > last_committed) { ret = 1; goto end_no_trans; } At that point last_trans_log_full_commit has a value of 1000 and last_committed (value of fs_info->last_trans_committed) has a value of 999, since transaction 1000 has not yet committed - it is either still writing out dirty extent buffers, its super blocks or unpinning extents. As a consequence we return 1, which gets propagated up to btrfs_sync_file(), which will then call btrfs_commit_transaction() for transaction 1001. As a consequence we have an unnecessary second transaction commit, we previously committed transaction 1000 and now commit transaction 1001 as well, resulting in more overhead and increased latency. So fix this double transaction commit issue simply by removing that check, because all we need to do is wait for the previous transaction to finish its commit, which we already do later when starting the log transaction at start_log_trans(), because there we acquire the tree_log_mutex lock, which is held by a transaction commit and only released after the transaction commits its super blocks. Another issue that check has is that it reads last_trans_log_full_commit without using READ_ONCE(), which is incorrect since that member of struct btrfs_fs_info is always updated with WRITE_ONCE() through the helper btrfs_set_log_full_commit(). This double transaction commit issue can actually be triggered quite often in long runs of dbench, since besides the creation of new block groups that force inode logging to fallback to a transaction commit, there are cases where dbench asks to fsync a directory which had files in it that were previously renamed or subdirectories that were removed, resulting in the inode logging to fallback to a full transaction commit. 
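The READ_ONCE()/WRITE_ONCE() point made above can be modeled in portable C with relaxed atomics (an analogy only, with invented names; the kernel helpers are btrfs_set_log_full_commit() and btrfs_need_log_full_commit()):

  #include <stdatomic.h>
  #include <stdint.h>

  static _Atomic uint64_t last_trans_log_full_commit;

  /* Writer side, e.g. when a new block group forces a full commit. */
  static void set_log_full_commit(uint64_t transid)
  {
          atomic_store_explicit(&last_trans_log_full_commit, transid,
                                memory_order_relaxed);
  }

  /* Reader side: every load must be marked too, otherwise the compiler
   * may tear or cache the read - the inconsistency the commit message
   * points out in the check that is being removed. */
  static int need_log_full_commit(uint64_t transid)
  {
          return atomic_load_explicit(&last_trans_log_full_commit,
                                      memory_order_relaxed) == transid;
  }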
This patch belongs to a patch set that is comprised of the following patches:

  btrfs: fix race causing unnecessary inode logging during link and rename
  btrfs: fix race that results in logging old extents during a fast fsync
  btrfs: fix race that causes unnecessary logging of ancestor inodes
  btrfs: fix race that makes inode logging fallback to transaction commit
  btrfs: fix race leading to unnecessary transaction commit when logging inode
  btrfs: do not block inode logging for so long during transaction commit

Performance results are mentioned in the change log of the last patch.

Signed-off-by: Filipe Manana Signed-off-by: David Sterba
---
fs/btrfs/tree-log.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 312704b20158..582911a36ee0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6030,16 +6030,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - /* - * The prev transaction commit doesn't complete, we need do - * full commit by ourselves. - */ - if (fs_info->last_trans_log_full_commit > - fs_info->last_trans_committed) { - ret = 1; - goto end_no_trans; - } - if (btrfs_root_refs(&root->root_item) == 0) { ret = 1; goto end_no_trans;
-- cgit v1.2.3-59-g8ed1b

From 47876f7ceffa0e6af7476e052b3c061f1f2c1d9f Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Wed, 25 Nov 2020 12:19:28 +0000
Subject: btrfs: do not block inode logging for so long during transaction commit

Early on during a transaction commit we acquire the tree_log_mutex and hold it until after we write the super blocks. But before writing the extent buffers dirtied by the transaction and the super blocks, we unblock the transaction by setting its state to TRANS_STATE_UNBLOCKED and setting fs_info->running_transaction to NULL. This means that after that and before writing the super blocks, new transactions can start. However, if any transaction wants to log an inode, it will block waiting for the transaction commit to write its dirty extent buffers and the super blocks, because the tree_log_mutex is only released after those operations are complete and starting a new log transaction blocks on that mutex (at start_log_trans()).

Writing the dirty extent buffers and the super blocks can take a very significant amount of time to complete, but we could allow the tasks wanting to log an inode to proceed with most of their steps:

1) create the log trees;
2) log metadata in the trees;
3) write their dirty extent buffers.

They only need to wait for the previous transaction commit to complete (write its super blocks) before they attempt to write their own super blocks, otherwise we could end up with a corrupt filesystem after a crash.

So change start_log_trans() to use the root tree's log_mutex to serialize the creation of the log root tree instead of using the tree_log_mutex, and make btrfs_sync_log() acquire the tree_log_mutex before writing the super blocks. This allows inode logging to wait much less time when there is a previous transaction still committing, often not having to wait at all, as by the time we try to sync the log the previous transaction has already written its super blocks.
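The resulting locking scheme can be sketched in userspace. The program below is an illustration only, not kernel code: the mutex, the helper names and the sleeps are stand-ins for tree_log_mutex, write_all_supers() and the slow parts of a commit, showing that the log sync only serializes with the previous commit at the final super block write.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t tree_log_mutex = PTHREAD_MUTEX_INITIALIZER;

static void write_all_supers(const char *who)
{
	printf("%s: writing super blocks\n", who);
	usleep(50 * 1000); /* pretend this is slow */
}

/* Previous transaction commit: holds the mutex across its super block write. */
static void *transaction_commit(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&tree_log_mutex);
	usleep(100 * 1000); /* writing the dirty extent buffers, etc. */
	write_all_supers("commit");
	pthread_mutex_unlock(&tree_log_mutex);
	return NULL;
}

/* Log sync: does all of its heavy work without touching the mutex. */
static void *log_sync(void *arg)
{
	(void)arg;
	printf("log sync: created log trees, logged inode, wrote log buffers\n");
	/* Only the final super block update waits for the previous commit. */
	pthread_mutex_lock(&tree_log_mutex);
	write_all_supers("log sync");
	pthread_mutex_unlock(&tree_log_mutex);
	return NULL;
}

int main(void)
{
	pthread_t commit_thread, log_thread;

	pthread_create(&commit_thread, NULL, transaction_commit, NULL);
	pthread_create(&log_thread, NULL, log_sync, NULL);
	pthread_join(commit_thread, NULL);
	pthread_join(log_thread, NULL);
	return 0;
}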
This patch belongs to a patch set that is comprised of the following patches:

  btrfs: fix race causing unnecessary inode logging during link and rename
  btrfs: fix race that results in logging old extents during a fast fsync
  btrfs: fix race that causes unnecessary logging of ancestor inodes
  btrfs: fix race that makes inode logging fallback to transaction commit
  btrfs: fix race leading to unnecessary transaction commit when logging inode
  btrfs: do not block inode logging for so long during transaction commit

The following script that uses dbench was used to measure the impact of the whole patchset:

  $ cat test-dbench.sh
  #!/bin/bash

  DEV=/dev/nvme0n1
  MNT=/mnt/btrfs
  MOUNT_OPTIONS="-o ssd"

  echo "performance" | \
      tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

  mkfs.btrfs -f -m single -d single $DEV
  mount $MOUNT_OPTIONS $DEV $MNT

  dbench -D $MNT -t 300 64

  umount $MNT

The test was run on a machine with 12 cores, 64G of RAM, using an NVMe device and a non-debug kernel configuration (Debian's default).

Before patch set:

  Operation      Count    AvgLat    MaxLat
  ----------------------------------------
  NTCreateX    11277211     0.250    85.340
  Close         8283172     0.002     6.479
  Rename         477515     1.935    86.026
  Unlink        2277936     0.770    87.071
  Deltree           256    15.732    81.379
  Mkdir             128     0.003     0.009
  Qpathinfo    10221180     0.056    44.404
  Qfileinfo     1789967     0.002     4.066
  Qfsinfo       1874399     0.003     9.176
  Sfileinfo      918589     0.061    10.247
  Find          3951758     0.341    54.040
  WriteX        5616547     0.047    85.079
  ReadX        17676028     0.005     9.704
  LockX           36704     0.003     1.800
  UnlockX         36704     0.002     0.687
  Flush          790541    14.115   676.236

  Throughput 1179.19 MB/sec  64 clients  64 procs  max_latency=676.240 ms

After patch set:

  Operation      Count    AvgLat    MaxLat
  ----------------------------------------
  NTCreateX    12687926     0.171    86.526
  Close         9320780     0.002     8.063
  Rename         537253     1.444    78.576
  Unlink        2561827     0.559    87.228
  Deltree           374    11.499    73.549
  Mkdir             187     0.003     0.005
  Qpathinfo    11500300     0.061    36.801
  Qfileinfo     2017118     0.002     7.189
  Qfsinfo       2108641     0.003     4.825
  Sfileinfo     1033574     0.008     8.065
  Find          4446553     0.408    47.835
  WriteX        6335667     0.045    84.388
  ReadX        19887312     0.003     9.215
  LockX           41312     0.003     1.394
  UnlockX         41312     0.002     1.425
  Flush          889233    13.014   623.259

  Throughput 1339.32 MB/sec  64 clients  64 procs  max_latency=623.265 ms

+12.7% throughput, -8.2% max latency

Signed-off-by: Filipe Manana Signed-off-by: David Sterba
---
fs/btrfs/ctree.h | 2 +- fs/btrfs/tree-log.c | 56 +++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 40 insertions(+), 18 deletions(-) (limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a0c588e5fc89..331554fd6fd1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1038,7 +1038,7 @@ enum { BTRFS_ROOT_DEAD_RELOC_TREE, /* Mark dead root stored on device whose cleanup needs to be resumed */ BTRFS_ROOT_DEAD_TREE, - /* The root has a log tree. Used only for subvolume roots. */ + /* The root has a log tree. Used for subvolume roots and the tree root. */ BTRFS_ROOT_HAS_LOG_TREE, /* Qgroup flushing is in progress */ BTRFS_ROOT_QGROUP_FLUSHING, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 582911a36ee0..254c2ee43aae 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -138,8 +138,25 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *tree_root = fs_info->tree_root; int ret = 0; + /* + * First check if the log root tree was already created. If not, create + * it before locking the root's log_mutex, just to keep lockdep happy.
+ */ + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) { + mutex_lock(&tree_root->log_mutex); + if (!fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, fs_info); + if (!ret) + set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); + } + mutex_unlock(&tree_root->log_mutex); + if (ret) + return ret; + } + mutex_lock(&root->log_mutex); if (root->log_root) { @@ -155,13 +172,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } } else { - mutex_lock(&fs_info->tree_log_mutex); - if (!fs_info->log_root_tree) - ret = btrfs_init_log_root_tree(trans, fs_info); - mutex_unlock(&fs_info->tree_log_mutex); - if (ret) - goto out; - ret = btrfs_add_log_tree(trans, root); if (ret) goto out; @@ -3021,6 +3031,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int log_transid = 0; struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; + u64 log_root_start; + u64 log_root_level; mutex_lock(&root->log_mutex); log_transid = ctx->log_transid; @@ -3198,22 +3210,31 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } - btrfs_set_super_log_root(fs_info->super_for_commit, - log_root_tree->node->start); - btrfs_set_super_log_root_level(fs_info->super_for_commit, - btrfs_header_level(log_root_tree->node)); - + log_root_start = log_root_tree->node->start; + log_root_level = btrfs_header_level(log_root_tree->node); log_root_tree->log_transid++; mutex_unlock(&log_root_tree->log_mutex); /* - * Nobody else is going to jump in and write the ctree - * super here because the log_commit atomic below is protecting - * us. We must be called with a transaction handle pinning - * the running transaction open, so a full commit can't hop - * in and cause problems either. + * Here we are guaranteed that nobody is going to write the superblock + * for the current transaction before us, and that we do not write + * our superblock before the previous transaction finishes its commit + * and writes its superblock, because: + * + * 1) We are holding a handle on the current transaction, so nobody + * can commit it until we release the handle; + * + * 2) Before writing our superblock we acquire the tree_log_mutex, so + * if the previous transaction is still committing, and hasn't yet + * written its superblock, we wait for it to do it, because a + * transaction commit acquires the tree_log_mutex when the commit + * begins and releases it only after writing its superblock. */ + mutex_lock(&fs_info->tree_log_mutex); + btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); + btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); ret = write_all_supers(fs_info, 1); + mutex_unlock(&fs_info->tree_log_mutex); if (ret) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); @@ -3298,6 +3319,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, if (fs_info->log_root_tree) { free_log_tree(trans, fs_info->log_root_tree); fs_info->log_root_tree = NULL; + clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); } return 0; }
-- cgit v1.2.3-59-g8ed1b
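The create-once pattern that the new start_log_trans() uses above, a cheap flag test followed by a mutex-protected re-check before creating the shared log root tree, can be sketched in userspace as well. In the sketch below the flag, mutex and function names only stand in for BTRFS_ROOT_HAS_LOG_TREE, the tree root's log_mutex and btrfs_init_log_root_tree(); none of this is kernel API, and the C11 atomic_bool plays the role that test_bit()/set_bit() play in the kernel.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool has_log_tree;  /* stand-in for BTRFS_ROOT_HAS_LOG_TREE */
static void *log_root_tree;       /* stand-in for fs_info->log_root_tree */

static int init_log_root_tree(void)
{
	log_root_tree = malloc(64); /* pretend to allocate the tree */
	return log_root_tree ? 0 : -1;
}

static int start_log_trans(void)
{
	int ret = 0;

	/* Fast path: the flag is already set, no lock taken at all. */
	if (!atomic_load(&has_log_tree)) {
		pthread_mutex_lock(&log_mutex);
		/* Re-check under the lock: another task may have won the race. */
		if (!log_root_tree) {
			ret = init_log_root_tree();
			if (!ret)
				atomic_store(&has_log_tree, true);
		}
		pthread_mutex_unlock(&log_mutex);
	}
	return ret;
}

int main(void)
{
	printf("first call:  %d\n", start_log_trans()); /* creates the tree */
	printf("second call: %d\n", start_log_trans()); /* fast path */
	free(log_root_tree);
	return 0;
}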