aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/backref.c39
-rw-r--r--fs/btrfs/btrfs_inode.h8
-rw-r--r--fs/btrfs/check-integrity.c6
-rw-r--r--fs/btrfs/compression.c8
-rw-r--r--fs/btrfs/ctree.c85
-rw-r--r--fs/btrfs/ctree.h56
-rw-r--r--fs/btrfs/delayed-inode.c41
-rw-r--r--fs/btrfs/delayed-inode.h4
-rw-r--r--fs/btrfs/delayed-ref.c107
-rw-r--r--fs/btrfs/delayed-ref.h10
-rw-r--r--fs/btrfs/dev-replace.c64
-rw-r--r--fs/btrfs/dev-replace.h8
-rw-r--r--fs/btrfs/dir-item.c8
-rw-r--r--fs/btrfs/disk-io.c24
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c461
-rw-r--r--fs/btrfs/extent_io.c45
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c32
-rw-r--r--fs/btrfs/extent_map.h4
-rw-r--r--fs/btrfs/file.c45
-rw-r--r--fs/btrfs/free-space-cache.c48
-rw-r--r--fs/btrfs/inode.c147
-rw-r--r--fs/btrfs/ioctl.c18
-rw-r--r--fs/btrfs/qgroup.c455
-rw-r--r--fs/btrfs/qgroup.h8
-rw-r--r--fs/btrfs/ref-verify.c8
-rw-r--r--fs/btrfs/relocation.c74
-rw-r--r--fs/btrfs/scrub.c34
-rw-r--r--fs/btrfs/send.c24
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/tests/extent-io-tests.c10
-rw-r--r--fs/btrfs/tests/extent-map-tests.c4
-rw-r--r--fs/btrfs/transaction.c40
-rw-r--r--fs/btrfs/tree-checker.c14
-rw-r--r--fs/btrfs/tree-log.c91
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c117
-rw-r--r--fs/btrfs/volumes.h9
39 files changed, 1353 insertions, 819 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ae750b1574a2..68ebe188446a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -112,11 +112,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
}
struct preftree {
- struct rb_root root;
+ struct rb_root_cached root;
unsigned int count;
};
-#define PREFTREE_INIT { .root = RB_ROOT, .count = 0 }
+#define PREFTREE_INIT { .root = RB_ROOT_CACHED, .count = 0 }
struct preftrees {
struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
@@ -225,14 +225,15 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
struct prelim_ref *newref,
struct share_check *sc)
{
- struct rb_root *root;
+ struct rb_root_cached *root;
struct rb_node **p;
struct rb_node *parent = NULL;
struct prelim_ref *ref;
int result;
+ bool leftmost = true;
root = &preftree->root;
- p = &root->rb_node;
+ p = &root->rb_root.rb_node;
while (*p) {
parent = *p;
@@ -242,6 +243,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
p = &(*p)->rb_left;
} else if (result > 0) {
p = &(*p)->rb_right;
+ leftmost = false;
} else {
/* Identical refs, merge them and free @newref */
struct extent_inode_elem *eie = ref->inode_list;
@@ -272,7 +274,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
preftree->count++;
trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
rb_link_node(&newref->rbnode, parent, p);
- rb_insert_color(&newref->rbnode, root);
+ rb_insert_color_cached(&newref->rbnode, root, leftmost);
}
/*
@@ -283,11 +285,11 @@ static void prelim_release(struct preftree *preftree)
{
struct prelim_ref *ref, *next_ref;
- rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
- rbnode)
+ rbtree_postorder_for_each_entry_safe(ref, next_ref,
+ &preftree->root.rb_root, rbnode)
free_pref(ref);
- preftree->root = RB_ROOT;
+ preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
}
@@ -627,7 +629,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
* freeing the entire indirect tree when we're done. In some test
* cases, the tree can grow quite large (~200k objects).
*/
- while ((rnode = rb_first(&preftrees->indirect.root))) {
+ while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
struct prelim_ref *ref;
ref = rb_entry(rnode, struct prelim_ref, rbnode);
@@ -637,7 +639,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
goto out;
}
- rb_erase(&ref->rbnode, &preftrees->indirect.root);
+ rb_erase_cached(&ref->rbnode, &preftrees->indirect.root);
preftrees->indirect.count--;
if (ref->count == 0) {
@@ -717,9 +719,9 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
struct preftree *tree = &preftrees->indirect_missing_keys;
struct rb_node *node;
- while ((node = rb_first(&tree->root))) {
+ while ((node = rb_first_cached(&tree->root))) {
ref = rb_entry(node, struct prelim_ref, rbnode);
- rb_erase(node, &tree->root);
+ rb_erase_cached(node, &tree->root);
BUG_ON(ref->parent); /* should not be a direct ref */
BUG_ON(ref->key_for_search.type);
@@ -769,7 +771,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
spin_lock(&head->lock);
- for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
+ for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
ref_node);
if (node->seq > seq)
@@ -1229,14 +1231,14 @@ again:
if (ret)
goto out;
- WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
extent_item_pos, total_refs, sc, ignore_offset);
if (ret)
goto out;
- WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root));
/*
* This walks the tree of merged and resolved refs. Tree blocks are
@@ -1245,7 +1247,7 @@ again:
*
* We release the entire tree in one go before returning.
*/
- node = rb_first(&preftrees.direct.root);
+ node = rb_first_cached(&preftrees.direct.root);
while (node) {
ref = rb_entry(node, struct prelim_ref, rbnode);
node = rb_next(&ref->rbnode);
@@ -1468,7 +1470,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
struct seq_list elem = SEQ_LIST_INIT(elem);
int ret = 0;
struct share_check shared = {
- .root_objectid = root->objectid,
+ .root_objectid = root->root_key.objectid,
.inum = inum,
.share_count = 0,
};
@@ -2031,7 +2033,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
/* path must be released before calling iterate()! */
btrfs_debug(fs_root->fs_info,
"following ref at offset %u for inode %llu in tree %llu",
- cur, found_key.objectid, fs_root->objectid);
+ cur, found_key.objectid,
+ fs_root->root_key.objectid);
ret = iterate(parent, name_len,
(unsigned long)(iref + 1), eb, ctx);
if (ret)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..97d91e55b70a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -206,7 +206,7 @@ static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
static inline unsigned long btrfs_inode_hash(u64 objectid,
const struct btrfs_root *root)
{
- u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+ u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME);
#if BITS_PER_LONG == 32
h = (h >> 32) ^ (h & 0xffffffff);
@@ -339,15 +339,15 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
struct btrfs_root *root = inode->root;
/* Output minus objectid, which is more meaningful */
- if (root->objectid >= BTRFS_LAST_FREE_OBJECTID)
+ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d",
- root->objectid, btrfs_ino(inode),
+ root->root_key.objectid, btrfs_ino(inode),
logical_start, csum, csum_expected, mirror_num);
else
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d",
- root->objectid, btrfs_ino(inode),
+ root->root_key.objectid, btrfs_ino(inode),
logical_start, csum, csum_expected, mirror_num);
}
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 833cf3c35b4d..2e43fba44035 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1594,6 +1594,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
{
unsigned int num_pages;
unsigned int i;
+ size_t size;
u64 dev_bytenr;
int ret;
@@ -1608,9 +1609,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
PAGE_SHIFT;
- block_ctx->mem_to_free = kcalloc(sizeof(*block_ctx->datav) +
- sizeof(*block_ctx->pagev),
- num_pages, GFP_NOFS);
+ size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
+ block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
if (!block_ctx->mem_to_free)
return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9bfa66592aa7..2955a4ea2fa8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -437,10 +437,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&mapping->i_pages, pg_index);
- rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page)) {
+ page = xa_load(&mapping->i_pages, pg_index);
+ if (page && !xa_is_value(page)) {
misses++;
if (misses > 4)
break;
@@ -528,7 +526,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *tree;
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned long compressed_len;
@@ -545,7 +542,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int faili = 0;
u32 *sums;
- tree = &BTRFS_I(inode)->io_tree;
em_tree = &BTRFS_I(inode)->extent_tree;
/* we need the actual starting offset of this extent in the file */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d436fb4c002e..539901fb5165 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -52,42 +52,6 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
}
}
-/*
- * reset all the locked nodes in the patch to spinning locks.
- *
- * held is used to keep lockdep happy, when lockdep is enabled
- * we set held to a blocking lock before we go around and
- * retake all the spinlocks in the path. You can safely use NULL
- * for held
- */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
- struct extent_buffer *held, int held_rw)
-{
- int i;
-
- if (held) {
- btrfs_set_lock_blocking_rw(held, held_rw);
- if (held_rw == BTRFS_WRITE_LOCK)
- held_rw = BTRFS_WRITE_LOCK_BLOCKING;
- else if (held_rw == BTRFS_READ_LOCK)
- held_rw = BTRFS_READ_LOCK_BLOCKING;
- }
- btrfs_set_path_blocking(p);
-
- for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
- if (p->nodes[i] && p->locks[i]) {
- btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
- if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
- p->locks[i] = BTRFS_WRITE_LOCK;
- else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
- p->locks[i] = BTRFS_READ_LOCK;
- }
- }
-
- if (held)
- btrfs_clear_lock_blocking_rw(held, held_rw);
-}
-
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
@@ -207,7 +171,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root)
spin_lock(&fs_info->trans_lock);
if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
/* Want the extent tree to be the last on the list */
- if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
list_move_tail(&root->dirty_list,
&fs_info->dirty_cowonly_roots);
else
@@ -1050,9 +1014,26 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
+ /*
+ * If we are COWing a node/leaf from the extent, chunk or device trees,
+ * make sure that we do not finish block group creation of pending block
+ * groups. We do this to avoid a deadlock.
+ * COWing can result in allocation of a new chunk, and flushing pending
+ * block groups (btrfs_create_pending_block_groups()) can be triggered
+ * when finishing allocation of a new chunk. Creation of a pending block
+ * group modifies the extent, chunk and device trees, therefore we could
+ * deadlock with ourselves since we are holding a lock on an extent
+ * buffer that btrfs_create_pending_block_groups() may try to COW later.
+ */
+ if (root == fs_info->extent_root ||
+ root == fs_info->chunk_root ||
+ root == fs_info->dev_root)
+ trans->can_flush_pending_bgs = false;
+
cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, &disk_key, level,
search_start, empty_size);
+ trans->can_flush_pending_bgs = true;
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1306,7 +1287,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
}
}
- btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1815,8 +1795,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
int orig_slot = path->slots[level];
u64 orig_ptr;
- if (level == 0)
- return 0;
+ ASSERT(level > 0);
mid = path->nodes[level];
@@ -2483,7 +2462,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
reada_for_balance(fs_info, p, level);
sret = split_node(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(sret > 0);
if (sret) {
@@ -2504,7 +2482,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
reada_for_balance(fs_info, p, level);
sret = balance_level(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL, 0);
if (sret) {
ret = sret;
@@ -2789,7 +2766,10 @@ again:
}
cow_done:
p->nodes[level] = b;
- btrfs_clear_path_blocking(p, NULL, 0);
+ /*
+ * Leave path with blocking locks to avoid massive
+ * lock context switch, this is made on purpose.
+ */
/*
* we have a lock on b and as long as we aren't changing
@@ -2871,8 +2851,6 @@ cow_done:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_WRITE_LOCK);
}
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
@@ -2880,8 +2858,6 @@ cow_done:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_READ_LOCK);
}
p->locks[level] = BTRFS_READ_LOCK;
}
@@ -2900,7 +2876,6 @@ cow_done:
btrfs_set_path_blocking(p);
err = split_leaf(trans, root, key,
p, ins_len, ret == 0);
- btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(err > 0);
if (err) {
@@ -2910,7 +2885,7 @@ cow_done:
}
if (!p->search_for_split)
unlock_up(p, level, lowest_unlock,
- min_write_lock_level, &write_lock_level);
+ min_write_lock_level, NULL);
goto done;
}
}
@@ -2961,13 +2936,16 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
again:
b = get_old_root(root, time_seq);
+ if (!b) {
+ ret = -EIO;
+ goto done;
+ }
level = btrfs_header_level(b);
p->locks[level] = BTRFS_READ_LOCK;
while (b) {
level = btrfs_header_level(b);
p->nodes[level] = b;
- btrfs_clear_path_blocking(p, NULL, 0);
/*
* we have a lock on b and as long as we aren't changing
@@ -3013,8 +2991,6 @@ again:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_READ_LOCK);
}
b = tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
@@ -5198,7 +5174,6 @@ find_next_key:
path->locks[level - 1] = BTRFS_READ_LOCK;
path->nodes[level - 1] = cur;
unlock_up(path, level, 1, 0, NULL);
- btrfs_clear_path_blocking(path, NULL, 0);
}
out:
path->keep_locks = keep_locks;
@@ -5783,8 +5758,6 @@ again:
if (!ret) {
btrfs_set_path_blocking(path);
btrfs_tree_read_lock(next);
- btrfs_clear_path_blocking(path, next,
- BTRFS_READ_LOCK);
}
next_rw_lock = BTRFS_READ_LOCK;
}
@@ -5820,8 +5793,6 @@ again:
if (!ret) {
btrfs_set_path_blocking(path);
btrfs_tree_read_lock(next);
- btrfs_clear_path_blocking(path, next,
- BTRFS_READ_LOCK);
}
next_rw_lock = BTRFS_READ_LOCK;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..68ca41dbbef3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -41,12 +41,6 @@ extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
struct btrfs_ordered_sum;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-#define STATIC noinline
-#else
-#define STATIC static noinline
-#endif
-
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
#define BTRFS_MAX_MIRRORS 3
@@ -367,11 +361,13 @@ struct btrfs_dev_replace {
struct mutex lock_finishing_cancel_unmount;
rwlock_t lock;
- atomic_t read_locks;
atomic_t blocking_readers;
wait_queue_head_t read_lock_wq;
struct btrfs_scrub_progress scrub_progress;
+
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
};
/* For raid type sysfs entries */
@@ -1094,9 +1090,6 @@ struct btrfs_fs_info {
/* device replace state */
struct btrfs_dev_replace dev_replace;
- struct percpu_counter bio_counter;
- wait_queue_head_t replace_wait;
-
struct semaphore uuid_tree_rescan_sem;
/* Used to reclaim the metadata space in the background. */
@@ -1202,18 +1195,12 @@ struct btrfs_root {
int last_log_commit;
pid_t log_start_pid;
- u64 objectid;
u64 last_trans;
u32 type;
u64 highest_objectid;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
- u64 alloc_bytenr;
-#endif
-
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
@@ -1286,6 +1273,10 @@ struct btrfs_root {
spinlock_t qgroup_meta_rsv_lock;
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ u64 alloc_bytenr;
+#endif
};
struct btrfs_file_private {
@@ -2607,10 +2598,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
@@ -2771,7 +2760,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
- int update_size);
+ bool update_size);
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor);
@@ -2877,8 +2866,6 @@ void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p,
- struct extent_buffer *held, int held_rw);
void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -3021,8 +3008,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
/* dir-item.c */
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const char *name, int name_len);
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *name,
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
int name_len, struct btrfs_inode *dir,
struct btrfs_key *location, u8 type, u64 index);
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
@@ -3180,8 +3166,8 @@ void __cold btrfs_destroy_cachep(void);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root, int *was_new);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 end, int create);
+ struct page *page, size_t pg_offset,
+ u64 start, u64 end, int create);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
@@ -3201,9 +3187,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
extern const struct dentry_operations btrfs_dentry_operations;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
-#endif
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3716,18 +3699,19 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode);
void btrfs_test_destroy_inode(struct inode *inode);
-#endif
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
- &fs_info->fs_state)))
- return 1;
-#endif
+ return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+}
+#else
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+{
return 0;
}
+#endif
static inline void cond_wake_up(struct wait_queue_head *wq)
{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f51b509f2d9b..c669f250d4a0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -42,8 +42,8 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
refcount_set(&delayed_node->refs, 0);
- delayed_node->ins_root = RB_ROOT;
- delayed_node->del_root = RB_ROOT;
+ delayed_node->ins_root = RB_ROOT_CACHED;
+ delayed_node->del_root = RB_ROOT_CACHED;
mutex_init(&delayed_node->mutex);
INIT_LIST_HEAD(&delayed_node->n_list);
INIT_LIST_HEAD(&delayed_node->p_list);
@@ -390,7 +390,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
struct btrfs_delayed_node *delayed_node,
struct btrfs_key *key)
{
- return __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+ return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
NULL, NULL);
}
@@ -400,9 +400,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
{
struct rb_node **p, *node;
struct rb_node *parent_node = NULL;
- struct rb_root *root;
+ struct rb_root_cached *root;
struct btrfs_delayed_item *item;
int cmp;
+ bool leftmost = true;
if (action == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
@@ -410,7 +411,7 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
root = &delayed_node->del_root;
else
BUG();
- p = &root->rb_node;
+ p = &root->rb_root.rb_node;
node = &ins->rb_node;
while (*p) {
@@ -419,16 +420,18 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
rb_node);
cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
- if (cmp < 0)
+ if (cmp < 0) {
p = &(*p)->rb_right;
- else if (cmp > 0)
+ leftmost = false;
+ } else if (cmp > 0) {
p = &(*p)->rb_left;
- else
+ } else {
return -EEXIST;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
ins->delayed_node = delayed_node;
ins->ins_or_del = action;
@@ -468,7 +471,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
{
- struct rb_root *root;
+ struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
@@ -482,7 +485,7 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
else
root = &delayed_item->delayed_node->del_root;
- rb_erase(&delayed_item->rb_node, root);
+ rb_erase_cached(&delayed_item->rb_node, root);
delayed_item->delayed_node->count--;
finish_one_item(delayed_root);
@@ -503,7 +506,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
struct rb_node *p;
struct btrfs_delayed_item *item = NULL;
- p = rb_first(&delayed_node->ins_root);
+ p = rb_first_cached(&delayed_node->ins_root);
if (p)
item = rb_entry(p, struct btrfs_delayed_item, rb_node);
@@ -516,7 +519,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
struct rb_node *p;
struct btrfs_delayed_item *item = NULL;
- p = rb_first(&delayed_node->del_root);
+ p = rb_first_cached(&delayed_node->del_root);
if (p)
item = rb_entry(p, struct btrfs_delayed_item, rb_node);
@@ -559,7 +562,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
* reserved space when starting a transaction. So no need to reserve
* qgroup space here.
*/
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid,
@@ -647,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
return ret;
}
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_inode",
btrfs_ino(inode), num_bytes, 1);
@@ -762,9 +765,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
i++;
}
- /* reset all the locked nodes in the patch to spinning locks. */
- btrfs_clear_path_blocking(path, NULL, 0);
-
/* insert the keys of the items */
setup_items_for_insert(root, path, keys, data_size,
total_data_size, total_size, nitems);
@@ -1462,7 +1462,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- name_len, name, delayed_node->root->objectid,
+ name_len, name, delayed_node->root->root_key.objectid,
delayed_node->inode_id, ret);
BUG();
}
@@ -1533,7 +1533,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- index, node->root->objectid, node->inode_id, ret);
+ index, node->root->root_key.objectid,
+ node->inode_id, ret);
BUG();
}
mutex_unlock(&node->mutex);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 33536cd681d4..74ae226ffaf0 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -50,8 +50,8 @@ struct btrfs_delayed_node {
* is waiting to be dealt with by the async worker.
*/
struct list_head p_list;
- struct rb_root ins_root;
- struct rb_root del_root;
+ struct rb_root_cached ins_root;
+ struct rb_root_cached del_root;
struct mutex mutex;
struct btrfs_inode_item inode_item;
refcount_t refs;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 62ff545ba1f7..9301b3ad9217 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,14 +101,15 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
}
/* insert a new ref to head ref rbtree */
-static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
struct rb_node *node)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *parent_node = NULL;
struct btrfs_delayed_ref_head *entry;
struct btrfs_delayed_ref_head *ins;
u64 bytenr;
+ bool leftmost = true;
ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
bytenr = ins->bytenr;
@@ -117,26 +118,29 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
href_node);
- if (bytenr < entry->bytenr)
+ if (bytenr < entry->bytenr) {
p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
+ } else if (bytenr > entry->bytenr) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return entry;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
return NULL;
}
-static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *node = &ins->ref_node;
struct rb_node *parent_node = NULL;
struct btrfs_delayed_ref_node *entry;
+ bool leftmost = true;
while (*p) {
int comp;
@@ -145,29 +149,46 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
ref_node);
comp = comp_refs(ins, entry, true);
- if (comp < 0)
+ if (comp < 0) {
p = &(*p)->rb_left;
- else if (comp > 0)
+ } else if (comp > 0) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return entry;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
return NULL;
}
+static struct btrfs_delayed_ref_head *find_first_ref_head(
+ struct btrfs_delayed_ref_root *dr)
+{
+ struct rb_node *n;
+ struct btrfs_delayed_ref_head *entry;
+
+ n = rb_first_cached(&dr->href_root);
+ if (!n)
+ return NULL;
+
+ entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+
+ return entry;
+}
+
/*
- * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot.
- * If return_bigger is given, the next bigger entry is returned if no exact
- * match is found.
+ * Find a head entry based on bytenr. This returns the delayed ref head if it
+ * was able to find one, or NULL if nothing was in that spot. If return_bigger
+ * is given, the next bigger entry is returned if no exact match is found.
*/
-static struct btrfs_delayed_ref_head *
-find_ref_head(struct rb_root *root, u64 bytenr,
- int return_bigger)
+static struct btrfs_delayed_ref_head *find_ref_head(
+ struct btrfs_delayed_ref_root *dr, u64 bytenr,
+ bool return_bigger)
{
+ struct rb_root *root = &dr->href_root.rb_root;
struct rb_node *n;
struct btrfs_delayed_ref_head *entry;
@@ -187,22 +208,18 @@ find_ref_head(struct rb_root *root, u64 bytenr,
if (bytenr > entry->bytenr) {
n = rb_next(&entry->href_node);
if (!n)
- n = rb_first(root);
+ return NULL;
entry = rb_entry(n, struct btrfs_delayed_ref_head,
href_node);
- return entry;
}
return entry;
}
return NULL;
}
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
- struct btrfs_delayed_ref_root *delayed_refs;
-
- delayed_refs = &trans->transaction->delayed_refs;
lockdep_assert_held(&delayed_refs->lock);
if (mutex_trylock(&head->mutex))
return 0;
@@ -227,7 +244,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref)
{
lockdep_assert_held(&head->lock);
- rb_erase(&ref->ref_node, &head->ref_tree);
+ rb_erase_cached(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -296,7 +313,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
lockdep_assert_held(&head->lock);
- if (RB_EMPTY_ROOT(&head->ref_tree))
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return;
/* We don't have too many refs to merge for data. */
@@ -314,7 +331,8 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
spin_unlock(&fs_info->tree_mod_seq_lock);
again:
- for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&head->ref_tree); node;
+ node = rb_next(node)) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
@@ -345,40 +363,29 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
return ret;
}
-struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans)
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ struct btrfs_delayed_ref_root *delayed_refs)
{
- struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
- u64 start;
- bool loop = false;
-
- delayed_refs = &trans->transaction->delayed_refs;
again:
- start = delayed_refs->run_delayed_start;
- head = find_ref_head(&delayed_refs->href_root, start, 1);
- if (!head && !loop) {
+ head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
+ true);
+ if (!head && delayed_refs->run_delayed_start != 0) {
delayed_refs->run_delayed_start = 0;
- start = 0;
- loop = true;
- head = find_ref_head(&delayed_refs->href_root, start, 1);
- if (!head)
- return NULL;
- } else if (!head && loop) {
- return NULL;
+ head = find_first_ref_head(delayed_refs);
}
+ if (!head)
+ return NULL;
while (head->processing) {
struct rb_node *node;
node = rb_next(&head->href_node);
if (!node) {
- if (loop)
+ if (delayed_refs->run_delayed_start == 0)
return NULL;
delayed_refs->run_delayed_start = 0;
- start = 0;
- loop = true;
goto again;
}
head = rb_entry(node, struct btrfs_delayed_ref_head,
@@ -569,7 +576,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
head_ref->is_system = is_system;
- head_ref->ref_tree = RB_ROOT;
+ head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
RB_CLEAR_NODE(&head_ref->href_node);
head_ref->processing = 0;
@@ -903,7 +910,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
- return find_ref_head(&delayed_refs->href_root, bytenr, 0);
+ return find_ref_head(delayed_refs, bytenr, false);
}
void __cold btrfs_delayed_ref_exit(void)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d9f2a4ebd5db..8e20c5cb5404 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -79,7 +79,7 @@ struct btrfs_delayed_ref_head {
struct mutex mutex;
spinlock_t lock;
- struct rb_root ref_tree;
+ struct rb_root_cached ref_tree;
/* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
struct list_head ref_add_list;
@@ -148,7 +148,7 @@ struct btrfs_delayed_data_ref {
struct btrfs_delayed_ref_root {
/* head ref rbtree */
- struct rb_root href_root;
+ struct rb_root_cached href_root;
/* dirty extent records */
struct rb_root dirty_extent_root;
@@ -255,7 +255,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
u64 bytenr);
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
{
@@ -263,8 +263,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
}
-struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans);
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ struct btrfs_delayed_ref_root *delayed_refs);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index dec01970d8c5..2aa48aecc52b 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -382,14 +382,6 @@ out:
return ret;
}
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
-
- dev_replace->committed_cursor_left =
- dev_replace->cursor_left_last_write_of_item;
-}
-
static char* btrfs_dev_name(struct btrfs_device *device)
{
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
@@ -408,11 +400,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
int ret;
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
+ bool need_unlock;
- ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
- srcdev_name, &src_device);
- if (ret)
- return ret;
+ src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
+ srcdev_name);
+ if (IS_ERR(src_device))
+ return PTR_ERR(src_device);
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
@@ -432,6 +425,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(trans);
}
+ need_unlock = true;
btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -440,6 +434,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ ASSERT(0);
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
goto leave;
}
@@ -470,6 +465,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
btrfs_dev_replace_write_unlock(dev_replace);
+ need_unlock = false;
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
@@ -481,7 +477,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ need_unlock = true;
btrfs_dev_replace_write_lock(dev_replace);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
goto leave;
}
@@ -503,9 +504,8 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return ret;
leave:
- dev_replace->srcdev = NULL;
- dev_replace->tgtdev = NULL;
- btrfs_dev_replace_write_unlock(dev_replace);
+ if (need_unlock)
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_destroy_dev_replace_tgtdev(tgt_device);
return ret;
}
@@ -545,8 +545,8 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- wait_event(fs_info->replace_wait, !percpu_counter_sum(
- &fs_info->bio_counter));
+ wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
+ &fs_info->dev_replace.bio_counter));
}
/*
@@ -555,7 +555,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- wake_up(&fs_info->replace_wait);
+ wake_up(&fs_info->dev_replace.replace_wait);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -961,13 +961,10 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
{
read_lock(&dev_replace->lock);
- atomic_inc(&dev_replace->read_locks);
}
void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
{
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
- atomic_dec(&dev_replace->read_locks);
read_unlock(&dev_replace->lock);
}
@@ -985,7 +982,6 @@ again:
void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace)
{
- ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
write_unlock(&dev_replace->lock);
}
@@ -994,45 +990,31 @@ void btrfs_dev_replace_set_lock_blocking(
struct btrfs_dev_replace *dev_replace)
{
/* only set blocking for read lock */
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
atomic_inc(&dev_replace->blocking_readers);
read_unlock(&dev_replace->lock);
}
-/* acquire read lock and dec blocking cnt */
-void btrfs_dev_replace_clear_lock_blocking(
- struct btrfs_dev_replace *dev_replace)
-{
- /* only set blocking for read lock */
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
- ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
- read_lock(&dev_replace->lock);
- /* Barrier implied by atomic_dec_and_test */
- if (atomic_dec_and_test(&dev_replace->blocking_readers))
- cond_wake_up_nomb(&dev_replace->read_lock_wq);
-}
-
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
- percpu_counter_inc(&fs_info->bio_counter);
+ percpu_counter_inc(&fs_info->dev_replace.bio_counter);
}
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
- percpu_counter_sub(&fs_info->bio_counter, amount);
- cond_wake_up_nomb(&fs_info->replace_wait);
+ percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
+ cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
}
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
while (1) {
- percpu_counter_inc(&fs_info->bio_counter);
+ percpu_counter_inc(&fs_info->dev_replace.bio_counter);
if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state)))
break;
btrfs_bio_counter_dec(fs_info);
- wait_event(fs_info->replace_wait,
+ wait_event(fs_info->dev_replace.replace_wait,
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state));
}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index b6d4206188bb..795c551f5b5e 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -11,7 +11,6 @@ struct btrfs_ioctl_dev_replace_args;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
@@ -28,12 +27,5 @@ void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_clear_lock_blocking(
- struct btrfs_dev_replace *dev_replace);
-
-static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
-{
- atomic64_inc(stat_value);
-}
#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a678b07fcf01..8de74d835dba 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -105,13 +105,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* to use for the second index (if one is created).
* Will return 0 or -ENOMEM
*/
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, const char *name, int name_len,
- struct btrfs_inode *dir, struct btrfs_key *location,
- u8 type, u64 index)
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
+ int name_len, struct btrfs_inode *dir,
+ struct btrfs_key *location, u8 type, u64 index)
{
int ret = 0;
int ret2 = 0;
+ struct btrfs_root *root = dir->root;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct extent_buffer *leaf;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 05dc3c17cb62..b0ab41da91d1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -125,8 +125,8 @@ struct async_submit_bio {
* Different roots are used for different purposes and may nest inside each
* other and they require separate keysets. As lockdep keys should be
* static, assign keysets according to the purpose of the root as indicated
- * by btrfs_root->objectid. This ensures that all special purpose roots
- * have separate keysets.
+ * by btrfs_root->root_key.objectid. This ensures that all special purpose
+ * roots have separate keysets.
*
* Lock-nesting across peer nodes is always done with the immediate parent
* node locked thus preventing deadlock. As lockdep doesn't know this, use
@@ -1148,7 +1148,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->state = 0;
root->orphan_cleanup_state = 0;
- root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
root->nr_delalloc_inodes = 0;
@@ -2156,9 +2155,8 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
rwlock_init(&fs_info->dev_replace.lock);
- atomic_set(&fs_info->dev_replace.read_locks, 0);
atomic_set(&fs_info->dev_replace.blocking_readers, 0);
- init_waitqueue_head(&fs_info->replace_wait);
+ init_waitqueue_head(&fs_info->dev_replace.replace_wait);
init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
}
@@ -2648,7 +2646,8 @@ int open_ctree(struct super_block *sb,
goto fail_dirty_metadata_bytes;
}
- ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
+ GFP_KERNEL);
if (ret) {
err = ret;
goto fail_delalloc_bytes;
@@ -3309,7 +3308,7 @@ fail_iput:
iput(fs_info->btree_inode);
fail_bio_counter:
- percpu_counter_destroy(&fs_info->bio_counter);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
@@ -3977,6 +3976,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
+ ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
btrfs_free_qgroup_config(fs_info);
@@ -4018,7 +4018,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
- percpu_counter_destroy(&fs_info->bio_counter);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
cleanup_srcu_struct(&fs_info->subvol_srcu);
btrfs_free_stripe_hash_table(fs_info);
@@ -4204,7 +4204,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
return ret;
}
- while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+ while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
struct rb_node *n;
bool pin_bytes = false;
@@ -4222,11 +4222,11 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
continue;
}
spin_lock(&head->lock);
- while ((n = rb_first(&head->ref_tree)) != NULL) {
+ while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
ref_node);
ref->in_tree = 0;
- rb_erase(&ref->ref_node, &head->ref_tree);
+ rb_erase_cached(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -4240,7 +4240,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (head->processing == 0)
delayed_refs->num_heads_ready--;
atomic_dec(&delayed_refs->num_entries);
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1f3755b3a37a..ddf28ecf17f9 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -33,7 +33,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
type = FILEID_BTRFS_WITHOUT_PARENT;
fid->objectid = btrfs_ino(BTRFS_I(inode));
- fid->root_objectid = BTRFS_I(inode)->root->objectid;
+ fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid;
fid->gen = inode->i_generation;
if (parent) {
@@ -41,7 +41,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
fid->parent_objectid = BTRFS_I(parent)->location.objectid;
fid->parent_gen = parent->i_generation;
- parent_root_id = BTRFS_I(parent)->root->objectid;
+ parent_root_id = BTRFS_I(parent)->root->root_key.objectid;
if (parent_root_id != fid->root_objectid) {
fid->parent_root_objectid = parent_root_id;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2d9074295d7f..a1febf155747 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2366,6 +2366,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
insert_reserved);
else
BUG();
+ if (ret && insert_reserved)
+ btrfs_pin_extent(trans->fs_info, node->bytenr,
+ node->num_bytes, 1);
return ret;
}
@@ -2374,7 +2377,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_ref_node *ref;
- if (RB_EMPTY_ROOT(&head->ref_tree))
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return NULL;
/*
@@ -2387,7 +2390,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
return list_first_entry(&head->ref_add_list,
struct btrfs_delayed_ref_node, add_list);
- ref = rb_entry(rb_first(&head->ref_tree),
+ ref = rb_entry(rb_first_cached(&head->ref_tree),
struct btrfs_delayed_ref_node, ref_node);
ASSERT(list_empty(&ref->add_list));
return ref;
@@ -2448,13 +2451,13 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&head->lock);
spin_lock(&delayed_refs->lock);
spin_lock(&head->lock);
- if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
return 1;
}
delayed_refs->num_heads--;
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
@@ -2502,102 +2505,66 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * Returns 0 on success or if called with an already aborted transaction.
- * Returns -ENOMEM or -EIO on failure and will abort the transaction.
- */
-static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long nr)
+static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_delayed_ref_head *head = NULL;
+ int ret;
+
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_select_ref_head(delayed_refs);
+ if (!head) {
+ spin_unlock(&delayed_refs->lock);
+ return head;
+ }
+
+ /*
+ * Grab the lock that says we are going to process all the refs for
+ * this head
+ */
+ ret = btrfs_delayed_ref_lock(delayed_refs, head);
+ spin_unlock(&delayed_refs->lock);
+
+ /*
+ * We may have dropped the spin lock to get the head mutex lock, and
+ * that might have given someone else time to free the head. If that's
+ * true, it has been removed from our list and we can move on.
+ */
+ if (ret == -EAGAIN)
+ head = ERR_PTR(-EAGAIN);
+
+ return head;
+}
+
+static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *locked_ref,
+ unsigned long *run_refs)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_ref_head *locked_ref = NULL;
struct btrfs_delayed_extent_op *extent_op;
- ktime_t start = ktime_get();
- int ret;
- unsigned long count = 0;
- unsigned long actual_count = 0;
+ struct btrfs_delayed_ref_node *ref;
int must_insert_reserved = 0;
+ int ret;
delayed_refs = &trans->transaction->delayed_refs;
- while (1) {
- if (!locked_ref) {
- if (count >= nr)
- break;
- spin_lock(&delayed_refs->lock);
- locked_ref = btrfs_select_ref_head(trans);
- if (!locked_ref) {
- spin_unlock(&delayed_refs->lock);
- break;
- }
-
- /* grab the lock that says we are going to process
- * all the refs for this head */
- ret = btrfs_delayed_ref_lock(trans, locked_ref);
- spin_unlock(&delayed_refs->lock);
- /*
- * we may have dropped the spin lock to get the head
- * mutex lock, and that might have given someone else
- * time to free the head. If that's true, it has been
- * removed from our list and we can move on.
- */
- if (ret == -EAGAIN) {
- locked_ref = NULL;
- count++;
- continue;
- }
- }
+ lockdep_assert_held(&locked_ref->mutex);
+ lockdep_assert_held(&locked_ref->lock);
- /*
- * We need to try and merge add/drops of the same ref since we
- * can run into issues with relocate dropping the implicit ref
- * and then it being added back again before the drop can
- * finish. If we merged anything we need to re-loop so we can
- * get a good ref.
- * Or we can get node references of the same type that weren't
- * merged when created due to bumps in the tree mod seq, and
- * we need to merge them to prevent adding an inline extent
- * backref before dropping it (triggering a BUG_ON at
- * insert_inline_extent_backref()).
- */
- spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
-
- ref = select_delayed_ref(locked_ref);
-
- if (ref && ref->seq &&
+ while ((ref = select_delayed_ref(locked_ref))) {
+ if (ref->seq &&
btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
unselect_delayed_ref_head(delayed_refs, locked_ref);
- locked_ref = NULL;
- cond_resched();
- count++;
- continue;
- }
-
- /*
- * We're done processing refs in this ref_head, clean everything
- * up and move on to the next ref_head.
- */
- if (!ref) {
- ret = cleanup_ref_head(trans, locked_ref);
- if (ret > 0 ) {
- /* We dropped our lock, we need to loop. */
- ret = 0;
- continue;
- } else if (ret) {
- return ret;
- }
- locked_ref = NULL;
- count++;
- continue;
+ return -EAGAIN;
}
- actual_count++;
+ (*run_refs)++;
ref->in_tree = 0;
- rb_erase(&ref->ref_node, &locked_ref->ref_tree);
+ rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -2619,8 +2586,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
atomic_dec(&delayed_refs->num_entries);
/*
- * Record the must-insert_reserved flag before we drop the spin
- * lock.
+ * Record the must_insert_reserved flag before we drop the
+ * spin lock.
*/
must_insert_reserved = locked_ref->must_insert_reserved;
locked_ref->must_insert_reserved = 0;
@@ -2642,10 +2609,90 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
btrfs_put_delayed_ref(ref);
- count++;
cond_resched();
+
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
}
+ return 0;
+}
+
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ unsigned long nr)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
+ ktime_t start = ktime_get();
+ int ret;
+ unsigned long count = 0;
+ unsigned long actual_count = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ do {
+ if (!locked_ref) {
+ locked_ref = btrfs_obtain_ref_head(trans);
+ if (IS_ERR_OR_NULL(locked_ref)) {
+ if (PTR_ERR(locked_ref) == -EAGAIN) {
+ continue;
+ } else {
+ break;
+ }
+ }
+ count++;
+ }
+ /*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ * Or we can get node references of the same type that weren't
+ * merged when created due to bumps in the tree mod seq, and
+ * we need to merge them to prevent adding an inline extent
+ * backref before dropping it (triggering a BUG_ON at
+ * insert_inline_extent_backref()).
+ */
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
+
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
+ &actual_count);
+ if (ret < 0 && ret != -EAGAIN) {
+ /*
+ * Error, btrfs_run_delayed_refs_for_head already
+ * unlocked everything so just bail out
+ */
+ return ret;
+ } else if (!ret) {
+ /*
+ * Success, perform the usual cleanup of a processed
+ * head
+ */
+ ret = cleanup_ref_head(trans, locked_ref);
+ if (ret > 0 ) {
+ /* We dropped our lock, we need to loop. */
+ ret = 0;
+ continue;
+ } else if (ret) {
+ return ret;
+ }
+ }
+
+ /*
+ * Either success case or btrfs_run_delayed_refs_for_head
+ * returned -EAGAIN, meaning we need to select another head
+ */
+
+ locked_ref = NULL;
+ cond_resched();
+ } while ((nr != -1 && count < nr) || locked_ref);
+
/*
* We don't want to include ref heads since we can have empty ref heads
* and those will drastically skew our runtime down since we just do
@@ -2745,9 +2792,9 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
return num_csums;
}
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_rsv *global_rsv;
u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
@@ -2782,8 +2829,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
{
u64 num_entries =
atomic_read(&trans->transaction->delayed_refs.num_entries);
@@ -2791,14 +2837,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
u64 val;
smp_mb();
- avg_runtime = fs_info->avg_delayed_ref_runtime;
+ avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
val = num_entries * avg_runtime;
if (val >= NSEC_PER_SEC)
return 1;
if (val >= NSEC_PER_SEC / 2)
return 2;
- return btrfs_check_space_for_delayed_refs(trans, fs_info);
+ return btrfs_check_space_for_delayed_refs(trans);
}
struct async_delayed_refs {
@@ -2911,7 +2957,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2928,7 +2973,6 @@ again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- trans->can_flush_pending_bgs = false;
ret = __btrfs_run_delayed_refs(trans, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -2940,7 +2984,7 @@ again:
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
- node = rb_first(&delayed_refs->href_root);
+ node = rb_first_cached(&delayed_refs->href_root);
if (!node) {
spin_unlock(&delayed_refs->lock);
goto out;
@@ -2959,7 +3003,6 @@ again:
goto again;
}
out:
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -3040,7 +3083,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
* XXX: We should replace this with a proper search function in the
* future.
*/
- for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&head->ref_tree); node;
+ node = rb_next(node)) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -3139,7 +3183,6 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
{
struct btrfs_path *path;
int ret;
- int ret2;
path = btrfs_alloc_path();
if (!path)
@@ -3151,17 +3194,9 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
if (ret && ret != -ENOENT)
goto out;
- ret2 = check_delayed_ref(root, path, objectid,
- offset, bytenr);
- } while (ret2 == -EAGAIN);
-
- if (ret2 && ret2 != -ENOENT) {
- ret = ret2;
- goto out;
- }
+ ret = check_delayed_ref(root, path, objectid, offset, bytenr);
+ } while (ret == -EAGAIN);
- if (ret != -ENOENT || ret2 != -ENOENT)
- ret = 0;
out:
btrfs_free_path(path);
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
@@ -4533,6 +4568,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
goto out;
} else {
ret = 1;
+ space_info->max_extent_size = 0;
}
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
@@ -4554,11 +4590,9 @@ out:
* the block groups that were made dirty during the lifetime of the
* transaction.
*/
- if (trans->can_flush_pending_bgs &&
- trans->chunk_bytes_reserved >= (u64)SZ_2M) {
+ if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
btrfs_create_pending_block_groups(trans);
- btrfs_trans_release_chunk_metadata(trans);
- }
+
return ret;
}
@@ -5284,7 +5318,7 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
}
static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, int update_size)
+ u64 num_bytes, bool update_size)
{
spin_lock(&block_rsv->lock);
block_rsv->reserved += num_bytes;
@@ -5316,7 +5350,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
global_rsv->full = 0;
spin_unlock(&global_rsv->lock);
- block_rsv_add_bytes(dest, num_bytes, 1);
+ block_rsv_add_bytes(dest, num_bytes, true);
return 0;
}
@@ -5479,7 +5513,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
struct btrfs_block_rsv *dst, u64 num_bytes,
- int update_size)
+ bool update_size)
{
int ret;
@@ -5539,10 +5573,8 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
return 0;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
- if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 1);
- return 0;
- }
+ if (!ret)
+ block_rsv_add_bytes(block_rsv, num_bytes, true);
return ret;
}
@@ -5587,7 +5619,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
}
@@ -5629,7 +5661,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
return ret;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ block_rsv_add_bytes(block_rsv, num_bytes, false);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), num_bytes, 1);
@@ -5835,7 +5867,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
@@ -6399,10 +6431,6 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
} else {
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
-
- trace_btrfs_space_reservation(cache->fs_info,
- "space_info", space_info->flags,
- ram_bytes, 0);
space_info->bytes_may_use -= ram_bytes;
if (delalloc)
cache->delalloc_bytes += num_bytes;
@@ -6424,11 +6452,10 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
* reserve set to 0 in order to clear the reservation.
*/
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int delalloc)
+static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
- int ret = 0;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
@@ -6436,12 +6463,12 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+ space_info->max_extent_size = 0;
if (delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
- return ret;
}
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
{
@@ -6925,7 +6952,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
goto out_delayed_unlock;
spin_lock(&head->lock);
- if (!RB_EMPTY_ROOT(&head->ref_tree))
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
goto out;
if (head->extent_op) {
@@ -6946,7 +6973,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
* at this point we have a head with no other entries. Go
* ahead and process it.
*/
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
atomic_dec(&delayed_refs->num_entries);
@@ -7233,6 +7260,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
u64 max_extent_size = 0;
+ u64 max_free_space = 0;
u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
@@ -7528,8 +7556,8 @@ unclustered_alloc:
spin_lock(&ctl->tree_lock);
if (ctl->free_space <
num_bytes + empty_cluster + empty_size) {
- if (ctl->free_space > max_extent_size)
- max_extent_size = ctl->free_space;
+ max_free_space = max(max_free_space,
+ ctl->free_space);
spin_unlock(&ctl->tree_lock);
goto loop;
}
@@ -7696,6 +7724,8 @@ loop:
}
out:
if (ret == -ENOSPC) {
+ if (!max_extent_size)
+ max_extent_size = max_free_space;
spin_lock(&space_info->lock);
space_info->max_extent_size = max_extent_size;
spin_unlock(&space_info->lock);
@@ -7977,21 +8007,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
path = btrfs_alloc_path();
- if (!path) {
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
+ if (!path)
return -ENOMEM;
- }
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
&extent_key, size);
if (ret) {
btrfs_free_path(path);
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
return ret;
}
@@ -8119,6 +8142,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
+ /*
+ * Extra safety check in case the extent tree is corrupted and extent
+ * allocator chooses to use a tree block which is already used and
+ * locked.
+ */
+ if (buf->lock_owner == current->pid) {
+ btrfs_err_rl(fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+ buf->start, btrfs_header_owner(buf), current->pid);
+ free_extent_buffer(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
+
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
clean_tree_block(fs_info, buf);
@@ -8215,7 +8251,7 @@ try_reserve:
static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
- block_rsv_add_bytes(block_rsv, blocksize, 0);
+ block_rsv_add_bytes(block_rsv, blocksize, false);
block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
}
@@ -8642,7 +8678,13 @@ skip:
parent = 0;
}
- if (need_account) {
+ /*
+ * Reloc tree doesn't contribute to qgroup numbers, and we have
+ * already accounted them at merge time (replace_path),
+ * thus we could skip expensive subtree trace here.
+ */
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ need_account) {
ret = btrfs_qgroup_trace_subtree(trans, next,
generation, level - 1);
if (ret) {
@@ -8763,15 +8805,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(eb));
+ else if (root->root_key.objectid != btrfs_header_owner(eb))
+ goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(path->nodes[level + 1]));
+ else if (root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level + 1]))
+ goto owner_mismatch;
}
btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
@@ -8779,6 +8820,11 @@ out:
wc->refs[level] = 0;
wc->flags[level] = 0;
return 0;
+
+owner_mismatch:
+ btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
+ btrfs_header_owner(eb), root->root_key.objectid);
+ return -EUCLEAN;
}
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -8832,6 +8878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
ret = walk_up_proc(trans, root, path, wc);
if (ret > 0)
return 0;
+ if (ret < 0)
+ return ret;
if (path->locks[level]) {
btrfs_tree_unlock_rw(path->nodes[level],
@@ -8875,7 +8923,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int level;
bool root_dropped = false;
- btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
+ btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
path = btrfs_alloc_path();
if (!path) {
@@ -9613,6 +9661,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
block_group = btrfs_lookup_first_block_group(info, last);
while (block_group) {
+ wait_block_group_cache_done(block_group);
spin_lock(&block_group->lock);
if (block_group->iref)
break;
@@ -10074,15 +10123,19 @@ error:
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group_cache *block_group;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_block_group_item item;
struct btrfs_key key;
int ret = 0;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
- trans->can_flush_pending_bgs = false;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ if (!trans->can_flush_pending_bgs)
+ return;
+
+ while (!list_empty(&trans->new_bgs)) {
+ block_group = list_first_entry(&trans->new_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
if (ret)
goto next;
@@ -10103,7 +10156,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
next:
list_del_init(&block_group->bg_list);
}
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
+ btrfs_trans_release_chunk_metadata(trans);
}
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
@@ -10753,14 +10806,16 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
* We don't want a transaction for this since the discard may take a
* substantial amount of time. We don't require that a transaction be
* running, but we do need to take a running transaction into account
- * to ensure that we're not discarding chunks that were released in
- * the current transaction.
+ * to ensure that we're not discarding chunks that were released or
+ * allocated in the current transaction.
*
* Holding the chunks lock will prevent other threads from allocating
* or releasing chunks, but it won't prevent a running transaction
* from committing and releasing the memory that the pending chunks
* list head uses. For that, we need to take a reference to the
- * transaction.
+ * transaction and hold the commit root sem. We only need to hold
+ * it while performing the free space search since we have already
+ * held back allocations.
*/
static int btrfs_trim_free_extents(struct btrfs_device *device,
u64 minlen, u64 *trimmed)
@@ -10770,6 +10825,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
*trimmed = 0;
+ /* Discard not supported = nothing to do. */
+ if (!blk_queue_discard(bdev_get_queue(device->bdev)))
+ return 0;
+
/* Not writeable = nothing to do. */
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return 0;
@@ -10787,9 +10846,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
if (ret)
- return ret;
+ break;
- down_read(&fs_info->commit_root_sem);
+ ret = down_read_killable(&fs_info->commit_root_sem);
+ if (ret) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ break;
+ }
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
@@ -10797,13 +10860,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
+ if (!trans)
+ up_read(&fs_info->commit_root_sem);
+
ret = find_free_dev_extent_start(trans, device, minlen, start,
&start, &len);
- if (trans)
+ if (trans) {
+ up_read(&fs_info->commit_root_sem);
btrfs_put_transaction(trans);
+ }
if (ret) {
- up_read(&fs_info->commit_root_sem);
mutex_unlock(&fs_info->chunk_mutex);
if (ret == -ENOSPC)
ret = 0;
@@ -10811,7 +10878,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
}
ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
- up_read(&fs_info->commit_root_sem);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
@@ -10831,6 +10897,15 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
return ret;
}
+/*
+ * Trim the whole filesystem by:
+ * 1) trimming the free space in each block group
+ * 2) trimming the unallocated space on each device
+ *
+ * This will also continue trimming even if a block group or device encounters
+ * an error. The return value will be the last error, or 0 if nothing bad
+ * happens.
+ */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
struct btrfs_block_group_cache *cache = NULL;
@@ -10840,18 +10915,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
u64 start;
u64 end;
u64 trimmed = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+ u64 bg_failed = 0;
+ u64 dev_failed = 0;
+ int bg_ret = 0;
+ int dev_ret = 0;
int ret = 0;
- /*
- * try to trim all FS space, our block group may start from non-zero.
- */
- if (range->len == total_bytes)
- cache = btrfs_lookup_first_block_group(fs_info, range->start);
- else
- cache = btrfs_lookup_block_group(fs_info, range->start);
-
- while (cache) {
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
+ for (; cache; cache = next_block_group(fs_info, cache)) {
if (cache->key.objectid >= (range->start + range->len)) {
btrfs_put_block_group(cache);
break;
@@ -10865,13 +10936,15 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (!block_group_cache_done(cache)) {
ret = cache_block_group(cache, 0);
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
ret = wait_block_group_cache_done(cache);
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
}
ret = btrfs_trim_block_group(cache,
@@ -10882,28 +10955,40 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
trimmed += group_trimmed;
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
}
-
- cache = next_block_group(fs_info, cache);
}
+ if (bg_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu block group(s), last error %d",
+ bg_failed, bg_ret);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- devices = &fs_info->fs_devices->alloc_list;
- list_for_each_entry(device, devices, dev_alloc_list) {
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
ret = btrfs_trim_free_extents(device, range->minlen,
&group_trimmed);
- if (ret)
+ if (ret) {
+ dev_failed++;
+ dev_ret = ret;
break;
+ }
trimmed += group_trimmed;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ if (dev_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu device(s), last error %d",
+ dev_failed, dev_ret);
range->len = trimmed;
- return ret;
+ if (bg_ret)
+ return bg_ret;
+ return dev_ret;
}
/*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dd6faab02bb..d228f706ff3e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1424,20 +1424,15 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state)
{
struct extent_state *state;
- struct rb_node *n;
int ret = 1;
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
if (state->end == start - 1 && extent_state_in_tree(state)) {
- n = rb_next(&state->rb_node);
- while (n) {
- state = rb_entry(n, struct extent_state,
- rb_node);
+ while ((state = next_state(state)) != NULL) {
if (state->state & bits)
goto got_it;
- n = rb_next(n);
}
free_extent_state(*cached_state);
*cached_state = NULL;
@@ -1568,7 +1563,7 @@ static noinline int lock_delalloc_pages(struct inode *inode,
*
* 1 is returned if we find something, 0 if nothing was in the tree
*/
-STATIC u64 find_lock_delalloc_range(struct inode *inode,
+static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes)
@@ -1648,6 +1643,17 @@ out_failed:
return found;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+u64 btrfs_find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end, u64 max_bytes)
+{
+ return find_lock_delalloc_range(inode, tree, locked_page, start, end,
+ max_bytes);
+}
+#endif
+
static int __process_pages_contig(struct address_space *mapping,
struct page *locked_page,
pgoff_t start_index, pgoff_t end_index,
@@ -3778,7 +3784,7 @@ int btree_write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
@@ -3903,7 +3909,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int range_whole = 0;
int scanned = 0;
- int tag;
+ xa_mark_t tag;
/*
* We have to hold onto the inode so that ordered extents can do their
@@ -5153,11 +5159,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
clear_page_dirty_for_io(page);
xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page)) {
- radix_tree_tag_clear(&page->mapping->i_pages,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- }
+ if (!PageDirty(page))
+ __xa_clear_mark(&page->mapping->i_pages,
+ page_index(page), PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&page->mapping->i_pages);
ClearPageError(page);
unlock_page(page);
@@ -5165,11 +5169,11 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
}
-int set_extent_buffer_dirty(struct extent_buffer *eb)
+bool set_extent_buffer_dirty(struct extent_buffer *eb)
{
int i;
int num_pages;
- int was_dirty = 0;
+ bool was_dirty;
check_buffer_tree_ref(eb);
@@ -5179,8 +5183,15 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+ if (!was_dirty)
+ for (i = 0; i < num_pages; i++)
+ set_page_dirty(eb->pages[i]);
+
+#ifdef CONFIG_BTRFS_DEBUG
for (i = 0; i < num_pages; i++)
- set_page_dirty(eb->pages[i]);
+ ASSERT(PageDirty(eb->pages[i]));
+#endif
+
return was_dirty;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b4d03e677e1d..369daa5d4f73 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -479,7 +479,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
void clear_extent_buffer_dirty(struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_buffer *eb);
+bool set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
@@ -546,7 +546,7 @@ int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-noinline u64 find_lock_delalloc_range(struct inode *inode,
+u64 btrfs_find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6648d55e5339..7eea8b6e2cd3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -34,7 +34,7 @@ void __cold extent_map_exit(void)
*/
void extent_map_tree_init(struct extent_map_tree *tree)
{
- tree->map = RB_ROOT;
+ tree->map = RB_ROOT_CACHED;
INIT_LIST_HEAD(&tree->modified_extents);
rwlock_init(&tree->lock);
}
@@ -90,24 +90,27 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
-static int tree_insert(struct rb_root *root, struct extent_map *em)
+static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *parent = NULL;
struct extent_map *entry = NULL;
struct rb_node *orig_parent = NULL;
u64 end = range_end(em->start, em->len);
+ bool leftmost = true;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- if (em->start < entry->start)
+ if (em->start < entry->start) {
p = &(*p)->rb_left;
- else if (em->start >= extent_map_end(entry))
+ } else if (em->start >= extent_map_end(entry)) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return -EEXIST;
+ }
}
orig_parent = parent;
@@ -130,7 +133,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
return -EEXIST;
rb_link_node(&em->rb_node, orig_parent, p);
- rb_insert_color(&em->rb_node, root);
+ rb_insert_color_cached(&em->rb_node, root, leftmost);
return 0;
}
@@ -242,7 +245,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
- rb_erase(&merge->rb_node, &tree->map);
+ rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
}
@@ -254,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
if (rb && mergable_maps(em, merge)) {
em->len += merge->len;
em->block_len += merge->block_len;
- rb_erase(&merge->rb_node, &tree->map);
+ rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
@@ -367,7 +370,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
struct rb_node *next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->map, start, &prev, &next);
+ rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
if (!rb_node) {
if (prev)
rb_node = prev;
@@ -428,16 +431,13 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
* Removes @em from @tree. No reference counts are dropped, and no checks
* are done to see if the range is in use
*/
-int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
- int ret = 0;
-
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- rb_erase(&em->rb_node, &tree->map);
+ rb_erase_cached(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
list_del_init(&em->list);
RB_CLEAR_NODE(&em->rb_node);
- return ret;
}
void replace_extent_mapping(struct extent_map_tree *tree,
@@ -449,7 +449,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
ASSERT(extent_map_in_tree(cur));
if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
list_del_init(&cur->list);
- rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+ rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
RB_CLEAR_NODE(&cur->rb_node);
setup_extent_mapping(tree, new, modified);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 25d985e7532a..31977ffd6190 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -49,7 +49,7 @@ struct extent_map {
};
struct extent_map_tree {
- struct rb_root map;
+ struct rb_root_cached map;
struct list_head modified_extents;
rwlock_t lock;
};
@@ -78,7 +78,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified);
-int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
void replace_extent_mapping(struct extent_map_tree *tree,
struct extent_map *cur,
struct extent_map *new,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2be00e873e92..97c7a086f7bd 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -531,6 +531,14 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
end_of_last_block = start_pos + num_bytes - 1;
+ /*
+ * The pages may have already been dirty, clear out old accounting so
+ * we can set things up properly
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
+
if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
if (start_pos >= isize &&
!(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
@@ -1500,18 +1508,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
if (ordered)
btrfs_put_ordered_extent(ordered);
- clear_extent_bit(&inode->io_tree, start_pos, last_pos,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached_state);
+
*lockstart = start_pos;
*lockend = last_pos;
ret = 1;
}
+ /*
+ * It's possible the pages are dirty right now, but we don't want
+ * to clean them yet because copy_from_user may catch a page fault
+ * and we might have to fall back to one page at a time. If that
+ * happens, we'll unlock these pages and we'd have a window where
+ * reclaim could sneak in and drop the once-dirty page on the floor
+ * without writing it.
+ *
+ * We have the pages locked and the extent range locked, so there's
+ * no way someone can start IO on any dirty pages in this range.
+ *
+ * We'll call btrfs_dirty_pages() later on, and that will flip around
+ * delalloc bits and dirty the pages as required.
+ */
for (i = 0; i < num_pages; i++) {
- if (clear_page_dirty_for_io(pages[i]))
- account_page_redirty(pages[i]);
set_page_extent_mapped(pages[i]);
WARN_ON(!PageLocked(pages[i]));
}
@@ -2061,6 +2078,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
inode_lock(inode);
+
+ /*
+ * We take the dio_sem here because the tree log stuff can race with
+ * lockless dio writes and get an extent map logged for an extent we
+ * never waited on. We need it this high up for lockdep reasons.
+ */
+ down_write(&BTRFS_I(inode)->dio_sem);
+
atomic_inc(&root->log_batch);
/*
@@ -2069,6 +2094,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2092,6 +2118,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2110,6 +2137,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2131,6 +2159,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
/*
@@ -2544,7 +2573,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
- min_size, 0);
+ min_size, false);
BUG_ON(ret);
trans->block_rsv = rsv;
@@ -2594,7 +2623,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, 0);
+ rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0adf38b00fa0..4ba0aedc878b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -10,6 +10,7 @@
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
@@ -47,6 +48,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
struct inode *inode = NULL;
+ unsigned nofs_flag;
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -68,7 +70,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
btrfs_disk_key_to_cpu(&location, &disk_key);
btrfs_release_path(path);
+ /*
+ * We are often under a trans handle at this point, so we need to make
+ * sure NOFS is set to keep us from deadlocking.
+ */
+ nofs_flag = memalloc_nofs_save();
inode = btrfs_iget(fs_info->sb, &location, root, NULL);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
return inode;
@@ -1679,6 +1687,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
+ if (info->max_extent_size > ctl->unit)
+ info->max_extent_size = 0;
}
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
@@ -1762,6 +1772,13 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
return -1;
}
+static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
+{
+ if (entry->bitmap)
+ return entry->max_extent_size;
+ return entry->bytes;
+}
+
/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
@@ -1783,8 +1800,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
for (node = &entry->offset_index; node; node = rb_next(node)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bytes < *bytes) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1802,8 +1819,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
}
if (entry->bytes < *bytes + align_off) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1815,8 +1832,10 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
*offset = tmp;
*bytes = size;
return entry;
- } else if (size > *max_extent_size) {
- *max_extent_size = size;
+ } else {
+ *max_extent_size =
+ max(get_max_extent_size(entry),
+ *max_extent_size);
}
continue;
}
@@ -2110,8 +2129,7 @@ new_bitmap:
out:
if (info) {
- if (info->bitmap)
- kfree(info->bitmap);
+ kfree(info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, info);
}
@@ -2440,6 +2458,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
struct rb_node *n;
int count = 0;
+ spin_lock(&ctl->tree_lock);
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes && !block_group->ro)
@@ -2448,6 +2467,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info->offset, info->bytes,
(info->bitmap) ? "yes" : "no");
}
+ spin_unlock(&ctl->tree_lock);
btrfs_info(fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
btrfs_info(fs_info,
@@ -2676,8 +2696,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
- if (search_bytes > *max_extent_size)
- *max_extent_size = search_bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
return 0;
}
@@ -2714,8 +2734,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
while (1) {
- if (entry->bytes < bytes && entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ if (entry->bytes < bytes)
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
if (entry->bytes < bytes ||
(!entry->bitmap && entry->offset < min_start)) {
@@ -3601,8 +3622,7 @@ again:
if (info)
kmem_cache_free(btrfs_free_space_cachep, info);
- if (map)
- kfree(map);
+ kfree(map);
return 0;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ea5339603cf..d3df5b52278c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -64,7 +64,6 @@ static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
-static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;
@@ -503,6 +502,7 @@ again:
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
+ nr_pages = 0;
goto cont;
}
@@ -2750,12 +2750,9 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
struct btrfs_path *path;
struct sa_defrag_extent_backref *backref;
struct sa_defrag_extent_backref *prev = NULL;
- struct inode *inode;
struct rb_node *node;
int ret;
- inode = new->inode;
-
path = btrfs_alloc_path();
if (!path)
return;
@@ -2944,6 +2941,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
bool truncated = false;
bool range_locked = false;
bool clear_new_delalloc_bytes = false;
+ bool clear_reserved_extent = true;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
@@ -3047,10 +3045,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
- if (!ret)
+ if (!ret) {
+ clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
ordered_extent->start,
ordered_extent->disk_len);
+ }
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
@@ -3111,8 +3111,13 @@ out:
* wrong we need to return the space for this ordered extent
* back to the allocator. We only free the extent in the
* truncated case if we didn't write out the extent at all.
+ *
+ * If we made it past insert_reserved_file_extent before we
+ * errored out then we don't need to do this as the accounting
+ * has already been done.
*/
if ((ret || !logical_len) &&
+ clear_reserved_extent &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(fs_info,
@@ -3471,8 +3476,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* this will do delete_inode and everything for us */
iput(inode);
- if (ret)
- goto out;
}
/* release the path since we're done with it */
btrfs_release_path(path);
@@ -3738,7 +3741,7 @@ cache_acl:
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->a_ops = &btrfs_aops;
break;
default:
inode->i_op = &btrfs_special_inode_operations;
@@ -3910,12 +3913,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto err;
- }
- if (!di) {
- ret = -ENOENT;
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto err;
}
leaf = path->nodes[0];
@@ -4075,10 +4074,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
if (IS_ERR_OR_NULL(di)) {
- if (!di)
- ret = -ENOENT;
- else
- ret = PTR_ERR(di);
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
@@ -4270,18 +4266,17 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
* again is not run concurrently.
*/
spin_lock(&dest->root_item_lock);
- root_flags = btrfs_root_flags(&dest->root_item);
- if (dest->send_in_progress == 0) {
- btrfs_set_root_flags(&dest->root_item,
- root_flags | BTRFS_ROOT_SUBVOL_DEAD);
- spin_unlock(&dest->root_item_lock);
- } else {
+ if (dest->send_in_progress) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
dest->root_key.objectid);
return -EPERM;
}
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
down_write(&fs_info->subvol_sem);
@@ -4727,7 +4722,7 @@ delete:
btrfs_abort_transaction(trans, ret);
break;
}
- if (btrfs_should_throttle_delayed_refs(trans, fs_info))
+ if (btrfs_should_throttle_delayed_refs(trans))
btrfs_async_run_delayed_refs(fs_info,
trans->delayed_ref_updates * 2,
trans->transid, 0);
@@ -4736,8 +4731,7 @@ delete:
extent_num_bytes)) {
should_end = true;
}
- if (btrfs_should_throttle_delayed_refs(trans,
- fs_info))
+ if (btrfs_should_throttle_delayed_refs(trans))
should_throttle = true;
}
}
@@ -5235,10 +5229,10 @@ static void evict_inode_truncate_pages(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
write_lock(&map_tree->lock);
- while (!RB_EMPTY_ROOT(&map_tree->map)) {
+ while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
struct extent_map *em;
- node = rb_first(&map_tree->map);
+ node = rb_first_cached(&map_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -5274,11 +5268,13 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct extent_state *cached_state = NULL;
u64 start;
u64 end;
+ unsigned state_flags;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
start = state->start;
end = state->end;
+ state_flags = state->state;
spin_unlock(&io_tree->lock);
lock_extent_bits(io_tree, start, end, &cached_state);
@@ -5291,7 +5287,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
- if (state->state & EXTENT_DELALLOC)
+ if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
clear_extent_bit(io_tree, start, end,
@@ -5306,8 +5302,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
}
static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv,
- u64 min_size)
+ struct btrfs_block_rsv *rsv)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -5317,7 +5312,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
struct btrfs_trans_handle *trans;
int ret;
- ret = btrfs_block_rsv_refill(root, rsv, min_size,
+ ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret && ++failures > 2) {
@@ -5334,8 +5329,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
* Try to steal from the global reserve if there is space for
* it.
*/
- if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
- !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
+ if (!btrfs_check_space_for_delayed_refs(trans) &&
+ !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false))
return trans;
/* If not, commit and try again. */
@@ -5351,7 +5346,6 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
- u64 min_size;
int ret;
trace_btrfs_inode_evict(inode);
@@ -5361,8 +5355,6 @@ void btrfs_evict_inode(struct inode *inode)
return;
}
- min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
-
evict_inode_truncate_pages(inode);
if (inode->i_nlink &&
@@ -5373,9 +5365,6 @@ void btrfs_evict_inode(struct inode *inode)
if (is_bad_inode(inode))
goto no_delete;
- /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
- if (!special_file(inode->i_mode))
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
@@ -5395,13 +5384,13 @@ void btrfs_evict_inode(struct inode *inode)
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
goto no_delete;
- rsv->size = min_size;
+ rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1);
rsv->failfast = 1;
btrfs_i_size_write(BTRFS_I(inode), 0);
while (1) {
- trans = evict_refill_and_join(root, rsv, min_size);
+ trans = evict_refill_and_join(root, rsv);
if (IS_ERR(trans))
goto free_rsv;
@@ -5426,7 +5415,7 @@ void btrfs_evict_inode(struct inode *inode)
* If it turns out that we are dropping too many of these, we might want
* to add a mechanism for retrying these after a commit.
*/
- trans = evict_refill_and_join(root, rsv, min_size);
+ trans = evict_refill_and_join(root, rsv);
if (!IS_ERR(trans)) {
trans->block_rsv = rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
@@ -5471,12 +5460,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
name, namelen, 0);
- if (!di) {
- ret = -ENOENT;
- goto out;
- }
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
@@ -5790,16 +5775,10 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
- struct inode *inode;
-
- inode = btrfs_lookup_dentry(dir, dentry);
- if (IS_ERR(inode)) {
- if (PTR_ERR(inode) == -ENOENT)
- inode = NULL;
- else
- return ERR_CAST(inode);
- }
+ struct inode *inode = btrfs_lookup_dentry(dir, dentry);
+ if (inode == ERR_PTR(-ENOENT))
+ inode = NULL;
return d_splice_alias(inode, dentry);
}
@@ -6390,8 +6369,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = btrfs_insert_dir_item(trans, root, name, name_len,
- parent_inode, &key,
+ ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
btrfs_inode_type(&inode->vfs_inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
@@ -6584,7 +6562,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
- if (root->objectid != BTRFS_I(inode)->root->objectid)
+ if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
return -EXDEV;
if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6777,9 +6755,9 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* This also copies inline extents directly into the page.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset, u64 start, u64 len,
- int create)
+ struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
@@ -6823,19 +6801,21 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
em->len = (u64)-1;
em->block_len = (u64)-1;
+ path = btrfs_alloc_path();
if (!path) {
- path = btrfs_alloc_path();
- if (!path) {
- err = -ENOMEM;
- goto out;
- }
- /*
- * Chances are we'll be called again, so go ahead and do
- * readahead
- */
- path->reada = READA_FORWARD;
+ err = -ENOMEM;
+ goto out;
}
+ /* Chances are we'll be called again, so go ahead and do readahead */
+ path->reada = READA_FORWARD;
+
+ /*
+ * Unless we're going to uncompress the inline extent, no sleep would
+ * happen.
+ */
+ path->leave_spinning = 1;
+
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
err = ret;
@@ -6938,6 +6918,8 @@ next:
em->orig_block_len = em->len;
em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+
+ btrfs_set_path_blocking(path);
if (!PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
@@ -6985,10 +6967,10 @@ insert:
err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
+ btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
- btrfs_free_path(path);
if (err) {
free_extent_map(em);
return ERR_PTR(err);
@@ -9021,7 +9003,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
/* Migrate the slack space for the truncate to our reserve */
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
- min_size, 0);
+ min_size, false);
BUG_ON(ret);
/*
@@ -9058,7 +9040,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
btrfs_block_rsv_release(fs_info, rsv, -1);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, 0);
+ rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
}
@@ -10191,7 +10173,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &btrfs_symlink_inode_operations;
inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->a_ops = &btrfs_aops;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(BTRFS_I(inode), name_len);
err = btrfs_update_inode(trans, root, inode);
@@ -10567,13 +10549,6 @@ static const struct address_space_operations btrfs_aops = {
.error_remove_page = generic_error_remove_page,
};
-static const struct address_space_operations btrfs_symlink_aops = {
- .readpage = btrfs_readpage,
- .writepage = btrfs_writepage,
- .invalidatepage = btrfs_invalidatepage,
- .releasepage = btrfs_releasepage,
-};
-
static const struct inode_operations btrfs_file_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d60b6caf09e8..a990a9045139 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -491,7 +491,6 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -515,11 +514,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return -EOPNOTSUPP;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
- if (range.start > total_bytes ||
- range.len < fs_info->sb->s_blocksize)
+
+ /*
+ * NOTE: Don't truncate the range using super->total_bytes. Bytenr of
+ * block group is in the logical address space, which can be any
+ * sectorsize aligned bytenr in the range [0, U64_MAX].
+ */
+ if (range.len < fs_info->sb->s_blocksize)
return -EINVAL;
- range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info, &range);
if (ret < 0)
@@ -686,8 +689,7 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
- ret = btrfs_insert_dir_item(trans, root,
- name, namelen, BTRFS_I(dir), &key,
+ ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
BTRFS_FT_DIR, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1324,7 +1326,7 @@ again:
if (i_done != page_cnt) {
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
@@ -4393,7 +4395,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
- if (!is_fstree(new_root->objectid)) {
+ if (!is_fstree(new_root->root_key.objectid)) {
ret = -ENOENT;
goto out;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d4917c0cddf5..45868fd76209 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1416,13 +1416,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
if (!qgroup) {
ret = -ENOENT;
goto out;
- } else {
- /* check if there are no children of this qgroup */
- if (!list_empty(&qgroup->members)) {
- ret = -EBUSY;
- goto out;
- }
}
+
+ /* Check if there are no children of this qgroup */
+ if (!list_empty(&qgroup->members)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
ret = del_qgroup_item(trans, qgroupid);
if (ret && ret != -ENOENT)
goto out;
@@ -1712,6 +1713,416 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
return 0;
}
+/*
+ * Helper function to trace a subtree tree block swap.
+ *
+ * The swap will happen in highest tree block, but there may be a lot of
+ * tree blocks involved.
+ *
+ * For example:
+ * OO = Old tree blocks
+ * NN = New tree blocks allocated during balance
+ *
+ * File tree (257) Reloc tree for 257
+ * L2 OO NN
+ * / \ / \
+ * L1 OO OO (a) OO NN (a)
+ * / \ / \ / \ / \
+ * L0 OO OO OO OO OO OO NN NN
+ * (b) (c) (b) (c)
+ *
+ * When calling qgroup_trace_extent_swap(), we will pass:
+ * @src_eb = OO(a)
+ * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
+ * @dst_level = 0
+ * @root_level = 1
+ *
+ * In that case, qgroup_trace_extent_swap() will search from OO(a) to
+ * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
+ *
+ * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
+ *
+ * 1) Tree search from @src_eb
+ * It should acts as a simplified btrfs_search_slot().
+ * The key for search can be extracted from @dst_path->nodes[dst_level]
+ * (first key).
+ *
+ * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
+ * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
+ * They should be marked during preivous (@dst_level = 1) iteration.
+ *
+ * 3) Mark file extents in leaves dirty
+ * We don't have good way to pick out new file extents only.
+ * So we still follow the old method by scanning all file extents in
+ * the leave.
+ *
+ * This function can free us from keeping two pathes, thus later we only need
+ * to care about how to iterate all new tree blocks in reloc tree.
+ */
+static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int dst_level, int root_level,
+ bool trace_leaf)
+{
+ struct btrfs_key key;
+ struct btrfs_path *src_path;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u32 nodesize = fs_info->nodesize;
+ int cur_level = root_level;
+ int ret;
+
+ BUG_ON(dst_level > root_level);
+ /* Level mismatch */
+ if (btrfs_header_level(src_eb) != root_level)
+ return -EINVAL;
+
+ src_path = btrfs_alloc_path();
+ if (!src_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (dst_level)
+ btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+ else
+ btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+
+ /* For src_path */
+ extent_buffer_get(src_eb);
+ src_path->nodes[root_level] = src_eb;
+ src_path->slots[root_level] = dst_path->slots[root_level];
+ src_path->locks[root_level] = 0;
+
+ /* A simplified version of btrfs_search_slot() */
+ while (cur_level >= dst_level) {
+ struct btrfs_key src_key;
+ struct btrfs_key dst_key;
+
+ if (src_path->nodes[cur_level] == NULL) {
+ struct btrfs_key first_key;
+ struct extent_buffer *eb;
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ eb = src_path->nodes[cur_level + 1];
+ parent_slot = src_path->slots[cur_level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ cur_level, &first_key);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ src_path->nodes[cur_level] = eb;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ }
+
+ src_path->slots[cur_level] = dst_path->slots[cur_level];
+ if (cur_level) {
+ btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_node_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ } else {
+ btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_item_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ }
+ /* Content mismatch, something went wrong */
+ if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ cur_level--;
+ }
+
+ /*
+ * Now both @dst_path and @src_path have been populated, record the tree
+ * blocks for qgroup accounting.
+ */
+ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_extent(trans,
+ dst_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+
+ /* Record leaf file extents */
+ if (dst_level == 0 && trace_leaf) {
+ ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
+ }
+out:
+ btrfs_free_path(src_path);
+ return ret;
+}
+
+/*
+ * Helper function to do recursive generation-aware depth-first search, to
+ * locate all new tree blocks in a subtree of reloc tree.
+ *
+ * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
+ * reloc tree
+ * L2 NN (a)
+ * / \
+ * L1 OO NN (b)
+ * / \ / \
+ * L0 OO OO OO NN
+ * (c) (d)
+ * If we pass:
+ * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
+ * @cur_level = 1
+ * @root_level = 1
+ *
+ * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace
+ * above tree blocks along with their counter parts in file tree.
+ * While during search, old tree blocsk OO(c) will be skiped as tree block swap
+ * won't affect OO(c).
+ */
+static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int cur_level, int root_level,
+ u64 last_snapshot, bool trace_leaf)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct extent_buffer *eb;
+ bool need_cleanup = false;
+ int ret = 0;
+ int i;
+
+ /* Level sanity check */
+ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL ||
+ root_level < cur_level) {
+ btrfs_err_rl(fs_info,
+ "%s: bad levels, cur_level=%d root_level=%d",
+ __func__, cur_level, root_level);
+ return -EUCLEAN;
+ }
+
+ /* Read the tree block if needed */
+ if (dst_path->nodes[cur_level] == NULL) {
+ struct btrfs_key first_key;
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ /*
+ * dst_path->nodes[root_level] must be initialized before
+ * calling this function.
+ */
+ if (cur_level == root_level) {
+ btrfs_err_rl(fs_info,
+ "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
+ __func__, root_level, root_level, cur_level);
+ return -EUCLEAN;
+ }
+
+ /*
+ * We need to get child blockptr/gen from parent before we can
+ * read it.
+ */
+ eb = dst_path->nodes[cur_level + 1];
+ parent_slot = dst_path->slots[cur_level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+ /* This node is old, no need to trace */
+ if (child_gen < last_snapshot)
+ goto out;
+
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ cur_level, &first_key);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ dst_path->nodes[cur_level] = eb;
+ dst_path->slots[cur_level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ need_cleanup = true;
+ }
+
+ /* Now record this tree block and its counter part for qgroups */
+ ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
+ root_level, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+
+ eb = dst_path->nodes[cur_level];
+
+ if (cur_level > 0) {
+ /* Iterate all child tree blocks */
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ /* Skip old tree blocks as they won't be swapped */
+ if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
+ continue;
+ dst_path->slots[cur_level] = i;
+
+ /* Recursive call (at most 7 times) */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
+ dst_path, cur_level - 1, root_level,
+ last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+ }
+ }
+
+cleanup:
+ if (need_cleanup) {
+ /* Clean up */
+ btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
+ dst_path->locks[cur_level]);
+ free_extent_buffer(dst_path->nodes[cur_level]);
+ dst_path->nodes[cur_level] = NULL;
+ dst_path->slots[cur_level] = 0;
+ dst_path->locks[cur_level] = 0;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Inform qgroup to trace subtree swap used in balance.
+ *
+ * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
+ * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
+ *
+ * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
+ * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
+ * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
+ * the conterpart of the tree block, then mark both tree blocks as qgroup dirty,
+ * and skip all tree blocks whose generation is smaller than last_snapshot.
+ *
+ * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
+ * which could be the cause of very slow balance if the file tree is large.
+ *
+ * @src_parent, @src_slot: pointer to src (file tree) eb.
+ * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
+ */
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *bg_cache,
+ struct extent_buffer *src_parent, int src_slot,
+ struct extent_buffer *dst_parent, int dst_slot,
+ u64 last_snapshot)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_path *dst_path = NULL;
+ struct btrfs_key first_key;
+ struct extent_buffer *src_eb = NULL;
+ struct extent_buffer *dst_eb = NULL;
+ bool trace_leaf = false;
+ u64 child_gen;
+ u64 child_bytenr;
+ int level;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ /* Check parameter order */
+ if (btrfs_node_ptr_generation(src_parent, src_slot) >
+ btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
+ btrfs_node_ptr_generation(src_parent, src_slot),
+ btrfs_node_ptr_generation(dst_parent, dst_slot));
+ return -EUCLEAN;
+ }
+
+ /*
+ * Only trace leaf if we're relocating data block groups, this could
+ * reduce tons of data extents tracing for meta/sys bg relocation.
+ */
+ if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
+ trace_leaf = true;
+ /* Read out real @src_eb, pointed by @src_parent and @src_slot */
+ child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
+ child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
+ btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
+
+ src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ btrfs_header_level(src_parent) - 1, &first_key);
+ if (IS_ERR(src_eb)) {
+ ret = PTR_ERR(src_eb);
+ goto out;
+ }
+
+ /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
+ child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
+ child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
+ btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
+
+ dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ btrfs_header_level(dst_parent) - 1, &first_key);
+ if (IS_ERR(dst_eb)) {
+ ret = PTR_ERR(dst_eb);
+ goto out;
+ }
+
+ if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ level = btrfs_header_level(dst_eb);
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* For dst_path */
+ extent_buffer_get(dst_eb);
+ dst_path->nodes[level] = dst_eb;
+ dst_path->slots[level] = 0;
+ dst_path->locks[level] = 0;
+
+ /* Do the generation-aware breadth-first search */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
+ level, last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ free_extent_buffer(src_eb);
+ free_extent_buffer(dst_eb);
+ btrfs_free_path(dst_path);
+ if (ret < 0)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level)
@@ -2132,6 +2543,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
struct btrfs_delayed_ref_root *delayed_refs;
struct ulist *new_roots = NULL;
struct rb_node *node;
+ u64 num_dirty_extents = 0;
u64 qgroup_to_skip;
int ret = 0;
@@ -2141,6 +2553,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
record = rb_entry(node, struct btrfs_qgroup_extent_record,
node);
+ num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
if (!ret) {
@@ -2186,6 +2599,8 @@ cleanup:
kfree(record);
}
+ trace_qgroup_num_dirty_extents(fs_info, trans->transid,
+ num_dirty_extents);
return ret;
}
@@ -2897,6 +3312,7 @@ qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
qgroup->rfer_cmpr = 0;
qgroup->excl = 0;
qgroup->excl_cmpr = 0;
+ qgroup_dirty(fs_info, qgroup);
}
spin_unlock(&fs_info->qgroup_lock);
}
@@ -3004,7 +3420,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
- !is_fstree(root->objectid) || len == 0)
+ !is_fstree(root->root_key.objectid) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
@@ -3090,7 +3506,7 @@ static int qgroup_free_reserved_data(struct inode *inode,
goto out;
freed += changeset.bytes_changed;
}
- btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed,
+ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
BTRFS_QGROUP_RSV_DATA);
ret = freed;
out:
@@ -3106,6 +3522,10 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
int trace_op = QGROUP_RELEASE;
int ret;
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
+ &BTRFS_I(inode)->root->fs_info->flags))
+ return 0;
+
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
@@ -3122,7 +3542,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
changeset.bytes_changed, trace_op);
if (free)
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->objectid,
+ BTRFS_I(inode)->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
@@ -3215,7 +3635,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid) || num_bytes == 0)
+ !is_fstree(root->root_key.objectid) || num_bytes == 0)
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
@@ -3240,13 +3660,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/* TODO: Update trace point to handle such free */
trace_qgroup_meta_free_all_pertrans(root);
/* Special value -1 means to free all reserved space */
- btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
BTRFS_QGROUP_RSV_META_PERTRANS);
}
@@ -3256,7 +3676,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/*
@@ -3267,7 +3687,8 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
- btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type);
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
+ num_bytes, type);
}
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
@@ -3321,13 +3742,13 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
trace_qgroup_meta_convert(root, num_bytes);
- qgroup_convert_meta(fs_info, root->objectid, num_bytes);
+ qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
}
/*
@@ -3354,7 +3775,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
inode->i_ino, unode->val, unode->aux);
}
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->objectid,
+ BTRFS_I(inode)->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 54b8bb282c0e..d8f78f5ab854 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -236,6 +236,12 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
+
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *bg_cache,
+ struct extent_buffer *src_parent, int src_slot,
+ struct extent_buffer *dst_parent, int dst_slot,
+ u64 last_snapshot);
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, struct ulist *old_roots,
struct ulist *new_roots);
@@ -249,6 +255,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return;
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
BTRFS_QGROUP_RSV_DATA);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index e5b9e596bb92..d69fbfb30aa9 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -732,7 +732,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
INIT_LIST_HEAD(&ra->list);
ra->action = action;
- ra->root = root->objectid;
+ ra->root = root->root_key.objectid;
/*
* This is an allocation, preallocate the block_entry in case we haven't
@@ -787,8 +787,8 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* one we want to lookup below when we modify the
* re->num_refs.
*/
- ref_root = root->objectid;
- re->root_objectid = root->objectid;
+ ref_root = root->root_key.objectid;
+ re->root_objectid = root->root_key.objectid;
re->num_refs = 0;
}
@@ -862,7 +862,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* didn't thik of some other corner case.
*/
btrfs_err(fs_info, "failed to find root %llu for %llu",
- root->objectid, be->bytenr);
+ root->root_key.objectid, be->bytenr);
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
kfree(ra);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8783a1776540..924116f654a1 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -648,8 +648,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
int level, u64 bytenr)
{
struct backref_cache *cache = &rc->backref_cache;
- struct btrfs_path *path1;
- struct btrfs_path *path2;
+ struct btrfs_path *path1; /* For searching extent root */
+ struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
struct extent_buffer *eb;
struct btrfs_root *root;
struct backref_node *cur;
@@ -662,7 +662,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
struct btrfs_key key;
unsigned long end;
unsigned long ptr;
- LIST_HEAD(list);
+ LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
LIST_HEAD(useless);
int cowonly;
int ret;
@@ -778,6 +778,10 @@ again:
key.type != BTRFS_SHARED_BLOCK_REF_KEY);
}
+ /*
+ * Parent node found and matches current inline ref, no need to
+ * rebuild this node for this inline ref.
+ */
if (exist &&
((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
exist->owner == key.offset) ||
@@ -787,11 +791,12 @@ again:
goto next;
}
+ /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
if (key.objectid == key.offset) {
/*
- * only root blocks of reloc trees use
- * backref of this type.
+ * Only root blocks of reloc trees use backref
+ * pointing to itself.
*/
root = find_reloc_root(rc, cur->bytenr);
ASSERT(root);
@@ -840,7 +845,11 @@ again:
goto next;
}
- /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+ /*
+ * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
+ * means the root objectid. We need to search the tree to get
+ * its parent bytenr.
+ */
root = read_fs_root(rc->extent_root->fs_info, key.offset);
if (IS_ERR(root)) {
err = PTR_ERR(root);
@@ -863,10 +872,7 @@ again:
level = cur->level + 1;
- /*
- * searching the tree to find upper level blocks
- * reference the block.
- */
+ /* Search the tree to find parent blocks referring the block. */
path2->search_commit_root = 1;
path2->skip_locking = 1;
path2->lowest_level = level;
@@ -884,7 +890,8 @@ again:
cur->bytenr) {
btrfs_err(root->fs_info,
"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
- cur->bytenr, level - 1, root->objectid,
+ cur->bytenr, level - 1,
+ root->root_key.objectid,
node_key->objectid, node_key->type,
node_key->offset);
err = -ENOENT;
@@ -892,6 +899,8 @@ again:
}
lower = cur;
need_check = true;
+
+ /* Add all nodes and edges in the path */
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path2->nodes[level]) {
ASSERT(btrfs_root_bytenr(&root->root_item) ==
@@ -1281,7 +1290,7 @@ static void __del_reloc_root(struct btrfs_root *root)
struct mapping_node *node = NULL;
struct reloc_control *rc = fs_info->reloc_ctl;
- if (rc) {
+ if (rc && root->node) {
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
root->node->start);
@@ -1735,7 +1744,7 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
* errors, a negative error number is returned.
*/
static noinline_for_stack
-int replace_path(struct btrfs_trans_handle *trans,
+int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
struct btrfs_root *dest, struct btrfs_root *src,
struct btrfs_path *path, struct btrfs_key *next_key,
int lowest_level, int max_level)
@@ -1879,14 +1888,9 @@ again:
* and tree block numbers, if current trans doesn't free
* data reloc tree inode.
*/
- ret = btrfs_qgroup_trace_subtree(trans, parent,
- btrfs_header_generation(parent),
- btrfs_header_level(parent));
- if (ret < 0)
- break;
- ret = btrfs_qgroup_trace_subtree(trans, path->nodes[level],
- btrfs_header_generation(path->nodes[level]),
- btrfs_header_level(path->nodes[level]));
+ ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
+ parent, slot, path->nodes[level],
+ path->slots[level], last_snapshot);
if (ret < 0)
break;
@@ -2205,7 +2209,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
ret = 0;
} else {
- ret = replace_path(trans, root, reloc_root, path,
+ ret = replace_path(trans, rc, root, reloc_root, path,
&next_key, level, max_level);
}
if (ret < 0) {
@@ -2911,7 +2915,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
free_extent_buffer(eb);
return -EIO;
}
- WARN_ON(btrfs_header_level(eb) != block->level);
if (block->level == 0)
btrfs_item_key_to_cpu(eb, &block->key, 0);
else
@@ -2987,7 +2990,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct backref_node *node;
struct btrfs_path *path;
struct tree_block *block;
- struct rb_node *rb_node;
+ struct tree_block *next;
int ret;
int err = 0;
@@ -2997,29 +3000,23 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
goto out_free_blocks;
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
+ /* Kick in readahead for tree blocks with missing keys */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready)
readahead_tree_block(fs_info, block->bytenr);
- rb_node = rb_next(rb_node);
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
+ /* Get first keys */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready) {
err = get_tree_block_key(fs_info, block);
if (err)
goto out_free_path;
}
- rb_node = rb_next(rb_node);
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
-
+ /* Do tree relocation */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
node = build_backref_tree(rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
@@ -3030,11 +3027,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
if (ret < 0) {
- if (ret != -EAGAIN || rb_node == rb_first(blocks))
+ if (ret != -EAGAIN || &block->rb_node == rb_first(blocks))
err = ret;
goto out;
}
- rb_node = rb_next(rb_node);
}
out:
err = finish_pending_nodes(trans, rc, path, err);
@@ -4669,7 +4665,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
if (rc->merge_reloc_tree) {
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
rc->block_rsv,
- rc->nodes_relocated, 1);
+ rc->nodes_relocated, true);
if (ret)
return ret;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3be1456b5116..902819d3cf41 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1124,7 +1124,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (scrub_write_page_to_dev_replace(sblock_other,
page_num) != 0) {
- btrfs_dev_replace_stats_inc(
+ atomic64_inc(
&fs_info->dev_replace.num_write_errors);
success = 0;
}
@@ -1564,8 +1564,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
if (btrfsic_submit_bio_wait(bio)) {
btrfs_dev_stat_inc_and_print(page_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
- btrfs_dev_replace_stats_inc(
- &fs_info->dev_replace.num_write_errors);
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
bio_put(bio);
return -EIO;
}
@@ -1592,8 +1591,7 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
ret = scrub_write_page_to_dev_replace(sblock, page_num);
if (ret)
- btrfs_dev_replace_stats_inc(
- &fs_info->dev_replace.num_write_errors);
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
}
}
@@ -1726,8 +1724,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
struct scrub_page *spage = sbio->pagev[i];
spage->io_error = 1;
- btrfs_dev_replace_stats_inc(&dev_replace->
- num_write_errors);
+ atomic64_inc(&dev_replace->num_write_errors);
}
}
@@ -3022,8 +3019,7 @@ out:
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
- int num, u64 base, u64 length,
- int is_dev_replace)
+ int num, u64 base, u64 length)
{
struct btrfs_path *path, *ppath;
struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -3299,7 +3295,7 @@ again:
extent_physical = extent_logical - logical + physical;
extent_dev = scrub_dev;
extent_mirror_num = mirror_num;
- if (is_dev_replace)
+ if (sctx->is_dev_replace)
scrub_remap_extent(fs_info, extent_logical,
extent_len, &extent_physical,
&extent_dev,
@@ -3397,8 +3393,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
u64 chunk_offset, u64 length,
u64 dev_offset,
- struct btrfs_block_group_cache *cache,
- int is_dev_replace)
+ struct btrfs_block_group_cache *cache)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
@@ -3435,8 +3430,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
ret = scrub_stripe(sctx, map, scrub_dev, i,
- chunk_offset, length,
- is_dev_replace);
+ chunk_offset, length);
if (ret)
goto out;
}
@@ -3449,8 +3443,7 @@ out:
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
- struct btrfs_device *scrub_dev, u64 start, u64 end,
- int is_dev_replace)
+ struct btrfs_device *scrub_dev, u64 start, u64 end)
{
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
@@ -3544,7 +3537,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
scrub_pause_on(fs_info);
ret = btrfs_inc_block_group_ro(cache);
- if (!ret && is_dev_replace) {
+ if (!ret && sctx->is_dev_replace) {
/*
* If we are doing a device replace wait for any tasks
* that started dellaloc right before we set the block
@@ -3609,7 +3602,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->item_needs_writeback = 1;
btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
- found_key.offset, cache, is_dev_replace);
+ found_key.offset, cache);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3670,7 +3663,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
btrfs_put_block_group(cache);
if (ret)
break;
- if (is_dev_replace &&
+ if (sctx->is_dev_replace &&
atomic64_read(&dev_replace->num_write_errors) > 0) {
ret = -EIO;
break;
@@ -3893,8 +3886,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
if (!ret)
- ret = scrub_enumerate_chunks(sctx, dev, start, end,
- is_dev_replace);
+ ret = scrub_enumerate_chunks(sctx, dev, start, end);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ba8950bfd9c7..094cc1444a90 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1186,9 +1186,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
u64 root = (u64)(uintptr_t)key;
struct clone_root *cr = (struct clone_root *)elt;
- if (root < cr->root->objectid)
+ if (root < cr->root->root_key.objectid)
return -1;
- if (root > cr->root->objectid)
+ if (root > cr->root->root_key.objectid)
return 1;
return 0;
}
@@ -1198,9 +1198,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
struct clone_root *cr1 = (struct clone_root *)e1;
struct clone_root *cr2 = (struct clone_root *)e2;
- if (cr1->root->objectid < cr2->root->objectid)
+ if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
return -1;
- if (cr1->root->objectid > cr2->root->objectid)
+ if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
return 1;
return 0;
}
@@ -1693,12 +1693,8 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
di = btrfs_lookup_dir_item(NULL, root, path,
dir, name, name_len, 0);
- if (!di) {
- ret = -ENOENT;
- goto out;
- }
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
@@ -2346,7 +2342,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
return -ENOMEM;
}
- key.objectid = send_root->objectid;
+ key.objectid = send_root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
@@ -2362,7 +2358,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.type != BTRFS_ROOT_BACKREF_KEY ||
- key.objectid != send_root->objectid) {
+ key.objectid != send_root->root_key.objectid) {
ret = -ENOENT;
goto out;
}
@@ -4907,8 +4903,8 @@ static int send_clone(struct send_ctx *sctx,
btrfs_debug(sctx->send_root->fs_info,
"send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
- offset, len, clone_root->root->objectid, clone_root->ino,
- clone_root->offset);
+ offset, len, clone_root->root->root_key.objectid,
+ clone_root->ino, clone_root->offset);
p = fs_path_alloc();
if (!p)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6601c9aa5e35..b362b45dd757 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2177,8 +2177,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
/* Mask in the root object ID too, to disambiguate subvols */
- buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32;
- buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid;
+ buf->f_fsid.val[0] ^=
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
+ buf->f_fsid.val[1] ^=
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid;
return 0;
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index d9269a531a4d..9e0f4a01be14 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -106,7 +106,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("should have found at least one delalloc");
@@ -137,7 +137,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("couldn't find delalloc in our range");
@@ -171,7 +171,7 @@ static int test_find_delalloc(u32 sectorsize)
}
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (found) {
test_err("found range when we shouldn't have");
@@ -192,7 +192,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("didn't find our range");
@@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize)
* this changes at any point in the future we will need to fix this
* tests expected behavior.
*/
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("didn't find our range");
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 385a5316e4bf..bf15d3a7f20e 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -12,8 +12,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
struct extent_map *em;
struct rb_node *node;
- while (!RB_EMPTY_ROOT(&em_tree->map)) {
- node = rb_first(&em_tree->map);
+ while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
+ node = rb_first_cached(&em_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
remove_extent_mapping(em_tree, em);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b84f5015029..d1eeef9ec5da 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -44,7 +44,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
WARN_ON(refcount_read(&transaction->use_count) == 0);
if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
- WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+ WARN_ON(!RB_EMPTY_ROOT(
+ &transaction->delayed_refs.href_root.rb_root));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
"pending csums is %llu",
@@ -118,7 +119,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
root->commit_root = btrfs_root_node(root);
- if (is_fstree(root->objectid))
+ if (is_fstree(root->root_key.objectid))
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
}
@@ -245,7 +246,7 @@ loop:
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
- cur_trans->delayed_refs.href_root = RB_ROOT;
+ cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
@@ -759,7 +760,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- if (btrfs_check_space_for_delayed_refs(trans, fs_info))
+ if (btrfs_check_space_for_delayed_refs(trans))
return 1;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
@@ -834,7 +835,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
- btrfs_should_throttle_delayed_refs(trans, info);
+ btrfs_should_throttle_delayed_refs(trans);
cur = max_t(unsigned long, cur, 32);
/*
@@ -1197,7 +1198,10 @@ again:
list_add_tail(&fs_info->extent_root->dirty_list,
&trans->transaction->switch_commits);
- btrfs_after_dev_replace_commit(fs_info);
+
+ /* Update dev-replace pointer once everything is committed */
+ fs_info->dev_replace.committed_cursor_left =
+ fs_info->dev_replace.cursor_left_last_write_of_item;
return 0;
}
@@ -1613,10 +1617,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret < 0)
goto fail;
- ret = btrfs_insert_dir_item(trans, parent_root,
- dentry->d_name.name, dentry->d_name.len,
- BTRFS_I(parent_inode), &key,
- BTRFS_FT_DIR, index);
+ ret = btrfs_insert_dir_item(trans, dentry->d_name.name,
+ dentry->d_name.len, BTRFS_I(parent_inode),
+ &key, BTRFS_FT_DIR, index);
/* We have check then name at the beginning, so it is impossible. */
BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
@@ -1929,6 +1932,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
+ btrfs_trans_release_metadata(trans);
+ trans->block_rsv = NULL;
+
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
@@ -1938,9 +1944,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
- btrfs_trans_release_metadata(trans);
- trans->block_rsv = NULL;
-
cur_trans = trans->transaction;
/*
@@ -2280,15 +2283,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- /*
- * If fs has been frozen, we can not handle delayed iputs, otherwise
- * it'll result in deadlock about SB_FREEZE_FS.
- */
- if (current != fs_info->transaction_kthread &&
- current != fs_info->cleaner_kthread &&
- !test_bit(BTRFS_FS_FROZEN, &fs_info->flags))
- btrfs_run_delayed_iputs(fs_info);
-
return ret;
scrub_continue:
@@ -2330,7 +2324,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
- btrfs_debug(fs_info, "cleaner removing %llu", root->objectid);
+ btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
btrfs_kill_all_delayed_nodes(root);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index db835635372f..cab0b1f1f741 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -487,6 +487,13 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
u32 nritems = btrfs_header_nritems(leaf);
int slot;
+ if (btrfs_header_level(leaf) != 0) {
+ generic_err(fs_info, leaf, 0,
+ "invalid level for leaf, have %d expect 0",
+ btrfs_header_level(leaf));
+ return -EUCLEAN;
+ }
+
/*
* Extent buffers from a relocation tree have a owner field that
* corresponds to the subvolume tree they are based on. So just from an
@@ -645,9 +652,16 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
unsigned long nr = btrfs_header_nritems(node);
struct btrfs_key key, next_key;
int slot;
+ int level = btrfs_header_level(node);
u64 bytenr;
int ret = 0;
+ if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+ generic_err(fs_info, node, 0,
+ "invalid level for node, have %d expect [1, %d]",
+ level, BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
btrfs_crit(fs_info,
"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3c2ae0e4f25a..e07f3376b7df 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -205,14 +205,11 @@ static int join_running_log_trans(struct btrfs_root *root)
* until you call btrfs_end_log_trans() or it makes any future
* log transactions wait until you call btrfs_end_log_trans()
*/
-int btrfs_pin_log_trans(struct btrfs_root *root)
+void btrfs_pin_log_trans(struct btrfs_root *root)
{
- int ret = -ENOENT;
-
mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
- return ret;
}
/*
@@ -258,6 +255,13 @@ struct walk_control {
/* what stage of the replay code we're currently in */
int stage;
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
+ * the LOG_WALK_REPLAY_INODES stage.
+ */
+ bool ignore_cur_inode;
+
/* the root we are currently replaying */
struct btrfs_root *replay_dest;
@@ -2487,6 +2491,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
inode_item = btrfs_item_ptr(eb, i,
struct btrfs_inode_item);
+ /*
+ * If we have a tmpfile (O_TMPFILE) that got fsync'ed
+ * and never got linked before the fsync, skip it, as
+ * replaying it is pointless since it would be deleted
+ * later. We skip logging tmpfiles, but it's always
+ * possible we are replaying a log created with a kernel
+ * that used to log tmpfiles.
+ */
+ if (btrfs_inode_nlink(eb, inode_item) == 0) {
+ wc->ignore_cur_inode = true;
+ continue;
+ } else {
+ wc->ignore_cur_inode = false;
+ }
ret = replay_xattr_deletes(wc->trans, root, log,
path, key.objectid);
if (ret)
@@ -2524,16 +2542,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
root->fs_info->sectorsize);
ret = btrfs_drop_extents(wc->trans, root, inode,
from, (u64)-1, 1);
- /*
- * If the nlink count is zero here, the iput
- * will free the inode. We bump it to make
- * sure it doesn't get freed until the link
- * count fixup is done.
- */
if (!ret) {
- if (inode->i_nlink == 0)
- inc_nlink(inode);
- /* Update link count and nbytes. */
+ /* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
root, inode);
}
@@ -2548,6 +2558,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
break;
}
+ if (wc->ignore_cur_inode)
+ continue;
+
if (key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
ret = replay_one_dir_item(wc->trans, root, path,
@@ -3196,9 +3209,12 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
};
ret = walk_log_tree(trans, log, &wc);
- /* I don't think this can happen but just in case */
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ if (ret) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -4374,7 +4390,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&extents);
- down_write(&inode->dio_sem);
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
logged_start = start;
@@ -4440,7 +4455,6 @@ process:
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- up_write(&inode->dio_sem);
btrfs_release_path(path);
if (!ret)
@@ -4636,7 +4650,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
ASSERT(len == i_size ||
(len == fs_info->sectorsize &&
btrfs_file_extent_compression(leaf, extent) !=
- BTRFS_COMPRESS_NONE));
+ BTRFS_COMPRESS_NONE) ||
+ (len < i_size && i_size < fs_info->sectorsize));
return 0;
}
@@ -5564,9 +5579,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
dir_inode = btrfs_iget(fs_info->sb, &inode_key,
root, NULL);
- /* If parent inode was deleted, skip it. */
- if (IS_ERR(dir_inode))
- continue;
+ /*
+ * If the parent inode was deleted, return an error to
+ * fallback to a transaction commit. This is to prevent
+ * getting an inode that was moved from one parent A to
+ * a parent B, got its former parent A deleted and then
+ * it got fsync'ed, from existing at both parents after
+ * a log replay (and the old parent still existing).
+ * Example:
+ *
+ * mkdir /mnt/A
+ * mkdir /mnt/B
+ * touch /mnt/B/bar
+ * sync
+ * mv /mnt/B/bar /mnt/A/bar
+ * mv -T /mnt/A /mnt/B
+ * fsync /mnt/B/bar
+ * <power fail>
+ *
+ * If we ignore the old parent B which got deleted,
+ * after a log replay we would have file bar linked
+ * at both parents and the old parent B would still
+ * exist.
+ */
+ if (IS_ERR(dir_inode)) {
+ ret = PTR_ERR(dir_inode);
+ goto out;
+ }
if (ctx)
ctx->log_new_dentries = false;
@@ -5640,7 +5679,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- if (btrfs_inode_in_log(inode, trans->transid)) {
+ /*
+ * Skip already logged inodes or inodes corresponding to tmpfiles
+ * (since logging them is pointless, a link count of 0 means they
+ * will never be accessible).
+ */
+ if (btrfs_inode_in_log(inode, trans->transid) ||
+ inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7ab9bb88a639..767765031e59 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -65,7 +65,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
-int btrfs_pin_log_trans(struct btrfs_root *root);
+void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
int for_rename);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4405e430da6..f435d397019e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1613,7 +1613,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
em_tree = &fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
- n = rb_last(&em_tree->map);
+ n = rb_last(&em_tree->map.rb_root);
if (n) {
em = rb_entry(n, struct extent_map, rb_node);
ret = em->start + em->len;
@@ -1854,6 +1854,24 @@ void btrfs_assign_next_active_device(struct btrfs_device *device,
fs_info->fs_devices->latest_bdev = next_device->bdev;
}
+/*
+ * Return btrfs_fs_devices::num_devices excluding the device that's being
+ * currently replaced.
+ */
+static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
+{
+ u64 num_devices = fs_info->fs_devices->num_devices;
+
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+ ASSERT(num_devices > 1);
+ num_devices--;
+ }
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+
+ return num_devices;
+}
+
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 devid)
{
@@ -1865,22 +1883,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_lock(&uuid_mutex);
- num_devices = fs_devices->num_devices;
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- WARN_ON(num_devices < 1);
- num_devices--;
- }
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
goto out;
- ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
- &device);
- if (ret)
+ device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
+
+ if (IS_ERR(device)) {
+ if (PTR_ERR(device) == -ENOENT &&
+ strcmp(device_path, "missing") == 0)
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
+ else
+ ret = PTR_ERR(device);
goto out;
+ }
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
@@ -2096,9 +2114,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
call_rcu(&tgtdev->rcu, free_device_rcu);
}
-static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device)
+static struct btrfs_device *btrfs_find_device_by_path(
+ struct btrfs_fs_info *fs_info, const char *device_path)
{
int ret = 0;
struct btrfs_super_block *disk_super;
@@ -2106,28 +2123,27 @@ static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
u8 *dev_uuid;
struct block_device *bdev;
struct buffer_head *bh;
+ struct btrfs_device *device;
- *device = NULL;
ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
fs_info->bdev_holder, 0, &bdev, &bh);
if (ret)
- return ret;
+ return ERR_PTR(ret);
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
- *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
+ device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
brelse(bh);
- if (!*device)
- ret = -ENOENT;
+ if (!device)
+ device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
- return ret;
+ return device;
}
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device)
+static struct btrfs_device *btrfs_find_device_missing_or_by_path(
+ struct btrfs_fs_info *fs_info, const char *device_path)
{
- *device = NULL;
+ struct btrfs_device *device = NULL;
if (strcmp(device_path, "missing") == 0) {
struct list_head *devices;
struct btrfs_device *tmp;
@@ -2136,42 +2152,38 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
list_for_each_entry(tmp, devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&tmp->dev_state) && !tmp->bdev) {
- *device = tmp;
+ device = tmp;
break;
}
}
- if (!*device)
- return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
-
- return 0;
+ if (!device)
+ return ERR_PTR(-ENOENT);
} else {
- return btrfs_find_device_by_path(fs_info, device_path, device);
+ device = btrfs_find_device_by_path(fs_info, device_path);
}
+
+ return device;
}
/*
* Lookup a device given by device id, or the path if the id is 0.
*/
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
- const char *devpath,
- struct btrfs_device **device)
+struct btrfs_device *btrfs_find_device_by_devspec(
+ struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
{
- int ret;
+ struct btrfs_device *device;
if (devid) {
- ret = 0;
- *device = btrfs_find_device(fs_info, devid, NULL, NULL);
- if (!*device)
- ret = -ENOENT;
+ device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ if (!device)
+ return ERR_PTR(-ENOENT);
} else {
if (!devpath || !devpath[0])
- return -EINVAL;
-
- ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
- device);
+ return ERR_PTR(-EINVAL);
+ device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
}
- return ret;
+ return device;
}
/*
@@ -3679,7 +3691,7 @@ static int alloc_profile_is_valid(u64 flags, int extended)
return !extended; /* "0" is valid for usual profiles */
/* true if exactly one bit set */
- return (flags & (flags - 1)) == 0;
+ return is_power_of_2(flags);
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
@@ -3740,13 +3752,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
}
}
- num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- BUG_ON(num_devices < 1);
- num_devices--;
- }
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ num_devices = btrfs_num_devices(fs_info);
+
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
@@ -5897,7 +5904,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
out:
if (dev_replace_is_ongoing) {
- btrfs_dev_replace_clear_lock_blocking(dev_replace);
+ ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
+ btrfs_dev_replace_read_lock(dev_replace);
+ /* Barrier implied by atomic_dec_and_test */
+ if (atomic_dec_and_test(&dev_replace->blocking_readers))
+ cond_wake_up_nomb(&dev_replace->read_lock_wq);
btrfs_dev_replace_read_unlock(dev_replace);
}
free_extent_map(em);
@@ -7438,7 +7449,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
int ret = 0;
read_lock(&em_tree->lock);
- for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
em = rb_entry(node, struct extent_map, rb_node);
if (em->map_lookup->num_stripes !=
em->map_lookup->verified_stripes) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 23e9285d88de..aefce895e994 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -410,12 +410,9 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device);
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
- const char *devpath,
- struct btrfs_device **device);
+struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
+ u64 devid,
+ const char *devpath);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);