aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/ctree.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/ctree.c')
-rw-r--r--fs/btrfs/ctree.c2618
1 files changed, 984 insertions, 1634 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f2ec1a9bae28..a9543f01184c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -14,6 +15,8 @@
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
+#include "tree-mod-log.h"
+#include "tree-checker.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -31,8 +34,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
static const struct btrfs_csums {
u16 size;
- const char *name;
- const char *driver;
+ const char name[10];
+ const char driver[12];
} btrfs_csums[] = {
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
@@ -63,11 +66,12 @@ const char *btrfs_super_csum_name(u16 csum_type)
const char *btrfs_super_csum_driver(u16 csum_type)
{
/* csum type is validated at mount time */
- return btrfs_csums[csum_type].driver ?:
+ return btrfs_csums[csum_type].driver[0] ?
+ btrfs_csums[csum_type].driver :
btrfs_csums[csum_type].name;
}
-size_t __const btrfs_get_num_csums(void)
+size_t __attribute_const__ btrfs_get_num_csums(void)
{
return ARRAY_SIZE(btrfs_csums);
}
@@ -110,6 +114,22 @@ noinline void btrfs_release_path(struct btrfs_path *p)
}
/*
+ * We want the transaction abort to print a stack trace only for errors where
+ * the cause could be a bug, e.g. ENOSPC, and not for common errors caused by
+ * external factors (-EIO, -EROFS and -ENOMEM return false below).
+ */
+bool __cold abort_should_print_stack(int errno)
+{
+ switch (errno) {
+ case -EIO:
+ case -EROFS:
+ case -ENOMEM:
+ return false;
+ }
+ return true;
+}
+
+/*
* safely gets a reference on the root node of a tree. A lock
* is not taken, so a concurrent writer may put a different node
* at the root of the tree. See btrfs_lock_root_node for the
@@ -143,47 +163,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
return eb;
}
-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
-{
- struct extent_buffer *eb;
-
- while (1) {
- eb = btrfs_root_node(root);
- btrfs_tree_lock(eb);
- if (eb == root->node)
- break;
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
- }
- return eb;
-}
-
-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
-{
- struct extent_buffer *eb;
-
- while (1) {
- eb = btrfs_root_node(root);
- btrfs_tree_read_lock(eb);
- if (eb == root->node)
- break;
- btrfs_tree_read_unlock(eb);
- free_extent_buffer(eb);
- }
- return eb;
-}
-
-/* cowonly root (everything not a reference counted cow subvolume), just get
- * put onto a simple dirty list. transaction.c walks this to make sure they
- * get properly updated on disk.
+/*
+ * Cowonly roots (not-shareable trees, i.e. everything that is not a subvolume
+ * or reloc root) just get put onto a simple dirty list. The transaction code
+ * walks this list to make sure they get properly updated on disk.
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
@@ -222,9 +205,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
int level;
struct btrfs_disk_key disk_key;
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -234,7 +217,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
btrfs_node_key(buf, &disk_key, 0);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
- &disk_key, level, buf->start, 0);
+ &disk_key, level, buf->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -256,605 +240,18 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
-
- if (ret)
+ if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
return ret;
+ }
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
return 0;
}
-enum mod_log_op {
- MOD_LOG_KEY_REPLACE,
- MOD_LOG_KEY_ADD,
- MOD_LOG_KEY_REMOVE,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING,
- MOD_LOG_MOVE_KEYS,
- MOD_LOG_ROOT_REPLACE,
-};
-
-struct tree_mod_root {
- u64 logical;
- u8 level;
-};
-
-struct tree_mod_elem {
- struct rb_node node;
- u64 logical;
- u64 seq;
- enum mod_log_op op;
-
- /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
- int slot;
-
- /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
- u64 generation;
-
- /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
- struct btrfs_disk_key key;
- u64 blockptr;
-
- /* this is used for op == MOD_LOG_MOVE_KEYS */
- struct {
- int dst_slot;
- int nr_items;
- } move;
-
- /* this is used for op == MOD_LOG_ROOT_REPLACE */
- struct tree_mod_root old_root;
-};
-
-/*
- * Pull a new tree mod seq number for our operation.
- */
-static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
-{
- return atomic64_inc_return(&fs_info->tree_mod_seq);
-}
-
-/*
- * This adds a new blocker to the tree mod log's blocker list if the @elem
- * passed does not already have a sequence number set. So when a caller expects
- * to record tree modifications, it should ensure to set elem->seq to zero
- * before calling btrfs_get_tree_mod_seq.
- * Returns a fresh, unused tree log modification sequence number, even if no new
- * blocker was added.
- */
-u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem)
-{
- write_lock(&fs_info->tree_mod_log_lock);
- if (!elem->seq) {
- elem->seq = btrfs_inc_tree_mod_seq(fs_info);
- list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
- }
- write_unlock(&fs_info->tree_mod_log_lock);
-
- return elem->seq;
-}
-
-void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem)
-{
- struct rb_root *tm_root;
- struct rb_node *node;
- struct rb_node *next;
- struct seq_list *cur_elem;
- struct tree_mod_elem *tm;
- u64 min_seq = (u64)-1;
- u64 seq_putting = elem->seq;
-
- if (!seq_putting)
- return;
-
- write_lock(&fs_info->tree_mod_log_lock);
- list_del(&elem->list);
- elem->seq = 0;
-
- list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
- if (cur_elem->seq < min_seq) {
- if (seq_putting > cur_elem->seq) {
- /*
- * blocker with lower sequence number exists, we
- * cannot remove anything from the log
- */
- write_unlock(&fs_info->tree_mod_log_lock);
- return;
- }
- min_seq = cur_elem->seq;
- }
- }
-
- /*
- * anything that's lower than the lowest existing (read: blocked)
- * sequence number can be removed from the tree.
- */
- tm_root = &fs_info->tree_mod_log;
- for (node = rb_first(tm_root); node; node = next) {
- next = rb_next(node);
- tm = rb_entry(node, struct tree_mod_elem, node);
- if (tm->seq >= min_seq)
- continue;
- rb_erase(node, tm_root);
- kfree(tm);
- }
- write_unlock(&fs_info->tree_mod_log_lock);
-}
-
-/*
- * key order of the log:
- * node/leaf start address -> sequence
- *
- * The 'start address' is the logical address of the *new* root node
- * for root replace operations, or the logical address of the affected
- * block for all other operations.
- */
-static noinline int
-__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
-{
- struct rb_root *tm_root;
- struct rb_node **new;
- struct rb_node *parent = NULL;
- struct tree_mod_elem *cur;
-
- lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
-
- tm->seq = btrfs_inc_tree_mod_seq(fs_info);
-
- tm_root = &fs_info->tree_mod_log;
- new = &tm_root->rb_node;
- while (*new) {
- cur = rb_entry(*new, struct tree_mod_elem, node);
- parent = *new;
- if (cur->logical < tm->logical)
- new = &((*new)->rb_left);
- else if (cur->logical > tm->logical)
- new = &((*new)->rb_right);
- else if (cur->seq < tm->seq)
- new = &((*new)->rb_left);
- else if (cur->seq > tm->seq)
- new = &((*new)->rb_right);
- else
- return -EEXIST;
- }
-
- rb_link_node(&tm->node, parent, new);
- rb_insert_color(&tm->node, tm_root);
- return 0;
-}
-
-/*
- * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
- * returns zero with the tree_mod_log_lock acquired. The caller must hold
- * this until all tree mod log insertions are recorded in the rb tree and then
- * write unlock fs_info::tree_mod_log_lock.
- */
-static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb) {
- smp_mb();
- if (list_empty(&(fs_info)->tree_mod_seq_list))
- return 1;
- if (eb && btrfs_header_level(eb) == 0)
- return 1;
-
- write_lock(&fs_info->tree_mod_log_lock);
- if (list_empty(&(fs_info)->tree_mod_seq_list)) {
- write_unlock(&fs_info->tree_mod_log_lock);
- return 1;
- }
-
- return 0;
-}
-
-/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
-static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
-{
- smp_mb();
- if (list_empty(&(fs_info)->tree_mod_seq_list))
- return 0;
- if (eb && btrfs_header_level(eb) == 0)
- return 0;
-
- return 1;
-}
-
-static struct tree_mod_elem *
-alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
-{
- struct tree_mod_elem *tm;
-
- tm = kzalloc(sizeof(*tm), flags);
- if (!tm)
- return NULL;
-
- tm->logical = eb->start;
- if (op != MOD_LOG_KEY_ADD) {
- btrfs_node_key(eb, &tm->key, slot);
- tm->blockptr = btrfs_node_blockptr(eb, slot);
- }
- tm->op = op;
- tm->slot = slot;
- tm->generation = btrfs_node_ptr_generation(eb, slot);
- RB_CLEAR_NODE(&tm->node);
-
- return tm;
-}
-
-static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
-{
- struct tree_mod_elem *tm;
- int ret;
-
- if (!tree_mod_need_log(eb->fs_info, eb))
- return 0;
-
- tm = alloc_tree_mod_elem(eb, slot, op, flags);
- if (!tm)
- return -ENOMEM;
-
- if (tree_mod_dont_log(eb->fs_info, eb)) {
- kfree(tm);
- return 0;
- }
-
- ret = __tree_mod_log_insert(eb->fs_info, tm);
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- if (ret)
- kfree(tm);
-
- return ret;
-}
-
-static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
- int dst_slot, int src_slot, int nr_items)
-{
- struct tree_mod_elem *tm = NULL;
- struct tree_mod_elem **tm_list = NULL;
- int ret = 0;
- int i;
- int locked = 0;
-
- if (!tree_mod_need_log(eb->fs_info, eb))
- return 0;
-
- tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- tm = kzalloc(sizeof(*tm), GFP_NOFS);
- if (!tm) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm->logical = eb->start;
- tm->slot = src_slot;
- tm->move.dst_slot = dst_slot;
- tm->move.nr_items = nr_items;
- tm->op = MOD_LOG_MOVE_KEYS;
-
- for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(eb->fs_info, eb))
- goto free_tms;
- locked = 1;
-
- /*
- * When we override something during the move, we log these removals.
- * This can only happen when we move towards the beginning of the
- * buffer, i.e. dst_slot < src_slot.
- */
- for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
- if (ret)
- goto free_tms;
- }
-
- ret = __tree_mod_log_insert(eb->fs_info, tm);
- if (ret)
- goto free_tms;
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return 0;
-free_tms:
- for (i = 0; i < nr_items; i++) {
- if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
- kfree(tm_list[i]);
- }
- if (locked)
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- kfree(tm_list);
- kfree(tm);
-
- return ret;
-}
-
-static inline int
-__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
- struct tree_mod_elem **tm_list,
- int nritems)
-{
- int i, j;
- int ret;
-
- for (i = nritems - 1; i >= 0; i--) {
- ret = __tree_mod_log_insert(fs_info, tm_list[i]);
- if (ret) {
- for (j = nritems - 1; j > i; j--)
- rb_erase(&tm_list[j]->node,
- &fs_info->tree_mod_log);
- return ret;
- }
- }
-
- return 0;
-}
-
-static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
- struct extent_buffer *new_root, int log_removal)
-{
- struct btrfs_fs_info *fs_info = old_root->fs_info;
- struct tree_mod_elem *tm = NULL;
- struct tree_mod_elem **tm_list = NULL;
- int nritems = 0;
- int ret = 0;
- int i;
-
- if (!tree_mod_need_log(fs_info, NULL))
- return 0;
-
- if (log_removal && btrfs_header_level(old_root) > 0) {
- nritems = btrfs_header_nritems(old_root);
- tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
- GFP_NOFS);
- if (!tm_list) {
- ret = -ENOMEM;
- goto free_tms;
- }
- for (i = 0; i < nritems; i++) {
- tm_list[i] = alloc_tree_mod_elem(old_root, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
- }
-
- tm = kzalloc(sizeof(*tm), GFP_NOFS);
- if (!tm) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm->logical = new_root->start;
- tm->old_root.logical = old_root->start;
- tm->old_root.level = btrfs_header_level(old_root);
- tm->generation = btrfs_header_generation(old_root);
- tm->op = MOD_LOG_ROOT_REPLACE;
-
- if (tree_mod_dont_log(fs_info, NULL))
- goto free_tms;
-
- if (tm_list)
- ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
- if (!ret)
- ret = __tree_mod_log_insert(fs_info, tm);
-
- write_unlock(&fs_info->tree_mod_log_lock);
- if (ret)
- goto free_tms;
- kfree(tm_list);
-
- return ret;
-
-free_tms:
- if (tm_list) {
- for (i = 0; i < nritems; i++)
- kfree(tm_list[i]);
- kfree(tm_list);
- }
- kfree(tm);
-
- return ret;
-}
-
-static struct tree_mod_elem *
-__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
- int smallest)
-{
- struct rb_root *tm_root;
- struct rb_node *node;
- struct tree_mod_elem *cur = NULL;
- struct tree_mod_elem *found = NULL;
-
- read_lock(&fs_info->tree_mod_log_lock);
- tm_root = &fs_info->tree_mod_log;
- node = tm_root->rb_node;
- while (node) {
- cur = rb_entry(node, struct tree_mod_elem, node);
- if (cur->logical < start) {
- node = node->rb_left;
- } else if (cur->logical > start) {
- node = node->rb_right;
- } else if (cur->seq < min_seq) {
- node = node->rb_left;
- } else if (!smallest) {
- /* we want the node with the highest seq */
- if (found)
- BUG_ON(found->seq > cur->seq);
- found = cur;
- node = node->rb_left;
- } else if (cur->seq > min_seq) {
- /* we want the node with the smallest seq */
- if (found)
- BUG_ON(found->seq < cur->seq);
- found = cur;
- node = node->rb_right;
- } else {
- found = cur;
- break;
- }
- }
- read_unlock(&fs_info->tree_mod_log_lock);
-
- return found;
-}
-
-/*
- * this returns the element from the log with the smallest time sequence
- * value that's in the log (the oldest log item). any element with a time
- * sequence lower than min_seq will be ignored.
- */
-static struct tree_mod_elem *
-tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
- u64 min_seq)
-{
- return __tree_mod_log_search(fs_info, start, min_seq, 1);
-}
-
-/*
- * this returns the element from the log with the largest time sequence
- * value that's in the log (the most recent log item). any element with
- * a time sequence lower than min_seq will be ignored.
- */
-static struct tree_mod_elem *
-tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
-{
- return __tree_mod_log_search(fs_info, start, min_seq, 0);
-}
-
-static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
- struct extent_buffer *src, unsigned long dst_offset,
- unsigned long src_offset, int nr_items)
-{
- struct btrfs_fs_info *fs_info = dst->fs_info;
- int ret = 0;
- struct tree_mod_elem **tm_list = NULL;
- struct tree_mod_elem **tm_list_add, **tm_list_rem;
- int i;
- int locked = 0;
-
- if (!tree_mod_need_log(fs_info, NULL))
- return 0;
-
- if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
- return 0;
-
- tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
- GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- tm_list_add = tm_list;
- tm_list_rem = tm_list + nr_items;
- for (i = 0; i < nr_items; i++) {
- tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
- MOD_LOG_KEY_REMOVE, GFP_NOFS);
- if (!tm_list_rem[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
- MOD_LOG_KEY_ADD, GFP_NOFS);
- if (!tm_list_add[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(fs_info, NULL))
- goto free_tms;
- locked = 1;
-
- for (i = 0; i < nr_items; i++) {
- ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
- if (ret)
- goto free_tms;
- ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
- if (ret)
- goto free_tms;
- }
-
- write_unlock(&fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return 0;
-
-free_tms:
- for (i = 0; i < nr_items * 2; i++) {
- if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
- kfree(tm_list[i]);
- }
- if (locked)
- write_unlock(&fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return ret;
-}
-
-static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
-{
- struct tree_mod_elem **tm_list = NULL;
- int nritems = 0;
- int i;
- int ret = 0;
-
- if (btrfs_header_level(eb) == 0)
- return 0;
-
- if (!tree_mod_need_log(eb->fs_info, NULL))
- return 0;
-
- nritems = btrfs_header_nritems(eb);
- tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- for (i = 0; i < nritems; i++) {
- tm_list[i] = alloc_tree_mod_elem(eb, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(eb->fs_info, eb))
- goto free_tms;
-
- ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- if (ret)
- goto free_tms;
- kfree(tm_list);
-
- return 0;
-
-free_tms:
- for (i = 0; i < nritems; i++)
- kfree(tm_list[i]);
- kfree(tm_list);
-
- return ret;
-}
-
/*
* check if the tree block can be shared by multiple trees
*/
@@ -862,12 +259,11 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf)
{
/*
- * Tree blocks not in reference counted trees and tree roots
- * are never shared. If a block was allocated after the last
- * snapshot and the block was not allocated by tree relocation,
- * we know the block is not shared.
+ * Tree blocks not in shareable trees and tree roots are never shared.
+ * If a block was allocated after the last snapshot and the block was
+ * not allocated by tree relocation, we know the block is not shared.
*/
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
buf != root->node && buf != root->commit_root &&
(btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item) ||
@@ -962,10 +358,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (new_flags != 0) {
int level = btrfs_header_level(buf);
- ret = btrfs_set_disk_extent_flags(trans,
- buf->start,
- buf->len,
- new_flags, level, 0);
+ ret = btrfs_set_disk_extent_flags(trans, buf,
+ new_flags, level);
if (ret)
return ret;
}
@@ -988,48 +382,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return 0;
}
-static struct extent_buffer *alloc_tree_block_no_bg_flush(
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 parent_start,
- const struct btrfs_disk_key *disk_key,
- int level,
- u64 hint,
- u64 empty_size)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_buffer *ret;
-
- /*
- * If we are COWing a node/leaf from the extent, chunk, device or free
- * space trees, make sure that we do not finish block group creation of
- * pending block groups. We do this to avoid a deadlock.
- * COWing can result in allocation of a new chunk, and flushing pending
- * block groups (btrfs_create_pending_block_groups()) can be triggered
- * when finishing allocation of a new chunk. Creation of a pending block
- * group modifies the extent, chunk, device and free space trees,
- * therefore we could deadlock with ourselves since we are holding a
- * lock on an extent buffer that btrfs_create_pending_block_groups() may
- * try to COW later.
- * For similar reasons, we also need to delay flushing pending block
- * groups when splitting a leaf or node, from one of those trees, since
- * we are holding a write lock on it and its parent or when inserting a
- * new root node for one of those trees.
- */
- if (root == fs_info->extent_root ||
- root == fs_info->chunk_root ||
- root == fs_info->dev_root ||
- root == fs_info->free_space_root)
- trans->can_flush_pending_bgs = false;
-
- ret = btrfs_alloc_tree_block(trans, root, parent_start,
- root->root_key.objectid, disk_key, level,
- hint, empty_size);
- trans->can_flush_pending_bgs = true;
-
- return ret;
-}
-
/*
* does the dirty work in cow of a single block. The parent block (if
* supplied) is updated to point to the new cow copy. The new buffer is marked
@@ -1047,7 +399,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size)
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -1060,11 +413,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (*cow_ret == buf)
unlock_orig = 1;
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -1077,8 +430,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
- cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size);
+ cow = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, &disk_key, level,
+ search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1099,13 +453,17 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1118,32 +476,34 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = buf->start;
atomic_inc(&cow->refs);
- ret = tree_mod_log_insert_root(root->node, cow, 1);
+ ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
- btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
- tree_mod_log_insert_key(parent, parent_slot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ btrfs_tree_mod_log_insert_key(parent, parent_slot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
if (last_ref) {
- ret = tree_mod_log_free_eb(buf);
+ ret = btrfs_tree_mod_log_free_eb(buf);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
}
- btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -1153,295 +513,6 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * returns the logical address of the oldest predecessor of the given root.
- * entries older than time_seq are ignored.
- */
-static struct tree_mod_elem *__tree_mod_log_oldest_root(
- struct extent_buffer *eb_root, u64 time_seq)
-{
- struct tree_mod_elem *tm;
- struct tree_mod_elem *found = NULL;
- u64 root_logical = eb_root->start;
- int looped = 0;
-
- if (!time_seq)
- return NULL;
-
- /*
- * the very last operation that's logged for a root is the
- * replacement operation (if it is replaced at all). this has
- * the logical address of the *new* root, making it the very
- * first operation that's logged for this root.
- */
- while (1) {
- tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
- time_seq);
- if (!looped && !tm)
- return NULL;
- /*
- * if there are no tree operation for the oldest root, we simply
- * return it. this should only happen if that (old) root is at
- * level 0.
- */
- if (!tm)
- break;
-
- /*
- * if there's an operation that's not a root replacement, we
- * found the oldest version of our root. normally, we'll find a
- * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
- */
- if (tm->op != MOD_LOG_ROOT_REPLACE)
- break;
-
- found = tm;
- root_logical = tm->old_root.logical;
- looped = 1;
- }
-
- /* if there's no old root to return, return what we found instead */
- if (!found)
- found = tm;
-
- return found;
-}
-
-/*
- * tm is a pointer to the first operation to rewind within eb. then, all
- * previous operations will be rewound (until we reach something older than
- * time_seq).
- */
-static void
-__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
- u64 time_seq, struct tree_mod_elem *first_tm)
-{
- u32 n;
- struct rb_node *next;
- struct tree_mod_elem *tm = first_tm;
- unsigned long o_dst;
- unsigned long o_src;
- unsigned long p_size = sizeof(struct btrfs_key_ptr);
-
- n = btrfs_header_nritems(eb);
- read_lock(&fs_info->tree_mod_log_lock);
- while (tm && tm->seq >= time_seq) {
- /*
- * all the operations are recorded with the operator used for
- * the modification. as we're going backwards, we do the
- * opposite of each operation here.
- */
- switch (tm->op) {
- case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
- BUG_ON(tm->slot < n);
- /* Fallthrough */
- case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
- case MOD_LOG_KEY_REMOVE:
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
- btrfs_set_node_ptr_generation(eb, tm->slot,
- tm->generation);
- n++;
- break;
- case MOD_LOG_KEY_REPLACE:
- BUG_ON(tm->slot >= n);
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
- btrfs_set_node_ptr_generation(eb, tm->slot,
- tm->generation);
- break;
- case MOD_LOG_KEY_ADD:
- /* if a move operation is needed it's in the log */
- n--;
- break;
- case MOD_LOG_MOVE_KEYS:
- o_dst = btrfs_node_key_ptr_offset(tm->slot);
- o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
- memmove_extent_buffer(eb, o_dst, o_src,
- tm->move.nr_items * p_size);
- break;
- case MOD_LOG_ROOT_REPLACE:
- /*
- * this operation is special. for roots, this must be
- * handled explicitly before rewinding.
- * for non-roots, this operation may exist if the node
- * was a root: root A -> child B; then A gets empty and
- * B is promoted to the new root. in the mod log, we'll
- * have a root-replace operation for B, a tree block
- * that is no root. we simply ignore that operation.
- */
- break;
- }
- next = rb_next(&tm->node);
- if (!next)
- break;
- tm = rb_entry(next, struct tree_mod_elem, node);
- if (tm->logical != first_tm->logical)
- break;
- }
- read_unlock(&fs_info->tree_mod_log_lock);
- btrfs_set_header_nritems(eb, n);
-}
-
-/*
- * Called with eb read locked. If the buffer cannot be rewound, the same buffer
- * is returned. If rewind operations happen, a fresh buffer is returned. The
- * returned buffer is always read-locked. If the returned buffer is not the
- * input buffer, the lock on the input buffer is released and the input buffer
- * is freed (its refcount is decremented).
- */
-static struct extent_buffer *
-tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct extent_buffer *eb, u64 time_seq)
-{
- struct extent_buffer *eb_rewin;
- struct tree_mod_elem *tm;
-
- if (!time_seq)
- return eb;
-
- if (btrfs_header_level(eb) == 0)
- return eb;
-
- tm = tree_mod_log_search(fs_info, eb->start, time_seq);
- if (!tm)
- return eb;
-
- btrfs_set_path_blocking(path);
- btrfs_set_lock_blocking_read(eb);
-
- if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
- BUG_ON(tm->slot != 0);
- eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
- if (!eb_rewin) {
- btrfs_tree_read_unlock_blocking(eb);
- free_extent_buffer(eb);
- return NULL;
- }
- btrfs_set_header_bytenr(eb_rewin, eb->start);
- btrfs_set_header_backref_rev(eb_rewin,
- btrfs_header_backref_rev(eb));
- btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
- btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
- } else {
- eb_rewin = btrfs_clone_extent_buffer(eb);
- if (!eb_rewin) {
- btrfs_tree_read_unlock_blocking(eb);
- free_extent_buffer(eb);
- return NULL;
- }
- }
-
- btrfs_tree_read_unlock_blocking(eb);
- free_extent_buffer(eb);
-
- btrfs_tree_read_lock(eb_rewin);
- __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
- WARN_ON(btrfs_header_nritems(eb_rewin) >
- BTRFS_NODEPTRS_PER_BLOCK(fs_info));
-
- return eb_rewin;
-}
-
-/*
- * get_old_root() rewinds the state of @root's root node to the given @time_seq
- * value. If there are no changes, the current root->root_node is returned. If
- * anything changed in between, there's a fresh buffer allocated on which the
- * rewind operations are done. In any case, the returned buffer is read locked.
- * Returns NULL on error (with no locks held).
- */
-static inline struct extent_buffer *
-get_old_root(struct btrfs_root *root, u64 time_seq)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct tree_mod_elem *tm;
- struct extent_buffer *eb = NULL;
- struct extent_buffer *eb_root;
- u64 eb_root_owner = 0;
- struct extent_buffer *old;
- struct tree_mod_root *old_root = NULL;
- u64 old_generation = 0;
- u64 logical;
- int level;
-
- eb_root = btrfs_read_lock_root_node(root);
- tm = __tree_mod_log_oldest_root(eb_root, time_seq);
- if (!tm)
- return eb_root;
-
- if (tm->op == MOD_LOG_ROOT_REPLACE) {
- old_root = &tm->old_root;
- old_generation = tm->generation;
- logical = old_root->logical;
- level = old_root->level;
- } else {
- logical = eb_root->start;
- level = btrfs_header_level(eb_root);
- }
-
- tm = tree_mod_log_search(fs_info, logical, time_seq);
- if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
- btrfs_tree_read_unlock(eb_root);
- free_extent_buffer(eb_root);
- old = read_tree_block(fs_info, logical, 0, level, NULL);
- if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
- if (!IS_ERR(old))
- free_extent_buffer(old);
- btrfs_warn(fs_info,
- "failed to read tree block %llu from get_old_root",
- logical);
- } else {
- eb = btrfs_clone_extent_buffer(old);
- free_extent_buffer(old);
- }
- } else if (old_root) {
- eb_root_owner = btrfs_header_owner(eb_root);
- btrfs_tree_read_unlock(eb_root);
- free_extent_buffer(eb_root);
- eb = alloc_dummy_extent_buffer(fs_info, logical);
- } else {
- btrfs_set_lock_blocking_read(eb_root);
- eb = btrfs_clone_extent_buffer(eb_root);
- btrfs_tree_read_unlock_blocking(eb_root);
- free_extent_buffer(eb_root);
- }
-
- if (!eb)
- return NULL;
- btrfs_tree_read_lock(eb);
- if (old_root) {
- btrfs_set_header_bytenr(eb, eb->start);
- btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
- btrfs_set_header_owner(eb, eb_root_owner);
- btrfs_set_header_level(eb, old_root->level);
- btrfs_set_header_generation(eb, old_generation);
- }
- if (tm)
- __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
- else
- WARN_ON(btrfs_header_level(eb) != 0);
- WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
-
- return eb;
-}
-
-int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
-{
- struct tree_mod_elem *tm;
- int level;
- struct extent_buffer *eb_root = btrfs_root_node(root);
-
- tm = __tree_mod_log_oldest_root(eb_root, time_seq);
- if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
- level = tm->old_root.level;
- } else {
- level = btrfs_header_level(eb_root);
- }
- free_extent_buffer(eb_root);
-
- return level;
-}
-
static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
@@ -1480,7 +551,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret)
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
@@ -1500,17 +572,12 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid, fs_info->generation);
if (!should_cow_block(trans, root, buf)) {
- trans->dirty = true;
*cow_ret = buf;
return 0;
}
search_start = buf->start & ~((u64)SZ_1G - 1);
- if (parent)
- btrfs_set_lock_blocking_write(parent);
- btrfs_set_lock_blocking_write(buf);
-
/*
* Before CoWing this block for later modification, check if it's
* the subtree root and do the delayed subtree trace if needed.
@@ -1519,12 +586,13 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0);
+ parent_slot, cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
return ret;
}
+ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
/*
* helper function for defrag to decide if two blocks pointed to by a
@@ -1539,6 +607,22 @@ static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
return 0;
}
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys. On little-endian the disk order is the same as the CPU
+ * order, so we can avoid the conversion.
+ */
+static int comp_keys(const struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *k2)
+{
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+ return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
/*
* compare two keys in a memcmp fashion
*/
@@ -1551,6 +635,7 @@ static int comp_keys(const struct btrfs_disk_key *disk,
return btrfs_comp_cpu_keys(&k1, k2);
}
+#endif
/*
* same as comp_keys only with two btrfs_key's
@@ -1585,7 +670,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *cur;
u64 blocknr;
- u64 gen;
u64 search_start = *last_ret;
u64 last_block = 0;
u64 other;
@@ -1593,14 +677,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
int end_slot;
int i;
int err = 0;
- int parent_level;
- int uptodate;
u32 blocksize;
int progress_passed = 0;
struct btrfs_disk_key disk_key;
- parent_level = btrfs_header_level(parent);
-
WARN_ON(trans->transaction != fs_info->running_transaction);
WARN_ON(trans->transid != fs_info->generation);
@@ -1611,10 +691,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (parent_nritems <= 1)
return 0;
- btrfs_set_lock_blocking_write(parent);
-
for (i = start_slot; i <= end_slot; i++) {
- struct btrfs_key first_key;
int close = 1;
btrfs_node_key(parent, &disk_key, i);
@@ -1623,8 +700,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
progress_passed = 1;
blocknr = btrfs_node_blockptr(parent, i);
- gen = btrfs_node_ptr_generation(parent, i);
- btrfs_node_key_to_cpu(parent, &first_key, i);
if (last_block == 0)
last_block = blocknr;
@@ -1641,40 +716,18 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
continue;
}
- cur = find_extent_buffer(fs_info, blocknr);
- if (cur)
- uptodate = btrfs_buffer_uptodate(cur, gen, 0);
- else
- uptodate = 0;
- if (!cur || !uptodate) {
- if (!cur) {
- cur = read_tree_block(fs_info, blocknr, gen,
- parent_level - 1,
- &first_key);
- if (IS_ERR(cur)) {
- return PTR_ERR(cur);
- } else if (!extent_buffer_uptodate(cur)) {
- free_extent_buffer(cur);
- return -EIO;
- }
- } else if (!uptodate) {
- err = btrfs_read_buffer(cur, gen,
- parent_level - 1,&first_key);
- if (err) {
- free_extent_buffer(cur);
- return err;
- }
- }
- }
+ cur = btrfs_read_node_slot(parent, i);
+ if (IS_ERR(cur))
+ return PTR_ERR(cur);
if (search_start == 0)
search_start = last_block;
btrfs_tree_lock(cur);
- btrfs_set_lock_blocking_write(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize));
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -1690,31 +743,26 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
}
/*
- * search for key in the extent_buffer. The items start at offset p,
- * and they are item_size apart. There are 'max' items in p.
+ * Search for a key in the given extent_buffer.
*
- * the slot in the array is returned via slot, and it points to
- * the place where you would insert key if it is not found in
- * the array.
+ * The lower boundary for the search is specified by the slot number @low. Use a
+ * value of 0 to search over the whole extent buffer.
*
- * slot may point to max if the key is bigger than all of the keys
+ * The slot in the extent buffer is returned via @slot. If the key exists in the
+ * extent buffer, then @slot will point to the slot where the key is, otherwise
+ * it points to the slot where you would insert the key.
+ *
+ * Slot may point to the total number of items (i.e. one position beyond the last
+ * key) if the key is bigger than the last key in the extent buffer.
*/
-static noinline int generic_bin_search(struct extent_buffer *eb,
- unsigned long p, int item_size,
- const struct btrfs_key *key,
- int max, int *slot)
+static noinline int generic_bin_search(struct extent_buffer *eb, int low,
+ const struct btrfs_key *key, int *slot)
{
- int low = 0;
- int high = max;
- int mid;
+ unsigned long p;
+ int item_size;
+ int high = btrfs_header_nritems(eb);
int ret;
- struct btrfs_disk_key *tmp = NULL;
- struct btrfs_disk_key unaligned;
- unsigned long offset;
- char *kaddr = NULL;
- unsigned long map_start = 0;
- unsigned long map_len = 0;
- int err;
+ const int key_size = sizeof(struct btrfs_disk_key);
if (low > high) {
btrfs_err(eb->fs_info,
@@ -1724,33 +772,36 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
return -EINVAL;
}
+ if (btrfs_header_level(eb) == 0) {
+ p = offsetof(struct btrfs_leaf, items);
+ item_size = sizeof(struct btrfs_item);
+ } else {
+ p = offsetof(struct btrfs_node, ptrs);
+ item_size = sizeof(struct btrfs_key_ptr);
+ }
+
while (low < high) {
+ unsigned long oip;
+ unsigned long offset;
+ struct btrfs_disk_key *tmp;
+ struct btrfs_disk_key unaligned;
+ int mid;
+
mid = (low + high) / 2;
offset = p + mid * item_size;
+ oip = offset_in_page(offset);
- if (!kaddr || offset < map_start ||
- (offset + sizeof(struct btrfs_disk_key)) >
- map_start + map_len) {
-
- err = map_private_extent_buffer(eb, offset,
- sizeof(struct btrfs_disk_key),
- &kaddr, &map_start, &map_len);
-
- if (!err) {
- tmp = (struct btrfs_disk_key *)(kaddr + offset -
- map_start);
- } else if (err == 1) {
- read_extent_buffer(eb, &unaligned,
- offset, sizeof(unaligned));
- tmp = &unaligned;
- } else {
- return err;
- }
+ if (oip + key_size <= PAGE_SIZE) {
+ const unsigned long idx = get_eb_page_index(offset);
+ char *kaddr = page_address(eb->pages[idx]);
+ oip = get_eb_offset_in_page(eb, offset);
+ tmp = (struct btrfs_disk_key *)(kaddr + oip);
} else {
- tmp = (struct btrfs_disk_key *)(kaddr + offset -
- map_start);
+ read_extent_buffer(eb, &unaligned, offset, key_size);
+ tmp = &unaligned;
}
+
ret = comp_keys(tmp, key);
if (ret < 0)
@@ -1767,24 +818,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
}
/*
- * simple bin_search frontend that does the right thing for
- * leaves vs nodes
+ * Simple binary search on an extent buffer. Works for both leaves and nodes, and
+ * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
*/
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int level, int *slot)
+ int *slot)
{
- if (level == 0)
- return generic_bin_search(eb,
- offsetof(struct btrfs_leaf, items),
- sizeof(struct btrfs_item),
- key, btrfs_header_nritems(eb),
- slot);
- else
- return generic_bin_search(eb,
- offsetof(struct btrfs_node, ptrs),
- sizeof(struct btrfs_key_ptr),
- key, btrfs_header_nritems(eb),
- slot);
+ return generic_bin_search(eb, 0, key, slot);
}
static void root_add_used(struct btrfs_root *root, u32 size)
@@ -1820,11 +860,14 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
btrfs_node_key_to_cpu(parent, &first_key, slot);
eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
+ btrfs_header_owner(parent),
btrfs_node_ptr_generation(parent, slot),
level - 1, &first_key);
- if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb))
+ return eb;
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
- eb = ERR_PTR(-EIO);
+ return ERR_PTR(-EIO);
}
return eb;
@@ -1854,8 +897,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
mid = path->nodes[level];
- WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
- path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
+ WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
WARN_ON(btrfs_header_generation(mid) != trans->transid);
orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -1884,15 +926,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
btrfs_tree_lock(child);
- btrfs_set_lock_blocking_write(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
goto enospc;
}
- ret = tree_mod_log_insert_root(root->node, child, 1);
+ ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, child);
@@ -1907,7 +949,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
return 0;
@@ -1921,10 +963,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
left = NULL;
if (left) {
- btrfs_tree_lock(left);
- btrfs_set_lock_blocking_write(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left);
+ parent, pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -1936,10 +978,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = NULL;
if (right) {
- btrfs_tree_lock(right);
- btrfs_set_lock_blocking_write(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right);
+ parent, pslot + 1, &right,
+ BTRFS_NESTING_RIGHT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -1966,14 +1008,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(right);
del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
- btrfs_free_tree_block(trans, root, right, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), right,
+ 0, 1);
free_extent_buffer_stale(right);
right = NULL;
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot + 1,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2011,15 +1054,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(mid);
del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -2099,15 +1142,15 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (left) {
u32 left_nr;
- btrfs_tree_lock(left);
- btrfs_set_lock_blocking_write(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left);
+ pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret)
wret = 1;
else {
@@ -2120,8 +1163,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -2153,8 +1196,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (right) {
u32 right_nr;
- btrfs_tree_lock(right);
- btrfs_set_lock_blocking_write(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2162,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right);
+ &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
wret = 1;
else {
@@ -2175,8 +1217,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot + 1,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2214,12 +1256,12 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
u64 search;
u64 target;
u64 nread = 0;
- struct extent_buffer *eb;
+ u64 nread_max;
u32 nr;
u32 blocksize;
u32 nscan = 0;
- if (level != 1)
+ if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
return;
if (!path->nodes[level])
@@ -2227,12 +1269,30 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
node = path->nodes[level];
+ /*
+ * Since the time between visiting leaves is much shorter than the time
+ * between visiting nodes, limit read ahead of nodes to 1, to avoid too
+ * much IO at once (possibly random).
+ */
+ if (path->reada == READA_FORWARD_ALWAYS) {
+ if (level > 1)
+ nread_max = node->fs_info->nodesize;
+ else
+ nread_max = SZ_128K;
+ } else {
+ nread_max = SZ_64K;
+ }
+
search = btrfs_node_blockptr(node, slot);
blocksize = fs_info->nodesize;
- eb = find_extent_buffer(fs_info, search);
- if (eb) {
- free_extent_buffer(eb);
- return;
+ if (path->reada != READA_FORWARD_ALWAYS) {
+ struct extent_buffer *eb;
+
+ eb = find_extent_buffer(fs_info, search);
+ if (eb) {
+ free_extent_buffer(eb);
+ return;
+ }
}
target = search;
@@ -2245,7 +1305,8 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
if (nr == 0)
break;
nr--;
- } else if (path->reada == READA_FORWARD) {
+ } else if (path->reada == READA_FORWARD ||
+ path->reada == READA_FORWARD_ALWAYS) {
nr++;
if (nr >= nritems)
break;
@@ -2256,27 +1317,23 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
break;
}
search = btrfs_node_blockptr(node, nr);
- if ((search <= target && target - search <= 65536) ||
+ if (path->reada == READA_FORWARD_ALWAYS ||
+ (search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
- readahead_tree_block(fs_info, search);
+ btrfs_readahead_node_child(node, nr);
nread += blocksize;
}
nscan++;
- if ((nread > 65536 || nscan > 32))
+ if (nread > nread_max || nscan > 32)
break;
}
}
-static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
+ struct extent_buffer *parent;
int slot;
int nritems;
- struct extent_buffer *parent;
- struct extent_buffer *eb;
- u64 gen;
- u64 block1 = 0;
- u64 block2 = 0;
parent = path->nodes[level + 1];
if (!parent)
@@ -2285,32 +1342,10 @@ static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
- if (slot > 0) {
- block1 = btrfs_node_blockptr(parent, slot - 1);
- gen = btrfs_node_ptr_generation(parent, slot - 1);
- eb = find_extent_buffer(fs_info, block1);
- /*
- * if we get -eagain from btrfs_buffer_uptodate, we
- * don't want to return eagain here. That will loop
- * forever
- */
- if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
- block1 = 0;
- free_extent_buffer(eb);
- }
- if (slot + 1 < nritems) {
- block2 = btrfs_node_blockptr(parent, slot + 1);
- gen = btrfs_node_ptr_generation(parent, slot + 1);
- eb = find_extent_buffer(fs_info, block2);
- if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
- block2 = 0;
- free_extent_buffer(eb);
- }
-
- if (block1)
- readahead_tree_block(fs_info, block1);
- if (block2)
- readahead_tree_block(fs_info, block2);
+ if (slot > 0)
+ btrfs_readahead_node_child(parent, slot - 1);
+ if (slot + 1 < nritems)
+ btrfs_readahead_node_child(parent, slot + 1);
}
@@ -2333,33 +1368,34 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
{
int i;
int skip_level = level;
- int no_skips = 0;
- struct extent_buffer *t;
+ bool check_skip = true;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
if (!path->nodes[i])
break;
if (!path->locks[i])
break;
- if (!no_skips && path->slots[i] == 0) {
- skip_level = i + 1;
- continue;
- }
- if (!no_skips && path->keep_locks) {
- u32 nritems;
- t = path->nodes[i];
- nritems = btrfs_header_nritems(t);
- if (nritems < 1 || path->slots[i] >= nritems - 1) {
+
+ if (check_skip) {
+ if (path->slots[i] == 0) {
skip_level = i + 1;
continue;
}
+
+ if (path->keep_locks) {
+ u32 nritems;
+
+ nritems = btrfs_header_nritems(path->nodes[i]);
+ if (nritems < 1 || path->slots[i] >= nritems - 1) {
+ skip_level = i + 1;
+ continue;
+ }
+ }
}
- if (skip_level < i && i >= lowest_unlock)
- no_skips = 1;
- t = path->nodes[i];
if (i >= lowest_unlock && i > skip_level) {
- btrfs_tree_unlock_rw(t, path->locks[i]);
+ check_skip = false;
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
path->locks[i] = 0;
if (write_lock_level &&
i > min_write_lock_level &&
@@ -2371,12 +1407,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
/*
- * helper function for btrfs_search_slot. The goal is to find a block
- * in cache without setting the path to blocking. If we find the block
- * we return zero and the path is unchanged.
+ * Helper function for btrfs_search_slot() and other functions that do a search
+ * on a btree. The goal is to find a tree block in the cache (the radix tree at
+ * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
+ * its pages from disk.
*
- * If we can't find the block, we set the path blocking and do some
- * reada. -EAGAIN is returned and the search must be repeated.
+ * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
+ * whole btree search, starting again from the current root node.
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
@@ -2386,19 +1423,30 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
struct btrfs_fs_info *fs_info = root->fs_info;
u64 blocknr;
u64 gen;
- struct extent_buffer *b = *eb_ret;
struct extent_buffer *tmp;
struct btrfs_key first_key;
int ret;
int parent_level;
+ bool unlock_up;
- blocknr = btrfs_node_blockptr(b, slot);
- gen = btrfs_node_ptr_generation(b, slot);
- parent_level = btrfs_header_level(b);
- btrfs_node_key_to_cpu(b, &first_key, slot);
+ unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
+ blocknr = btrfs_node_blockptr(*eb_ret, slot);
+ gen = btrfs_node_ptr_generation(*eb_ret, slot);
+ parent_level = btrfs_header_level(*eb_ret);
+ btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
+ /*
+ * If we need to read an extent buffer from disk and we are holding locks
+ * on upper level nodes, we unlock all the upper nodes before reading the
+ * extent buffer, and then return -EAGAIN to the caller as it needs to
+ * restart the search. We don't release the lock on the current level
+ * because we need to walk this node to figure out which blocks to read.
+ */
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
+ if (p->reada == READA_FORWARD_ALWAYS)
+ reada_for_search(fs_info, p, level, slot, key->objectid);
+
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
/*
@@ -2415,56 +1463,68 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
- /* the pages were up to date, but we failed
- * the generation number check. Do a full
- * read for the generation number that is correct.
- * We must do this without dropping locks so
- * we can trust our generation number
- */
- btrfs_set_path_blocking(p);
+ if (p->nowait) {
+ free_extent_buffer(tmp);
+ return -EAGAIN;
+ }
+
+ if (unlock_up)
+ btrfs_unlock_up_safe(p, level + 1);
/* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
- if (!ret) {
- *eb_ret = tmp;
- return 0;
+ ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
+ if (ret) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
}
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EIO;
+ if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EUCLEAN;
+ }
+
+ if (unlock_up)
+ ret = -EAGAIN;
+
+ goto out;
+ } else if (p->nowait) {
+ return -EAGAIN;
}
- /*
- * reduce lock contention at high levels
- * of the btree by dropping locks before
- * we read. Don't release the lock on the current
- * level because we need to walk this node to figure
- * out which blocks to read.
- */
- btrfs_unlock_up_safe(p, level + 1);
- btrfs_set_path_blocking(p);
+ if (unlock_up) {
+ btrfs_unlock_up_safe(p, level + 1);
+ ret = -EAGAIN;
+ } else {
+ ret = 0;
+ }
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
- ret = -EAGAIN;
- tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
- &first_key);
- if (!IS_ERR(tmp)) {
- /*
- * If the read above didn't mark this buffer up to date,
- * it will never end up being up to date. Set ret to EIO now
- * and give up so that our caller doesn't loop forever
- * on our EAGAINs.
- */
- if (!extent_buffer_uptodate(tmp))
- ret = -EIO;
- free_extent_buffer(tmp);
+ tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
+ gen, parent_level - 1, &first_key);
+ if (IS_ERR(tmp)) {
+ btrfs_release_path(p);
+ return PTR_ERR(tmp);
+ }
+ /*
+ * If the read above didn't mark this buffer up to date,
+ * it will never end up being up to date. Set ret to EIO now
+ * and give up so that our caller doesn't loop forever
+ * on our EAGAINs.
+ */
+ if (!extent_buffer_uptodate(tmp))
+ ret = -EIO;
+
+out:
+ if (ret == 0) {
+ *eb_ret = tmp;
} else {
- ret = PTR_ERR(tmp);
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
}
- btrfs_release_path(p);
return ret;
}
@@ -2484,74 +1544,45 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
int *write_lock_level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ int ret = 0;
if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
- int sret;
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
- btrfs_set_path_blocking(p);
- reada_for_balance(fs_info, p, level);
- sret = split_node(trans, root, p, level);
+ reada_for_balance(p, level);
+ ret = split_node(trans, root, p, level);
- BUG_ON(sret > 0);
- if (sret) {
- ret = sret;
- goto done;
- }
b = p->nodes[level];
} else if (ins_len < 0 && btrfs_header_nritems(b) <
BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
- int sret;
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
- btrfs_set_path_blocking(p);
- reada_for_balance(fs_info, p, level);
- sret = balance_level(trans, root, p, level);
+ reada_for_balance(p, level);
+ ret = balance_level(trans, root, p, level);
+ if (ret)
+ return ret;
- if (sret) {
- ret = sret;
- goto done;
- }
b = p->nodes[level];
if (!b) {
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
BUG_ON(btrfs_header_nritems(b) == 1);
}
- return 0;
-
-again:
- ret = -EAGAIN;
-done:
return ret;
}
-static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
- int level, int *prev_cmp, int *slot)
-{
- if (*prev_cmp != 0) {
- *prev_cmp = btrfs_bin_search(b, key, level, slot);
- return *prev_cmp;
- }
-
- *slot = 0;
-
- return 0;
-}
-
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
u64 iobjectid, u64 ioff, u8 key_type,
struct btrfs_key *found_key)
@@ -2591,35 +1622,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
struct btrfs_path *p,
int write_lock_level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
- int root_lock;
+ int root_lock = 0;
int level = 0;
- /* We try very hard to do read locks on the root */
- root_lock = BTRFS_READ_LOCK;
-
if (p->search_commit_root) {
- /*
- * The commit roots are read only so we always do read locks,
- * and we always must hold the commit_root_sem when doing
- * searches on them, the only exception is send where we don't
- * want to block transaction commits for a long time, so
- * we need to clone the commit root in order to avoid races
- * with transaction commits that create a snapshot of one of
- * the roots used by a send operation.
- */
- if (p->need_commit_sem) {
- down_read(&fs_info->commit_root_sem);
- b = btrfs_clone_extent_buffer(root->commit_root);
- up_read(&fs_info->commit_root_sem);
- if (!b)
- return ERR_PTR(-ENOMEM);
-
- } else {
- b = root->commit_root;
- atomic_inc(&b->refs);
- }
+ b = root->commit_root;
+ atomic_inc(&b->refs);
level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
@@ -2636,6 +1645,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
goto out;
}
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
/*
* If the level is set to maximum, we can skip trying to get the read
* lock.
@@ -2645,7 +1657,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ if (p->nowait) {
+ b = btrfs_try_read_lock_root_node(root);
+ if (IS_ERR(b))
+ return b;
+ } else {
+ b = btrfs_read_lock_root_node(root);
+ }
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -2662,6 +1680,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
level = btrfs_header_level(b);
out:
+ /*
+ * The root may have failed to write out at some point, and thus is no
+ * longer valid, return an error in this case.
+ */
+ if (!extent_buffer_uptodate(b)) {
+ if (root_lock)
+ btrfs_tree_unlock_rw(b, root_lock);
+ free_extent_buffer(b);
+ return ERR_PTR(-EIO);
+ }
+
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
@@ -2671,6 +1700,191 @@ out:
return b;
}
+/*
+ * Replace the extent buffer at the lowest level of the path with a cloned
+ * version. The purpose is to be able to use it safely, after releasing the
+ * commit root semaphore, even if relocation is happening in parallel, the
+ * transaction used for relocation is committed and the extent buffer is
+ * reallocated in the next transaction.
+ *
+ * This is used in a context where the caller does not prevent transaction
+ * commits from happening, either by holding a transaction handle or holding
+ * some lock, while it's doing searches through a commit root.
+ * At the moment it's only used for send operations.
+ */
+static int finish_need_commit_sem_search(struct btrfs_path *path)
+{
+ const int i = path->lowest_level;
+ const int slot = path->slots[i];
+ struct extent_buffer *lowest = path->nodes[i];
+ struct extent_buffer *clone;
+
+ ASSERT(path->need_commit_sem);
+
+ if (!lowest)
+ return 0;
+
+ lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
+
+ clone = btrfs_clone_extent_buffer(lowest);
+ if (!clone)
+ return -ENOMEM;
+
+ btrfs_release_path(path);
+ path->nodes[i] = clone;
+ path->slots[i] = slot;
+
+ return 0;
+}
+
+static inline int search_for_key_slot(struct extent_buffer *eb,
+ int search_low_slot,
+ const struct btrfs_key *key,
+ int prev_cmp,
+ int *slot)
+{
+ /*
+ * If a previous call to btrfs_bin_search() on a parent node returned an
+ * exact match (prev_cmp == 0), we can safely assume the target key will
+ * always be at slot 0 on lower levels, since each key pointer
+ * (struct btrfs_key_ptr) refers to the lowest key accessible from the
+ * subtree it points to. Thus we can skip searching lower levels.
+ */
+ if (prev_cmp == 0) {
+ *slot = 0;
+ return 0;
+ }
+
+ return generic_bin_search(eb, search_low_slot, key, slot);
+}
+
+static int search_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct btrfs_key *key,
+ struct btrfs_path *path,
+ int ins_len,
+ int prev_cmp)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ int leaf_free_space = -1;
+ int search_low_slot = 0;
+ int ret;
+ bool do_bin_search = true;
+
+ /*
+ * If we are doing an insertion, the leaf has enough free space and the
+ * destination slot for the key is not slot 0, then we can unlock our
+ * write lock on the parent, and any other upper nodes, before doing the
+ * binary search on the leaf (with search_for_key_slot()), allowing other
+ * tasks to lock the parent and any other upper nodes.
+ */
+ if (ins_len > 0) {
+ /*
+ * Cache the leaf free space, since we will need it later and it
+ * will not change until then.
+ */
+ leaf_free_space = btrfs_leaf_free_space(leaf);
+
+ /*
+ * !path->locks[1] means we have a single node tree, the leaf is
+ * the root of the tree.
+ */
+ if (path->locks[1] && leaf_free_space >= ins_len) {
+ struct btrfs_disk_key first_key;
+
+ ASSERT(btrfs_header_nritems(leaf) > 0);
+ btrfs_item_key(leaf, &first_key, 0);
+
+ /*
+ * Doing the extra comparison with the first key is cheap,
+ * taking into account that the first key is very likely
+ * already in a cache line because it immediately follows
+ * the extent buffer's header and we have recently accessed
+ * the header's level field.
+ */
+ ret = comp_keys(&first_key, key);
+ if (ret < 0) {
+ /*
+ * The first key is smaller than the key we want
+ * to insert, so we are safe to unlock all upper
+ * nodes and we have to do the binary search.
+ *
+ * We do use btrfs_unlock_up_safe() and not
+ * unlock_up() because the latter does not unlock
+ * nodes with a slot of 0 - we can safely unlock
+ * any node even if its slot is 0 since in this
+ * case the key does not end up at slot 0 of the
+ * leaf and there's no need to split the leaf.
+ */
+ btrfs_unlock_up_safe(path, 1);
+ search_low_slot = 1;
+ } else {
+ /*
+ * The first key is >= than the key we want to
+ * insert, so we can skip the binary search as
+ * the target key will be at slot 0.
+ *
+ * We can not unlock upper nodes when the key is
+ * less than the first key, because we will need
+ * to update the key at slot 0 of the parent node
+ * and possibly of other upper nodes too.
+ * If the key matches the first key, then we can
+ * unlock all the upper nodes, using
+ * btrfs_unlock_up_safe() instead of unlock_up()
+ * as stated above.
+ */
+ if (ret == 0)
+ btrfs_unlock_up_safe(path, 1);
+ /*
+ * ret is already 0 or 1, matching the result of
+ * a btrfs_bin_search() call, so there is no need
+ * to adjust it.
+ */
+ do_bin_search = false;
+ path->slots[0] = 0;
+ }
+ }
+ }
+
+ if (do_bin_search) {
+ ret = search_for_key_slot(leaf, search_low_slot, key,
+ prev_cmp, &path->slots[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (ins_len > 0) {
+ /*
+ * Item key already exists. In this case, if we are allowed to
+ * insert the item (for example, in dir_item case, item key
+ * collision is allowed), it will be merged with the original
+ * item. Only the item size grows, no new btrfs item will be
+ * added. If search_for_extension is not set, ins_len already
+ * accounts for the size of struct btrfs_item, so deduct it here
+ * to keep the leaf space check correct.
+ */
+ if (ret == 0 && !path->search_for_extension) {
+ ASSERT(ins_len >= sizeof(struct btrfs_item));
+ ins_len -= sizeof(struct btrfs_item);
+ }
+
+ ASSERT(leaf_free_space >= 0);
+
+ if (leaf_free_space < ins_len) {
+ int err;
+
+ err = split_leaf(trans, root, key, path, ins_len,
+ (ret == 0));
+ ASSERT(err <= 0);
+ if (WARN_ON(err > 0))
+ err = -EUCLEAN;
+ if (err)
+ ret = err;
+ }
+ }
+
+ return ret;
+}
/*
* btrfs_search_slot - look for a key in a tree and perform necessary
@@ -2680,8 +1894,14 @@ out:
* @p: Holds all btree nodes along the search path
* @root: The root node of the tree
* @key: The key we are looking for
- * @ins_len: Indicates purpose of search, for inserts it is 1, for
- * deletions it's -1. 0 for plain searches
+ * @ins_len: Indicates purpose of search:
+ * >0 for inserts, it's the size of the item inserted (*)
+ * <0 for deletions
+ * 0 for plain searches, not modifying the tree
+ *
+ * (*) If size of item inserted doesn't include
+ * sizeof(struct btrfs_item), then p->search_for_extension must
+ * be set.
* @cow: boolean should CoW operations be performed. Must always be 1
* when modifying the tree.
*
@@ -2701,6 +1921,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_path *p,
int ins_len, int cow)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
int slot;
int ret;
@@ -2718,6 +1939,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
WARN_ON(p->nodes[0] != NULL);
BUG_ON(!cow && ins_len);
+ /*
+ * For now only allow nowait for read only operations. There's no
+ * strict reason why we can't, we just only need it for reads so it's
+ * only implemented for reads.
+ */
+ ASSERT(!p->nowait || !cow);
+
if (ins_len < 0) {
lowest_unlock = 2;
@@ -2742,6 +1970,16 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
min_write_lock_level = write_lock_level;
+ if (p->need_commit_sem) {
+ ASSERT(p->search_commit_root);
+ if (p->nowait) {
+ if (!down_read_trylock(&fs_info->commit_root_sem))
+ return -EAGAIN;
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
+ }
+
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
@@ -2763,10 +2001,8 @@ again:
* then we don't want to set the path blocking,
* so we test it here
*/
- if (!should_cow_block(trans, root, b)) {
- trans->dirty = true;
+ if (!should_cow_block(trans, root, b))
goto cow_done;
- }
/*
* must have write locks on this node and the
@@ -2781,14 +2017,15 @@ again:
goto again;
}
- btrfs_set_path_blocking(p);
if (last_level)
err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b);
+ &b,
+ BTRFS_NESTING_COW);
else
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1], &b);
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
if (err) {
ret = err;
goto done;
@@ -2796,10 +2033,6 @@ again:
}
cow_done:
p->nodes[level] = b;
- /*
- * Leave path with blocking locks to avoid massive
- * lock context switch, this is made on purpose.
- */
/*
* we have a lock on b and as long as we aren't changing
@@ -2821,35 +2054,22 @@ cow_done:
}
}
- ret = key_search(b, key, level, &prev_cmp, &slot);
- if (ret < 0)
- goto done;
-
if (level == 0) {
- p->slots[level] = slot;
- if (ins_len > 0 &&
- btrfs_leaf_free_space(b) < ins_len) {
- if (write_lock_level < 1) {
- write_lock_level = 1;
- btrfs_release_path(p);
- goto again;
- }
+ if (ins_len > 0)
+ ASSERT(write_lock_level >= 1);
- btrfs_set_path_blocking(p);
- err = split_leaf(trans, root, key,
- p, ins_len, ret == 0);
-
- BUG_ON(err > 0);
- if (err) {
- ret = err;
- goto done;
- }
- }
+ ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
if (!p->search_for_split)
unlock_up(p, level, lowest_unlock,
min_write_lock_level, NULL);
goto done;
}
+
+ ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
+ if (ret < 0)
+ goto done;
+ prev_cmp = ret;
+
if (ret && slot > 0) {
dec = 1;
slot--;
@@ -2896,15 +2116,20 @@ cow_done:
if (!p->skip_locking) {
level = btrfs_header_level(b);
+
+ btrfs_maybe_reset_lockdep_class(root, b);
+
if (level <= write_lock_level) {
- if (!btrfs_try_tree_write_lock(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_lock(b);
- }
+ btrfs_tree_lock(b);
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
+ if (p->nowait) {
+ if (!btrfs_try_tree_read_lock(b)) {
+ free_extent_buffer(b);
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
btrfs_tree_read_lock(b);
}
p->locks[level] = BTRFS_READ_LOCK;
@@ -2914,16 +2139,21 @@ cow_done:
}
ret = 1;
done:
- /*
- * we don't really know what they plan on doing with the path
- * from here on, so for now just mark it as blocking
- */
- if (!p->leave_spinning)
- btrfs_set_path_blocking(p);
if (ret < 0 && !p->skip_release_on_error)
btrfs_release_path(p);
+
+ if (p->need_commit_sem) {
+ int ret2;
+
+ ret2 = finish_need_commit_sem_search(p);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
+
return ret;
}
+ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
/*
* Like btrfs_search_slot, this looks for a key in the given tree. It uses the
@@ -2947,10 +2177,10 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
- int prev_cmp = -1;
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
+ ASSERT(!p->nowait);
if (p->search_commit_root) {
BUG_ON(time_seq);
@@ -2958,7 +2188,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
}
again:
- b = get_old_root(root, time_seq);
+ b = btrfs_get_old_root(root, time_seq);
if (!b) {
ret = -EIO;
goto done;
@@ -2980,12 +2210,7 @@ again:
*/
btrfs_unlock_up_safe(p, level + 1);
- /*
- * Since we can unwind ebs we want to do a real search every
- * time.
- */
- prev_cmp = -1;
- ret = key_search(b, key, level, &prev_cmp, &slot);
+ ret = btrfs_bin_search(b, key, &slot);
if (ret < 0)
goto done;
@@ -3017,11 +2242,8 @@ again:
}
level = btrfs_header_level(b);
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
- b = tree_mod_log_rewind(fs_info, p, b, time_seq);
+ btrfs_tree_read_lock(b);
+ b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
ret = -ENOMEM;
goto done;
@@ -3031,8 +2253,6 @@ again:
}
ret = 1;
done:
- if (!p->leave_spinning)
- btrfs_set_path_blocking(p);
if (ret < 0)
btrfs_release_path(p);
@@ -3117,6 +2337,64 @@ again:
}
/*
+ * Look up @key in @root and, when there is no exact match, step backwards with
+ * btrfs_previous_item().
+ *
+ * The lookup is a plain btrfs_search_slot() call: no transaction handle,
+ * ins_len == 0 and cow == 0. When that call returns > 0,
+ * btrfs_previous_item() is called with the objectid and type of @key and its
+ * result becomes the return value.
+ *
+ * @root: tree to search
+ * @key: key to search for; when 0 is returned it is overwritten with the key
+ * of the item at path->slots[0] in path->nodes[0]
+ * @path: path used for the search; when 0 is returned it points at the item
+ * whose key was copied into @key
+ *
+ * Return 0 if found, 1 if not found and < 0 if error.
+ */
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ int ret;
+
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret > 0)
+ ret = btrfs_previous_item(root, path, key->objectid, key->type);
+
+ if (ret == 0)
+ btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
+
+ return ret;
+}
+
+/**
+ * Search for a valid slot for the given path.
+ *
+ * Starts from the leaf and slot that @path currently holds (path->nodes[0]
+ * and path->slots[0], both read without any check) and calls
+ * btrfs_next_leaf() for as long as the slot is not below the number of items
+ * in the leaf.
+ *
+ * @root: The root node of the tree.
+ * @key: Output. Set to the key of the item at the slot that was found;
+ * not written when a non-zero value is returned.
+ * @path: The starting point to validate the slot. It may be advanced to a
+ * following leaf by btrfs_next_leaf().
+ *
+ * Any non-zero result of btrfs_next_leaf() is returned unchanged.
+ *
+ * Return: 0 if the item is valid
+ * 1 if not found
+ * <0 if error.
+ */
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ while (1) {
+ int ret;
+ const int slot = path->slots[0];
+ const struct extent_buffer *leaf = path->nodes[0];
+
+ /*
+ * This is where we start walking the path. The slot and leaf
+ * are re-read from @path on every iteration.
+ */
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * The slot is past the last item of this leaf, so move
+ * the path to the next leaf and check again with the
+ * slot and leaf the path holds afterwards.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ return ret;
+ continue;
+ }
+ /* Store the key of the found, valid item in @key. */
+ btrfs_item_key_to_cpu(leaf, key, slot);
+ break;
+ }
+ return 0;
+}
+
+/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node is points to 'key'.
* This is used after shifting pointers to the left, so it stops
@@ -3137,8 +2415,8 @@ static void fixup_low_keys(struct btrfs_path *path,
if (!path->nodes[i])
break;
t = path->nodes[i];
- ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
- GFP_ATOMIC);
+ ret = btrfs_tree_mod_log_insert_key(t, tslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
@@ -3200,6 +2478,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
}
/*
+ * Check key order of two sibling extent buffers.
+ *
+ * @left: the left sibling; its last key is compared
+ * @right: the right sibling; its first key is compared
+ *
+ * Return true if something is wrong.
+ * Return false if everything is fine.
+ *
+ * Tree-checker only works inside one tree block, thus the following
+ * corruption can not be detected by tree-checker:
+ *
+ * Leaf @left | Leaf @right
+ * --------------------------------------------------------------
+ * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
+ *
+ * Key f6 in leaf @left itself is valid, but not valid when the next
+ * key in leaf @right is 7.
+ * This can only be checked at tree block merge time.
+ * And since tree checker has ensured all key order in each tree block
+ * is correct, we only need to check the last key of @left against the first
+ * key of @right.
+ *
+ * The last key of @left must be strictly smaller than the first key of @right;
+ * otherwise a critical message with both keys is logged and true is returned.
+ * Whether node keys or item keys are read is decided by the header level of
+ * @left only (presumably both blocks are at the same level - the level of
+ * @right is not checked here).
+ */
+static bool check_sibling_keys(struct extent_buffer *left,
+ struct extent_buffer *right)
+{
+ struct btrfs_key left_last;
+ struct btrfs_key right_first;
+ int level = btrfs_header_level(left);
+ int nr_left = btrfs_header_nritems(left);
+ int nr_right = btrfs_header_nritems(right);
+
+ /* At least one block is empty, so there is no pair of keys to compare */
+ if (!nr_left || !nr_right)
+ return false;
+
+ if (level) {
+ btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_node_key_to_cpu(right, &right_first, 0);
+ } else {
+ btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_item_key_to_cpu(right, &right_first, 0);
+ }
+
+ if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
+ btrfs_crit(left->fs_info,
+"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
+ left_last.objectid, left_last.type,
+ left_last.offset, right_first.objectid,
+ right_first.type, right_first.offset);
+ return true;
+ }
+ return false;
+}
+
+/*
* try to push data from one node into the next node left in the
* tree.
*
@@ -3243,7 +2573,13 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
- ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
+ /* dst is the left eb, src is the middle eb */
+ if (check_sibling_keys(dst, src)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3255,8 +2591,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
if (push_items < src_nritems) {
/*
- * Don't call tree_mod_log_insert_move here, key removal was
- * already fully logged by tree_mod_log_eb_copy above.
+ * Don't call btrfs_tree_mod_log_insert_move() here, key removal
+ * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
*/
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
@@ -3311,15 +2647,21 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
- ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
+ /* dst is the right eb, src is the middle eb */
+ if (check_sibling_keys(src, dst)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
- ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
- push_items);
+ ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
+ push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3366,8 +2708,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0);
+ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &lower_key, level, root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3384,7 +2727,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
- ret = tree_mod_log_insert_root(root->node, c, 0);
+ ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, c);
@@ -3394,7 +2737,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
atomic_inc(&c->refs);
path->nodes[level] = c;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
path->slots[level] = 0;
return 0;
}
@@ -3416,15 +2759,15 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
int ret;
BUG_ON(!path->nodes[level]);
- btrfs_assert_tree_locked(path->nodes[level]);
+ btrfs_assert_tree_write_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
if (slot != nritems) {
if (level) {
- ret = tree_mod_log_insert_move(lower, slot + 1, slot,
- nritems - slot);
+ ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
+ slot, nritems - slot);
BUG_ON(ret < 0);
}
memmove_extent_buffer(lower,
@@ -3433,8 +2776,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
if (level) {
- ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
- GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(lower, slot,
+ BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
BUG_ON(ret < 0);
}
btrfs_set_node_key(lower, key, slot);
@@ -3475,9 +2818,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
* tree mod log: We don't log_removal old root in
* insert_new_root, because that root buffer will be kept as a
* normal node. We are going to log removal of half of the
- * elements below with tree_mod_log_eb_copy. We're holding a
- * tree lock on the buffer, which is why we cannot race with
- * other tree_mod_log users.
+ * elements below with btrfs_tree_mod_log_eb_copy(). We're
+ * holding a tree lock on the buffer, which is why we cannot
+ * race with other tree_mod_log users.
*/
ret = insert_new_root(trans, root, path, level + 1);
if (ret)
@@ -3496,15 +2839,16 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0);
+ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, level, c->start, 0,
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
root_add_used(root, fs_info->nodesize);
ASSERT(btrfs_header_level(c) == level);
- ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
+ ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3515,7 +2859,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(split, c_nritems - mid);
btrfs_set_header_nritems(c, mid);
- ret = 0;
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
@@ -3533,7 +2876,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(split);
free_extent_buffer(split);
}
- return ret;
+ return 0;
}
/*
@@ -3543,21 +2886,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
*/
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
- struct btrfs_item *start_item;
- struct btrfs_item *end_item;
- struct btrfs_map_token token;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
- btrfs_init_map_token(&token, l);
- start_item = btrfs_item_nr(start);
- end_item = btrfs_item_nr(end);
- data_len = btrfs_token_item_offset(l, start_item, &token) +
- btrfs_token_item_size(l, start_item, &token);
- data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
+ data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
+ data_len = data_len - btrfs_item_offset(l, end);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
@@ -3604,7 +2940,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
u32 i;
int push_space = 0;
int push_items = 0;
- struct btrfs_item *item;
u32 nr;
u32 right_nritems;
u32 data_end;
@@ -3621,8 +2956,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) {
- item = btrfs_item_nr(i);
-
if (!empty && push_items > 0) {
if (path->slots[0] > i)
break;
@@ -3637,12 +2970,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
if (path->slots[0] == i)
push_space += data_size;
- this_item_size = btrfs_item_size(left, item);
- if (this_item_size + sizeof(*item) + push_space > free_space)
+ this_item_size = btrfs_item_size(left, i);
+ if (this_item_size + sizeof(struct btrfs_item) +
+ push_space > free_space)
break;
push_items++;
- push_space += this_item_size + sizeof(*item);
+ push_space += this_item_size + sizeof(struct btrfs_item);
if (i == 0)
break;
i--;
@@ -3656,7 +2990,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
/* push left to right */
right_nritems = btrfs_header_nritems(right);
- push_space = btrfs_item_end_nr(left, left_nritems - push_items);
+ push_space = btrfs_item_data_end(left, left_nritems - push_items);
push_space -= leaf_data_end(left);
/* make room in the right data area */
@@ -3687,9 +3021,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(i);
- push_space -= btrfs_token_item_size(right, item, &token);
- btrfs_set_token_item_offset(right, item, push_space, &token);
+ push_space -= btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
}
left_nritems -= push_items;
@@ -3758,7 +3091,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (slot >= btrfs_header_nritems(upper) - 1)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
right = btrfs_read_node_slot(upper, slot + 1);
/*
@@ -3768,27 +3101,27 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(right))
return 1;
- btrfs_tree_lock(right);
- btrfs_set_lock_blocking_write(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
free_space = btrfs_leaf_free_space(right);
if (free_space < data_size)
goto out_unlock;
- /* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right);
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
- free_space = btrfs_leaf_free_space(right);
- if (free_space < data_size)
- goto out_unlock;
-
left_nritems = btrfs_header_nritems(left);
if (left_nritems == 0)
goto out_unlock;
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
if (path->slots[0] == left_nritems && !empty) {
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
@@ -3829,7 +3162,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
int i;
int push_space = 0;
int push_items = 0;
- struct btrfs_item *item;
u32 old_left_nritems;
u32 nr;
int ret = 0;
@@ -3843,8 +3175,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
nr = min(right_nritems - 1, max_slot);
for (i = 0; i < nr; i++) {
- item = btrfs_item_nr(i);
-
if (!empty && push_items > 0) {
if (path->slots[0] < i)
break;
@@ -3859,12 +3189,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
if (path->slots[0] == i)
push_space += data_size;
- this_item_size = btrfs_item_size(right, item);
- if (this_item_size + sizeof(*item) + push_space > free_space)
+ this_item_size = btrfs_item_size(right, i);
+ if (this_item_size + sizeof(struct btrfs_item) + push_space >
+ free_space)
break;
push_items++;
- push_space += this_item_size + sizeof(*item);
+ push_space += this_item_size + sizeof(struct btrfs_item);
}
if (push_items == 0) {
@@ -3880,27 +3211,24 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
push_items * sizeof(struct btrfs_item));
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
- btrfs_item_offset_nr(right, push_items - 1);
+ btrfs_item_offset(right, push_items - 1);
copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(left) - push_space,
BTRFS_LEAF_DATA_OFFSET +
- btrfs_item_offset_nr(right, push_items - 1),
+ btrfs_item_offset(right, push_items - 1),
push_space);
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
btrfs_init_map_token(&token, left);
- old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
+ old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
-
- ioff = btrfs_token_item_offset(left, item, &token);
- btrfs_set_token_item_offset(left, item,
- ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
- &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
+ ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -3910,7 +3238,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
right_nritems);
if (push_items < right_nritems) {
- push_space = btrfs_item_offset_nr(right, push_items - 1) -
+ push_space = btrfs_item_offset(right, push_items - 1) -
leaf_data_end(right);
memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
@@ -3928,11 +3256,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(i);
-
- push_space = push_space - btrfs_token_item_size(right,
- item, &token);
- btrfs_set_token_item_offset(right, item, push_space, &token);
+ push_space = push_space - btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
}
btrfs_mark_buffer_dirty(left);
@@ -3993,7 +3318,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (right_nritems == 0)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
left = btrfs_read_node_slot(path->nodes[1], slot - 1);
/*
@@ -4003,8 +3328,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(left))
return 1;
- btrfs_tree_lock(left);
- btrfs_set_lock_blocking_write(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
free_space = btrfs_leaf_free_space(left);
if (free_space < data_size) {
@@ -4012,9 +3336,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- /* cow and double check */
ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left);
+ path->nodes[1], slot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
if (ret == -ENOSPC)
@@ -4022,12 +3346,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- free_space = btrfs_leaf_free_space(left);
- if (free_space < data_size) {
- ret = 1;
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
goto out;
}
-
return __push_leaf_left(path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
@@ -4056,7 +3378,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
+ data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(mid),
@@ -4067,16 +3389,14 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
data_copy_size, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(l), data_copy_size);
- rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
btrfs_init_map_token(&token, right);
for (i = 0; i < nritems; i++) {
- struct btrfs_item *item = btrfs_item_nr(i);
u32 ioff;
- ioff = btrfs_token_item_offset(right, item, &token);
- btrfs_set_token_item_offset(right, item,
- ioff + rt_data_off, &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
@@ -4192,7 +3512,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
l = path->nodes[0];
slot = path->slots[0];
- if (extend && data_size + btrfs_item_size_nr(l, slot) +
+ if (extend && data_size + btrfs_item_size(l, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
return -EOVERFLOW;
@@ -4277,8 +3597,18 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
- right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0);
+ /*
+ * We have to abuse BTRFS_NESTING_NEW_ROOT here if we've done a double
+ * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
+ * subclasses, which is 8 at the time of this patch, and we've maxed it
+ * out. In the future we could add a
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
+ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, 0, l->start, 0,
+ num_doubles ? BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -4351,7 +3681,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
if (btrfs_leaf_free_space(leaf) >= ins_len)
return 0;
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (key.type == BTRFS_EXTENT_DATA_KEY) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -4371,7 +3701,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
leaf = path->nodes[0];
/* if our item isn't there, return now */
- if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+ if (item_size != btrfs_item_size(leaf, path->slots[0]))
goto err;
/* the leaf has changed, it now has room. return now */
@@ -4385,7 +3715,6 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
goto err;
}
- btrfs_set_path_blocking(path);
ret = split_leaf(trans, root, &key, path, ins_len, 1);
if (ret)
goto err;
@@ -4403,9 +3732,7 @@ static noinline int split_item(struct btrfs_path *path,
unsigned long split_offset)
{
struct extent_buffer *leaf;
- struct btrfs_item *item;
- struct btrfs_item *new_item;
- int slot;
+ int orig_slot, slot;
char *buf;
u32 nritems;
u32 item_size;
@@ -4415,11 +3742,9 @@ static noinline int split_item(struct btrfs_path *path,
leaf = path->nodes[0];
BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
- btrfs_set_path_blocking(path);
-
- item = btrfs_item_nr(path->slots[0]);
- orig_offset = btrfs_item_offset(leaf, item);
- item_size = btrfs_item_size(leaf, item);
+ orig_slot = path->slots[0];
+ orig_offset = btrfs_item_offset(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
buf = kmalloc(item_size, GFP_NOFS);
if (!buf)
@@ -4440,14 +3765,12 @@ static noinline int split_item(struct btrfs_path *path,
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(leaf, &disk_key, slot);
- new_item = btrfs_item_nr(slot);
+ btrfs_set_item_offset(leaf, slot, orig_offset);
+ btrfs_set_item_size(leaf, slot, item_size - split_offset);
- btrfs_set_item_offset(leaf, new_item, orig_offset);
- btrfs_set_item_size(leaf, new_item, item_size - split_offset);
-
- btrfs_set_item_offset(leaf, item,
- orig_offset + item_size - split_offset);
- btrfs_set_item_size(leaf, item, split_offset);
+ btrfs_set_item_offset(leaf, orig_slot,
+ orig_offset + item_size - split_offset);
+ btrfs_set_item_size(leaf, orig_slot, split_offset);
btrfs_set_header_nritems(leaf, nritems + 1);
@@ -4499,42 +3822,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
}
/*
- * This function duplicate a item, giving 'new_key' to the new item.
- * It guarantees both items live in the same tree leaf and the new item
- * is contiguous with the original item.
- *
- * This allows us to split file extent in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct btrfs_key *new_key)
-{
- struct extent_buffer *leaf;
- int ret;
- u32 item_size;
-
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- ret = setup_leaf_for_split(trans, root, path,
- item_size + sizeof(struct btrfs_item));
- if (ret)
- return ret;
-
- path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
- leaf = path->nodes[0];
- memcpy_extent_buffer(leaf,
- btrfs_item_ptr_offset(leaf, path->slots[0]),
- btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
- item_size);
- return 0;
-}
-
-/*
* make the item pointed to by the path smaller. new_size indicates
* how small to make it, and from_end tells us if we just chop bytes
* off the end of the item or if we shift the item to chop bytes off
@@ -4544,7 +3831,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
- struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data_start;
@@ -4556,14 +3842,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
leaf = path->nodes[0];
slot = path->slots[0];
- old_size = btrfs_item_size_nr(leaf, slot);
+ old_size = btrfs_item_size(leaf, slot);
if (old_size == new_size)
return;
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
- old_data_start = btrfs_item_offset_nr(leaf, slot);
+ old_data_start = btrfs_item_offset(leaf, slot);
size_diff = old_size - new_size;
@@ -4577,11 +3863,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff + size_diff, &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + size_diff);
}
/* shift the data */
@@ -4624,8 +3908,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
fixup_low_keys(path, &disk_key, 1);
}
- item = btrfs_item_nr(slot);
- btrfs_set_item_size(leaf, item, new_size);
+ btrfs_set_item_size(leaf, slot, new_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -4641,7 +3924,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
- struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data;
@@ -4659,7 +3941,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
BUG();
}
slot = path->slots[0];
- old_data = btrfs_item_end_nr(leaf, slot);
+ old_data = btrfs_item_data_end(leaf, slot);
BUG_ON(slot < 0);
if (slot >= nritems) {
@@ -4676,11 +3958,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff - data_size, &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff - data_size);
}
/* shift the data */
@@ -4689,9 +3969,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
data_end, old_data - data_end);
data_end = old_data;
- old_size = btrfs_item_size_nr(leaf, slot);
- item = btrfs_item_nr(slot);
- btrfs_set_item_size(leaf, item, old_size + data_size);
+ old_size = btrfs_item_size(leaf, slot);
+ btrfs_set_item_size(leaf, slot, old_size + data_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -4700,17 +3979,19 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
}
-/*
- * this is a helper for btrfs_insert_empty_items, the main goal here is
- * to save stack depth by doing the bulk of the work in a function
- * that doesn't call btrfs_search_slot
+/**
+ * setup_items_for_insert - Helper called before inserting one or more items
+ * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
+ * in a function that doesn't call btrfs_search_slot
+ *
+ * @root: root we are inserting items to
+ * @path: points to the leaf/slot where we are going to insert new items
+ * @batch: information about the batch of items to insert
*/
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_item *item;
int i;
u32 nritems;
unsigned int data_end;
@@ -4718,9 +3999,15 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
struct extent_buffer *leaf;
int slot;
struct btrfs_map_token token;
+ u32 total_size;
+ /*
+ * Before anything else, update keys in the parent and other ancestors
+ * if needed, then release the write locks on them, so that other tasks
+ * can use them while we modify the leaf.
+ */
if (path->slots[0] == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
fixup_low_keys(path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -4730,6 +4017,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
if (btrfs_leaf_free_space(leaf) < total_size) {
btrfs_print_leaf(leaf);
@@ -4740,11 +4028,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
btrfs_init_map_token(&token, leaf);
if (slot != nritems) {
- unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+ unsigned int old_data = btrfs_item_data_end(leaf, slot);
if (old_data < data_end) {
btrfs_print_leaf(leaf);
- btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
+ btrfs_crit(fs_info,
+ "item at slot %d with data offset %u beyond data end of leaf %u",
slot, old_data, data_end);
BUG();
}
@@ -4755,35 +4044,33 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff - total_data, &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
+ ioff - batch->total_data_size);
}
/* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
btrfs_item_nr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_item));
/* shift the data */
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data - data_end);
+ data_end - batch->total_data_size,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
+ old_data - data_end);
data_end = old_data;
}
/* setup the item for the new data */
- for (i = 0; i < nr; i++) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ for (i = 0; i < batch->nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
- item = btrfs_item_nr(slot + i);
- btrfs_set_token_item_offset(leaf, item,
- data_end - data_size[i], &token);
- data_end -= data_size[i];
- btrfs_set_token_item_size(leaf, item, data_size[i], &token);
+ data_end -= batch->data_sizes[i];
+ btrfs_set_token_item_offset(&token, slot + i, data_end);
+ btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
}
- btrfs_set_header_nritems(leaf, nritems + nr);
+ btrfs_set_header_nritems(leaf, nritems + batch->nr);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -4793,26 +4080,43 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
}
/*
+ * Insert a new item into a leaf.
+ *
+ * @root: The root of the btree.
+ * @path: A path pointing to the target leaf and slot.
+ * @key: The key of the new item.
+ * @data_size: The size of the data associated with the new key.
+ */
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size)
+{
+ struct btrfs_item_batch batch; /* one-item batch built on the stack */
+
+ batch.keys = key; /* consumed as a one-element key array */
+ batch.data_sizes = &data_size; /* one-element array; points at the by-value parameter */
+ batch.total_data_size = data_size; /* only one item, so the total equals its size */
+ batch.nr = 1;
+
+ setup_items_for_insert(root, path, &batch);
+}
+
+/*
* Given a key and some data, insert items into the tree.
* This does all the path init required, making room in the tree if needed.
*/
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+ const struct btrfs_item_batch *batch)
{
int ret = 0;
int slot;
- int i;
- u32 total_size = 0;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
+ u32 total_size;
- total_size = total_data + (nr * sizeof(struct btrfs_item));
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
if (ret == 0)
return -EEXIST;
if (ret < 0)
@@ -4821,8 +4125,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size,
- total_data, total_size, nr);
+ setup_items_for_insert(root, path, batch);
return 0;
}
@@ -4854,6 +4157,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item is
+ * contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the leaf
+ * the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key)
+{
+ struct extent_buffer *leaf;
+ int ret;
+ u32 item_size;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]); /* data size of the item being duplicated */
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item)); /* size of the copy's data plus its item header */
+ if (ret)
+ return ret; /* nothing was inserted; pass the helper's result through */
+
+ path->slots[0]++; /* the copy goes in the slot right after the original */
+ btrfs_setup_item_for_insert(root, path, new_key, item_size);
+ leaf = path->nodes[0]; /* re-read; NOTE(review): presumably setup_leaf_for_split() can switch the path to another leaf - confirm */
+ memcpy_extent_buffer(leaf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]), /* destination: the new item */
+ btrfs_item_ptr_offset(leaf, path->slots[0] - 1), /* source: the original item */
+ item_size);
+ return 0;
+}
+
+/*
* delete the pointer from a given node.
*
* the tree should have been previously balanced so the deletion does not
@@ -4869,8 +4206,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
if (level) {
- ret = tree_mod_log_insert_move(parent, slot, slot + 1,
- nritems - slot - 1);
+ ret = btrfs_tree_mod_log_insert_move(parent, slot,
+ slot + 1, nritems - slot - 1);
BUG_ON(ret < 0);
}
memmove_extent_buffer(parent,
@@ -4879,8 +4216,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
} else if (level) {
- ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
- GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, slot,
+ BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
BUG_ON(ret < 0);
}
@@ -4926,7 +4263,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
atomic_inc(&leaf->refs);
- btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
/*
@@ -4938,25 +4275,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
- struct btrfs_item *item;
- u32 last_off;
- u32 dsize = 0;
int ret = 0;
int wret;
- int i;
u32 nritems;
leaf = path->nodes[0];
- last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
-
- for (i = 0; i < nr; i++)
- dsize += btrfs_item_size_nr(leaf, slot + i);
-
nritems = btrfs_header_nritems(leaf);
if (slot + nr != nritems) {
- int data_end = leaf_data_end(leaf);
+ const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
+ const int data_end = leaf_data_end(leaf);
struct btrfs_map_token token;
+ u32 dsize = 0;
+ int i;
+
+ for (i = 0; i < nr; i++)
+ dsize += btrfs_item_size(leaf, slot + i);
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + dsize,
@@ -4967,10 +4301,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff + dsize, &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + dsize);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -4986,7 +4318,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (leaf == root->node) {
btrfs_set_header_level(leaf, 0);
} else {
- btrfs_set_path_blocking(path);
btrfs_clean_tree_block(leaf);
btrfs_del_leaf(trans, root, path, leaf);
}
@@ -4999,25 +4330,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
fixup_low_keys(path, &disk_key, 1);
}
- /* delete the leaf if it is mostly empty */
+ /*
+ * Try to delete the leaf if it is mostly empty. We do this by
+ * trying to move all its items into its left and right neighbours.
+ * If we can't move all the items, then we don't delete it - it's
+ * not ideal, but future insertions might fill the leaf with more
+ * items, or items from other leaves might be moved later into our
+ * leaf due to deletions on those leaves.
+ */
if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
+ u32 min_push_space;
+
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
*/
slot = path->slots[1];
atomic_inc(&leaf->refs);
-
- btrfs_set_path_blocking(path);
- wret = push_leaf_left(trans, root, path, 1, 1,
- 1, (u32)-1);
+ /*
+ * We want to be able to at least push one item to the
+ * left neighbour leaf, and that's the first item.
+ */
+ min_push_space = sizeof(struct btrfs_item) +
+ btrfs_item_size(leaf, 0);
+ wret = push_leaf_left(trans, root, path, 0,
+ min_push_space, 1, (u32)-1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (path->nodes[0] == leaf &&
btrfs_header_nritems(leaf)) {
- wret = push_leaf_right(trans, root, path, 1,
- 1, 1, 0);
+ /*
+ * If we were not able to push all items from our
+ * leaf to its left neighbour, then attempt to
+ * either push all the remaining items to the
+ * right neighbour or none. There's no advantage
+ * in pushing only some items, instead of all, as
+ * it's pointless to end up with a leaf having
+ * too few items while the neighbours can be full
+ * or nearly full.
+ */
+ nritems = btrfs_header_nritems(leaf);
+ min_push_space = leaf_space_used(leaf, 0, nritems);
+ wret = push_leaf_right(trans, root, path, 0,
+ min_push_space, 1, 0);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
}
@@ -5126,6 +4482,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int ret = 1;
int keep_locks = path->keep_locks;
+ ASSERT(!path->nowait);
path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
@@ -5141,7 +4498,7 @@ again:
while (1) {
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
- sret = btrfs_bin_search(cur, min_key, level, &slot);
+ sret = btrfs_bin_search(cur, min_key, &slot);
if (sret < 0) {
ret = sret;
goto out;
@@ -5160,7 +4517,7 @@ again:
slot--;
/*
* check this node pointer against the min_trans parameters.
- * If it is too old, old, skip to the next one.
+ * If it is too old, skip to the next one.
*/
while (slot < nritems) {
u64 gen;
@@ -5179,7 +4536,6 @@ find_next_key:
*/
if (slot >= nritems) {
path->slots[level] = slot;
- btrfs_set_path_blocking(path);
sret = btrfs_find_next_key(root, path, min_key, level,
min_trans);
if (sret == 0) {
@@ -5196,7 +4552,6 @@ find_next_key:
ret = 0;
goto out;
}
- btrfs_set_path_blocking(path);
cur = btrfs_read_node_slot(cur, slot);
if (IS_ERR(cur)) {
ret = PTR_ERR(cur);
@@ -5213,7 +4568,6 @@ out:
path->keep_locks = keep_locks;
if (ret == 0) {
btrfs_unlock_up_safe(path, path->lowest_level + 1);
- btrfs_set_path_blocking(path);
memcpy(min_key, &found_key, sizeof(found_key));
}
return ret;
@@ -5295,16 +4649,6 @@ next:
return 1;
}
-/*
- * search the tree again to find a leaf with greater keys
- * returns 0 if it found something or 1 if there are no greater leaves.
- * returns < 0 on io errors.
- */
-int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
-{
- return btrfs_next_old_leaf(root, path, 0);
-}
-
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq)
{
@@ -5312,11 +4656,14 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int level;
struct extent_buffer *c;
struct extent_buffer *next;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
+ bool need_commit_sem = false;
u32 nritems;
int ret;
- int old_spinning = path->leave_spinning;
- int next_rw_lock = 0;
+ int i;
+
+ ASSERT(!path->nowait);
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
@@ -5326,20 +4673,24 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
again:
level = 1;
next = NULL;
- next_rw_lock = 0;
btrfs_release_path(path);
path->keep_locks = 1;
- path->leave_spinning = 1;
- if (time_seq)
+ if (time_seq) {
ret = btrfs_search_old_slot(root, &key, path, time_seq);
- else
+ } else {
+ if (path->need_commit_sem) {
+ path->need_commit_sem = 0;
+ need_commit_sem = true;
+ down_read(&fs_info->commit_root_sem);
+ }
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ }
path->keep_locks = 0;
if (ret < 0)
- return ret;
+ goto done;
nritems = btrfs_header_nritems(path->nodes[0]);
/*
@@ -5390,13 +4741,22 @@ again:
continue;
}
- if (next) {
- btrfs_tree_unlock_rw(next, next_rw_lock);
- free_extent_buffer(next);
+
+ /*
+ * Our current level is where we're going to start from, and to
+ * make sure lockdep doesn't complain we need to drop our locks
+ * and nodes from 0 to our current level.
+ */
+ for (i = 0; i < level; i++) {
+ if (path->locks[level]) {
+ btrfs_tree_read_unlock(path->nodes[i]);
+ path->locks[i] = 0;
+ }
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
}
next = c;
- next_rw_lock = path->locks[level];
ret = read_block_for_search(root, path, &next, level,
slot, &key);
if (ret == -EAGAIN)
@@ -5422,26 +4782,18 @@ again:
cond_resched();
goto again;
}
- if (!ret) {
- btrfs_set_path_blocking(path);
+ if (!ret)
btrfs_tree_read_lock(next);
- }
- next_rw_lock = BTRFS_READ_LOCK;
}
break;
}
path->slots[level] = slot;
while (1) {
level--;
- c = path->nodes[level];
- if (path->locks[level])
- btrfs_tree_unlock_rw(c, path->locks[level]);
-
- free_extent_buffer(c);
path->nodes[level] = next;
path->slots[level] = 0;
if (!path->skip_locking)
- path->locks[level] = next_rw_lock;
+ path->locks[level] = BTRFS_READ_LOCK;
if (!level)
break;
@@ -5455,21 +4807,21 @@ again:
goto done;
}
- if (!path->skip_locking) {
- ret = btrfs_try_tree_read_lock(next);
- if (!ret) {
- btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
- }
- next_rw_lock = BTRFS_READ_LOCK;
- }
+ if (!path->skip_locking)
+ btrfs_tree_read_lock(next);
}
ret = 0;
done:
unlock_up(path, 0, 1, 0, NULL);
- path->leave_spinning = old_spinning;
- if (!old_spinning)
- btrfs_set_path_blocking(path);
+ if (need_commit_sem) {
+ int ret2;
+
+ path->need_commit_sem = 1;
+ ret2 = finish_need_commit_sem_search(path);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
return ret;
}
@@ -5491,7 +4843,6 @@ int btrfs_previous_item(struct btrfs_root *root,
while (1) {
if (path->slots[0] == 0) {
- btrfs_set_path_blocking(path);
ret = btrfs_prev_leaf(root, path);
if (ret != 0)
return ret;
@@ -5533,7 +4884,6 @@ int btrfs_previous_extent_item(struct btrfs_root *root,
while (1) {
if (path->slots[0] == 0) {
- btrfs_set_path_blocking(path);
ret = btrfs_prev_leaf(root, path);
if (ret != 0)
return ret;