aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Makefile21
-rw-r--r--fs/btrfs/acl.c20
-rw-r--r--fs/btrfs/async-thread.c67
-rw-r--r--fs/btrfs/async-thread.h2
-rw-r--r--fs/btrfs/btrfs_inode.h31
-rw-r--r--fs/btrfs/ctree.c915
-rw-r--r--fs/btrfs/ctree.h161
-rw-r--r--fs/btrfs/delayed-ref.c668
-rw-r--r--fs/btrfs/delayed-ref.h193
-rw-r--r--fs/btrfs/dir-item.c3
-rw-r--r--fs/btrfs/disk-io.c191
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/extent-tree.c2111
-rw-r--r--fs/btrfs/extent_io.c234
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/extent_map.c18
-rw-r--r--fs/btrfs/file-item.c7
-rw-r--r--fs/btrfs/file.c145
-rw-r--r--fs/btrfs/free-space-cache.c535
-rw-r--r--fs/btrfs/free-space-cache.h44
-rw-r--r--fs/btrfs/inode-item.c3
-rw-r--r--fs/btrfs/inode-map.c2
-rw-r--r--fs/btrfs/inode.c394
-rw-r--r--fs/btrfs/ioctl.c60
-rw-r--r--fs/btrfs/locking.c25
-rw-r--r--fs/btrfs/ordered-data.c120
-rw-r--r--fs/btrfs/ordered-data.h4
-rw-r--r--fs/btrfs/super.c86
-rw-r--r--fs/btrfs/transaction.c164
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c458
-rw-r--r--fs/btrfs/tree-log.h17
-rw-r--r--fs/btrfs/volumes.c200
-rw-r--r--fs/btrfs/volumes.h18
35 files changed, 4309 insertions, 2622 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b8..94212844a9bc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,25 +1,10 @@
-ifneq ($(KERNELRELEASE),)
-# kbuild part of makefile
obj-$(CONFIG_BTRFS_FS) := btrfs.o
-btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+
+btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
file-item.o inode-item.o inode-map.o disk-io.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
- compression.o
-else
-
-# Normal Makefile
-
-KERNELDIR := /lib/modules/`uname -r`/build
-all:
- $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
-
-modules_install:
- $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
-clean:
- $(MAKE) -C $(KERNELDIR) M=`pwd` clean
-
-endif
+ compression.o delayed-ref.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba5..cbba000dccbe 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
return ERR_PTR(-EINVAL);
}
+ /* Handle the cached NULL acl case without locking */
+ acl = ACCESS_ONCE(*p_acl);
+ if (!acl)
+ return acl;
+
spin_lock(&inode->i_lock);
- if (*p_acl != BTRFS_ACL_NOT_CACHED)
- acl = posix_acl_dup(*p_acl);
+ acl = *p_acl;
+ if (acl != BTRFS_ACL_NOT_CACHED)
+ acl = posix_acl_dup(acl);
spin_unlock(&inode->i_lock);
- if (acl)
+ if (acl != BTRFS_ACL_NOT_CACHED)
return acl;
-
size = __btrfs_getxattr(inode, name, "", 0);
if (size > 0) {
value = kzalloc(size, GFP_NOFS);
@@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
btrfs_update_cached_acl(inode, p_acl, acl);
}
kfree(value);
- } else if (size == -ENOENT) {
+ } else if (size == -ENOENT || size == -ENODATA || size == 0) {
+ /* FIXME, who returns -ENOENT? I think nobody */
acl = NULL;
btrfs_update_cached_acl(inode, p_acl, acl);
+ } else {
+ acl = ERR_PTR(-EIO);
}
return acl;
@@ -256,7 +264,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
}
if (!acl)
- inode->i_mode &= ~current->fs->umask;
+ inode->i_mode &= ~current_umask();
}
if (IS_POSIXACL(dir) && acl) {
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c84ca1f5259a..502c3d61de62 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,12 +20,12 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
-#include <linux/ftrace.h>
#include "async-thread.h"
#define WORK_QUEUED_BIT 0
#define WORK_DONE_BIT 1
#define WORK_ORDER_DONE_BIT 2
+#define WORK_HIGH_PRIO_BIT 3
/*
* container for the kthread task pointer and the list of pending work
@@ -37,6 +37,7 @@ struct btrfs_worker_thread {
/* list of struct btrfs_work that are waiting for service */
struct list_head pending;
+ struct list_head prio_pending;
/* list of worker threads from struct btrfs_workers */
struct list_head worker_list;
@@ -104,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
spin_lock_irqsave(&workers->lock, flags);
- while (!list_empty(&workers->order_list)) {
- work = list_entry(workers->order_list.next,
- struct btrfs_work, order_list);
-
+ while (1) {
+ if (!list_empty(&workers->prio_order_list)) {
+ work = list_entry(workers->prio_order_list.next,
+ struct btrfs_work, order_list);
+ } else if (!list_empty(&workers->order_list)) {
+ work = list_entry(workers->order_list.next,
+ struct btrfs_work, order_list);
+ } else {
+ break;
+ }
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
@@ -144,8 +151,14 @@ static int worker_loop(void *arg)
do {
spin_lock_irq(&worker->lock);
again_locked:
- while (!list_empty(&worker->pending)) {
- cur = worker->pending.next;
+ while (1) {
+ if (!list_empty(&worker->prio_pending))
+ cur = worker->prio_pending.next;
+ else if (!list_empty(&worker->pending))
+ cur = worker->pending.next;
+ else
+ break;
+
work = list_entry(cur, struct btrfs_work, list);
list_del(&work->list);
clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -164,7 +177,6 @@ again_locked:
spin_lock_irq(&worker->lock);
check_idle_worker(worker);
-
}
if (freezing(current)) {
worker->working = 0;
@@ -179,7 +191,8 @@ again_locked:
* jump_in?
*/
smp_mb();
- if (!list_empty(&worker->pending))
+ if (!list_empty(&worker->pending) ||
+ !list_empty(&worker->prio_pending))
continue;
/*
@@ -192,13 +205,18 @@ again_locked:
*/
schedule_timeout(1);
smp_mb();
- if (!list_empty(&worker->pending))
+ if (!list_empty(&worker->pending) ||
+ !list_empty(&worker->prio_pending))
continue;
+ if (kthread_should_stop())
+ break;
+
/* still no more work?, sleep for real */
spin_lock_irq(&worker->lock);
set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&worker->pending))
+ if (!list_empty(&worker->pending) ||
+ !list_empty(&worker->prio_pending))
goto again_locked;
/*
@@ -208,7 +226,8 @@ again_locked:
worker->working = 0;
spin_unlock_irq(&worker->lock);
- schedule();
+ if (!kthread_should_stop())
+ schedule();
}
__set_current_state(TASK_RUNNING);
}
@@ -245,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
INIT_LIST_HEAD(&workers->worker_list);
INIT_LIST_HEAD(&workers->idle_list);
INIT_LIST_HEAD(&workers->order_list);
+ INIT_LIST_HEAD(&workers->prio_order_list);
spin_lock_init(&workers->lock);
workers->max_workers = max;
workers->idle_thresh = 32;
@@ -270,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
}
INIT_LIST_HEAD(&worker->pending);
+ INIT_LIST_HEAD(&worker->prio_pending);
INIT_LIST_HEAD(&worker->worker_list);
spin_lock_init(&worker->lock);
atomic_set(&worker->num_pending, 0);
@@ -393,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
goto out;
spin_lock_irqsave(&worker->lock, flags);
- list_add_tail(&work->list, &worker->pending);
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+ list_add_tail(&work->list, &worker->prio_pending);
+ else
+ list_add_tail(&work->list, &worker->pending);
atomic_inc(&worker->num_pending);
/* by definition we're busy, take ourselves off the idle
@@ -419,6 +443,11 @@ out:
return 0;
}
+void btrfs_set_work_high_prio(struct btrfs_work *work)
+{
+ set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
+
/*
* places a struct btrfs_work into the pending queue of one of the kthreads
*/
@@ -435,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
worker = find_worker(workers);
if (workers->ordered) {
spin_lock_irqsave(&workers->lock, flags);
- list_add_tail(&work->order_list, &workers->order_list);
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+ list_add_tail(&work->order_list,
+ &workers->prio_order_list);
+ } else {
+ list_add_tail(&work->order_list, &workers->order_list);
+ }
spin_unlock_irqrestore(&workers->lock, flags);
} else {
INIT_LIST_HEAD(&work->order_list);
@@ -443,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
spin_lock_irqsave(&worker->lock, flags);
- list_add_tail(&work->list, &worker->pending);
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+ list_add_tail(&work->list, &worker->prio_pending);
+ else
+ list_add_tail(&work->list, &worker->pending);
atomic_inc(&worker->num_pending);
check_busy_worker(worker);
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 31be4ed8b63e..1b511c109db6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -85,6 +85,7 @@ struct btrfs_workers {
* of work items waiting for completion
*/
struct list_head order_list;
+ struct list_head prio_order_list;
/* lock for finding the next worker thread to queue on */
spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
int btrfs_stop_workers(struct btrfs_workers *workers);
void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_set_work_high_prio(struct btrfs_work *work);
#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74f..b30986f00b9d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;
+ /*
+ * list for tracking inodes that must be sent to disk before a
+ * rename or truncate commit
+ */
+ struct list_head ordered_operations;
+
/* the space_info for where this inode's data allocations are done */
struct btrfs_space_info *space_info;
@@ -86,12 +92,6 @@ struct btrfs_inode {
*/
u64 logged_trans;
- /*
- * trans that last made a change that should be fully fsync'd. This
- * gets reset to zero each time the inode is logged
- */
- u64 log_dirty_trans;
-
/* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file
*/
@@ -121,6 +121,25 @@ struct btrfs_inode {
/* the start of block group preferred for allocations. */
u64 block_group;
+ /* the fsync log has some corner cases that mean we have to check
+ * directories to see if any unlinks have been done before
+ * the directory was logged. See tree-log.c for all the
+ * details
+ */
+ u64 last_unlink_trans;
+
+ /*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ *
+ * yes, its silly to have a single bitflag, but we might grow more
+ * of these.
+ */
+ unsigned ordered_data_close:1;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 37f31b5529aa..a99f1c2a710d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
* empty_size -- a hint that you plan on doing more cow. This is the size in
* bytes the allocator should try to find free next to the block it returns.
* This is just a hint and may be ignored by the allocator.
- *
- * prealloc_dest -- if you have already reserved a destination for the cow,
- * this uses that block instead of allocating a new one.
- * btrfs_alloc_reserved_extent is used to finish the allocation.
*/
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size,
- u64 prealloc_dest)
+ u64 search_start, u64 empty_size)
{
u64 parent_start;
struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
level = btrfs_header_level(buf);
nritems = btrfs_header_nritems(buf);
- if (prealloc_dest) {
- struct btrfs_key ins;
-
- ins.objectid = prealloc_dest;
- ins.offset = buf->len;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
-
- ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
- root->root_key.objectid,
- trans->transid, level, &ins);
- BUG_ON(ret);
- cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
- buf->len, level);
- } else {
- cow = btrfs_alloc_free_block(trans, root, buf->len,
- parent_start,
- root->root_key.objectid,
- trans->transid, level,
- search_start, empty_size);
- }
+ cow = btrfs_alloc_free_block(trans, root, buf->len,
+ parent_start, root->root_key.objectid,
+ trans->transid, level,
+ search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret, u64 prealloc_dest)
+ struct extent_buffer **cow_ret)
{
u64 search_start;
int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_header_owner(buf) == root->root_key.objectid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
*cow_ret = buf;
- WARN_ON(prealloc_dest);
return 0;
}
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0,
- prealloc_dest);
+ parent_slot, cow_ret, search_start, 0);
return ret;
}
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize), 0);
+ (end_slot - i) * blocksize));
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BUG_ON(!child);
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
BUG_ON(ret);
spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
spin_unlock(&root->node_lock);
ret = btrfs_update_extent_ref(trans, root, child->start,
+ child->len,
mid->start, child->start,
root->root_key.objectid,
trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
return 0;
+ if (trans->transaction->delayed_refs.flushing &&
+ btrfs_header_nritems(mid) > 2)
+ return 0;
+
if (btrfs_header_nritems(mid) < 2)
err_on_enospc = 1;
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left, 0);
+ parent, pslot - 1, &left);
if (wret) {
ret = wret;
goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right, 0);
+ parent, pslot + 1, &right);
if (wret) {
ret = wret;
goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left, 0);
+ pslot - 1, &left);
if (ret)
wret = 1;
else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right, 0);
+ &right);
if (ret)
wret = 1;
else {
@@ -1262,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
* readahead one full node of leaves, finding things that are close
* to the block in 'slot', and triggering ra on them.
*/
-static noinline void reada_for_search(struct btrfs_root *root,
- struct btrfs_path *path,
- int level, int slot, u64 objectid)
+static void reada_for_search(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int level, int slot, u64 objectid)
{
struct extent_buffer *node;
struct btrfs_disk_key disk_key;
@@ -1343,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
int ret = 0;
int blocksize;
- parent = path->nodes[level - 1];
+ parent = path->nodes[level + 1];
if (!parent)
return 0;
nritems = btrfs_header_nritems(parent);
- slot = path->slots[level];
+ slot = path->slots[level + 1];
blocksize = btrfs_level_size(root, level);
if (slot > 0) {
@@ -1359,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
block1 = 0;
free_extent_buffer(eb);
}
- if (slot < nritems) {
+ if (slot + 1 < nritems) {
block2 = btrfs_node_blockptr(parent, slot + 1);
gen = btrfs_node_ptr_generation(parent, slot + 1);
eb = btrfs_find_tree_block(root, block2, blocksize);
@@ -1369,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
}
if (block1 || block2) {
ret = -EAGAIN;
+
+ /* release the whole path */
btrfs_release_path(root, path);
+
+ /* read the blocks */
if (block1)
readahead_tree_block(root, block1, blocksize, 0);
if (block2)
@@ -1379,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
eb = read_tree_block(root, block1, blocksize, 0);
free_extent_buffer(eb);
}
- if (block1) {
+ if (block2) {
eb = read_tree_block(root, block2, blocksize, 0);
free_extent_buffer(eb);
}
@@ -1465,6 +1451,120 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
}
/*
+ * helper function for btrfs_search_slot. The goal is to find a block
+ * in cache without setting the path to blocking. If we find the block
+ * we return zero and the path is unchanged.
+ *
+ * If we can't find the block, we set the path blocking and do some
+ * reada. -EAGAIN is returned and the search must be repeated.
+ */
+static int
+read_block_for_search(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *p,
+ struct extent_buffer **eb_ret, int level, int slot,
+ struct btrfs_key *key)
+{
+ u64 blocknr;
+ u64 gen;
+ u32 blocksize;
+ struct extent_buffer *b = *eb_ret;
+ struct extent_buffer *tmp;
+
+ blocknr = btrfs_node_blockptr(b, slot);
+ gen = btrfs_node_ptr_generation(b, slot);
+ blocksize = btrfs_level_size(root, level - 1);
+
+ tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+ if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+ *eb_ret = tmp;
+ return 0;
+ }
+
+ /*
+ * reduce lock contention at high levels
+ * of the btree by dropping locks before
+ * we read.
+ */
+ btrfs_unlock_up_safe(p, level + 1);
+ btrfs_set_path_blocking(p);
+
+ if (tmp)
+ free_extent_buffer(tmp);
+ if (p->reada)
+ reada_for_search(root, p, level, slot, key->objectid);
+
+ btrfs_release_path(NULL, p);
+ tmp = read_tree_block(root, blocknr, blocksize, gen);
+ if (tmp)
+ free_extent_buffer(tmp);
+ return -EAGAIN;
+}
+
+/*
+ * helper function for btrfs_search_slot. This does all of the checks
+ * for node-level blocks and does any balancing required based on
+ * the ins_len.
+ *
+ * If no extra work was required, zero is returned. If we had to
+ * drop the path, -EAGAIN is returned and btrfs_search_slot must
+ * start over
+ */
+static int
+setup_nodes_for_search(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *p,
+ struct extent_buffer *b, int level, int ins_len)
+{
+ int ret;
+ if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
+ BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
+ int sret;
+
+ sret = reada_for_balance(root, p, level);
+ if (sret)
+ goto again;
+
+ btrfs_set_path_blocking(p);
+ sret = split_node(trans, root, p, level);
+ btrfs_clear_path_blocking(p, NULL);
+
+ BUG_ON(sret > 0);
+ if (sret) {
+ ret = sret;
+ goto done;
+ }
+ b = p->nodes[level];
+ } else if (ins_len < 0 && btrfs_header_nritems(b) <
+ BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
+ int sret;
+
+ sret = reada_for_balance(root, p, level);
+ if (sret)
+ goto again;
+
+ btrfs_set_path_blocking(p);
+ sret = balance_level(trans, root, p, level);
+ btrfs_clear_path_blocking(p, NULL);
+
+ if (sret) {
+ ret = sret;
+ goto done;
+ }
+ b = p->nodes[level];
+ if (!b) {
+ btrfs_release_path(NULL, p);
+ goto again;
+ }
+ BUG_ON(btrfs_header_nritems(b) == 1);
+ }
+ return 0;
+
+again:
+ ret = -EAGAIN;
+done:
+ return ret;
+}
+
+/*
* look for key in the tree. path is filled in with nodes along the way
* if key is found, we return zero and you can find the item in the leaf
* level of the path (level 0)
@@ -1482,17 +1582,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
ins_len, int cow)
{
struct extent_buffer *b;
- struct extent_buffer *tmp;
int slot;
int ret;
int level;
- int should_reada = p->reada;
int lowest_unlock = 1;
- int blocksize;
u8 lowest_level = 0;
- u64 blocknr;
- u64 gen;
- struct btrfs_key prealloc_block;
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1595,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
if (ins_len < 0)
lowest_unlock = 2;
- prealloc_block.objectid = 0;
-
again:
if (p->skip_locking)
b = btrfs_root_node(root);
@@ -1523,50 +1615,21 @@ again:
if (cow) {
int wret;
- /* is a cow on this block not required */
+ /*
+ * if we don't really need to cow this block
+ * then we don't want to set the path blocking,
+ * so we test it here
+ */
if (btrfs_header_generation(b) == trans->transid &&
btrfs_header_owner(b) == root->root_key.objectid &&
!btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
goto cow_done;
}
-
- /* ok, we have to cow, is our old prealloc the right
- * size?
- */
- if (prealloc_block.objectid &&
- prealloc_block.offset != b->len) {
- btrfs_release_path(root, p);
- btrfs_free_reserved_extent(root,
- prealloc_block.objectid,
- prealloc_block.offset);
- prealloc_block.objectid = 0;
- goto again;
- }
-
- /*
- * for higher level blocks, try not to allocate blocks
- * with the block and the parent locks held.
- */
- if (level > 0 && !prealloc_block.objectid) {
- u32 size = b->len;
- u64 hint = b->start;
-
- btrfs_release_path(root, p);
- ret = btrfs_reserve_extent(trans, root,
- size, size, 0,
- hint, (u64)-1,
- &prealloc_block, 0);
- BUG_ON(ret);
- goto again;
- }
-
btrfs_set_path_blocking(p);
wret = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1],
- &b, prealloc_block.objectid);
- prealloc_block.objectid = 0;
+ p->slots[level + 1], &b);
if (wret) {
free_extent_buffer(b);
ret = wret;
@@ -1611,51 +1674,15 @@ cow_done:
if (ret && slot > 0)
slot -= 1;
p->slots[level] = slot;
- if ((p->search_for_split || ins_len > 0) &&
- btrfs_header_nritems(b) >=
- BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
- int sret;
-
- sret = reada_for_balance(root, p, level);
- if (sret)
- goto again;
-
- btrfs_set_path_blocking(p);
- sret = split_node(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL);
-
- BUG_ON(sret > 0);
- if (sret) {
- ret = sret;
- goto done;
- }
- b = p->nodes[level];
- slot = p->slots[level];
- } else if (ins_len < 0 &&
- btrfs_header_nritems(b) <
- BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
- int sret;
-
- sret = reada_for_balance(root, p, level);
- if (sret)
- goto again;
-
- btrfs_set_path_blocking(p);
- sret = balance_level(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL);
+ ret = setup_nodes_for_search(trans, root, p, b, level,
+ ins_len);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ goto done;
+ b = p->nodes[level];
+ slot = p->slots[level];
- if (sret) {
- ret = sret;
- goto done;
- }
- b = p->nodes[level];
- if (!b) {
- btrfs_release_path(NULL, p);
- goto again;
- }
- slot = p->slots[level];
- BUG_ON(btrfs_header_nritems(b) == 1);
- }
unlock_up(p, level, lowest_unlock);
/* this is only true while dropping a snapshot */
@@ -1664,44 +1691,11 @@ cow_done:
goto done;
}
- blocknr = btrfs_node_blockptr(b, slot);
- gen = btrfs_node_ptr_generation(b, slot);
- blocksize = btrfs_level_size(root, level - 1);
+ ret = read_block_for_search(trans, root, p,
+ &b, level, slot, key);
+ if (ret == -EAGAIN)
+ goto again;
- tmp = btrfs_find_tree_block(root, blocknr, blocksize);
- if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
- b = tmp;
- } else {
- /*
- * reduce lock contention at high levels
- * of the btree by dropping locks before
- * we read.
- */
- if (level > 0) {
- btrfs_release_path(NULL, p);
- if (tmp)
- free_extent_buffer(tmp);
- if (should_reada)
- reada_for_search(root, p,
- level, slot,
- key->objectid);
-
- tmp = read_tree_block(root, blocknr,
- blocksize, gen);
- if (tmp)
- free_extent_buffer(tmp);
- goto again;
- } else {
- btrfs_set_path_blocking(p);
- if (tmp)
- free_extent_buffer(tmp);
- if (should_reada)
- reada_for_search(root, p,
- level, slot,
- key->objectid);
- b = read_node_slot(root, b, slot);
- }
- }
if (!p->skip_locking) {
int lret;
@@ -1742,12 +1736,8 @@ done:
* we don't really know what they plan on doing with the path
* from here on, so for now just mark it as blocking
*/
- btrfs_set_path_blocking(p);
- if (prealloc_block.objectid) {
- btrfs_free_reserved_extent(root,
- prealloc_block.objectid,
- prealloc_block.offset);
- }
+ if (!p->leave_spinning)
+ btrfs_set_path_blocking(p);
return ret;
}
@@ -1768,7 +1758,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
int ret;
eb = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+ ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
BUG_ON(ret);
btrfs_set_lock_blocking(eb);
@@ -1826,7 +1816,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
}
ret = btrfs_cow_block(trans, root, eb, parent, slot,
- &eb, 0);
+ &eb);
BUG_ON(ret);
if (root->root_key.objectid ==
@@ -2139,7 +2129,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
spin_unlock(&root->node_lock);
ret = btrfs_update_extent_ref(trans, root, lower->start,
- lower->start, c->start,
+ lower->len, lower->start, c->start,
root->root_key.objectid,
trans->transid, level - 1);
BUG_ON(ret);
@@ -2174,8 +2164,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
BUG_ON(!path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
- if (slot > nritems)
- BUG();
+ BUG_ON(slot > nritems);
if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
BUG();
if (slot != nritems) {
@@ -2221,7 +2210,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ret = insert_new_root(trans, root, path, level + 1);
if (ret)
return ret;
- } else {
+ } else if (!trans->transaction->delayed_refs.flushing) {
ret = push_nodes_for_insert(trans, root, path, level);
c = path->nodes[level];
if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2318,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
return ret;
}
-/*
- * push some data in the path leaf to the right, trying to free up at
- * least data_size bytes. returns zero if the push worked, nonzero otherwise
- *
- * returns 1 if the push failed because the other node didn't have enough
- * room, 0 if everything worked out and < 0 if there were major errors.
- */
-static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, int data_size,
- int empty)
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int data_size, int empty,
+ struct extent_buffer *right,
+ int free_space, u32 left_nritems)
{
struct extent_buffer *left = path->nodes[0];
- struct extent_buffer *right;
- struct extent_buffer *upper;
+ struct extent_buffer *upper = path->nodes[1];
struct btrfs_disk_key disk_key;
int slot;
u32 i;
- int free_space;
int push_space = 0;
int push_items = 0;
struct btrfs_item *item;
- u32 left_nritems;
u32 nr;
u32 right_nritems;
u32 data_end;
u32 this_item_size;
int ret;
- slot = path->slots[1];
- if (!path->nodes[1])
- return 1;
-
- upper = path->nodes[1];
- if (slot >= btrfs_header_nritems(upper) - 1)
- return 1;
-
- btrfs_assert_tree_locked(path->nodes[1]);
-
- right = read_node_slot(root, upper, slot + 1);
- btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
-
- free_space = btrfs_leaf_free_space(root, right);
- if (free_space < data_size)
- goto out_unlock;
-
- /* cow and double check */
- ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right, 0);
- if (ret)
- goto out_unlock;
-
- free_space = btrfs_leaf_free_space(root, right);
- if (free_space < data_size)
- goto out_unlock;
-
- left_nritems = btrfs_header_nritems(left);
- if (left_nritems == 0)
- goto out_unlock;
-
if (empty)
nr = 0;
else
@@ -2397,6 +2347,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (path->slots[0] >= left_nritems)
push_space += data_size;
+ slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) {
item = btrfs_item_nr(left, i);
@@ -2528,24 +2479,82 @@ out_unlock:
}
/*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int data_size,
+ int empty)
+{
+ struct extent_buffer *left = path->nodes[0];
+ struct extent_buffer *right;
+ struct extent_buffer *upper;
+ int slot;
+ int free_space;
+ u32 left_nritems;
+ int ret;
+
+ if (!path->nodes[1])
+ return 1;
+
+ slot = path->slots[1];
+ upper = path->nodes[1];
+ if (slot >= btrfs_header_nritems(upper) - 1)
+ return 1;
+
+ btrfs_assert_tree_locked(path->nodes[1]);
+
+ right = read_node_slot(root, upper, slot + 1);
+ btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, right, upper,
+ slot + 1, &right);
+ if (ret)
+ goto out_unlock;
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ left_nritems = btrfs_header_nritems(left);
+ if (left_nritems == 0)
+ goto out_unlock;
+
+ return __push_leaf_right(trans, root, path, data_size, empty,
+ right, free_space, left_nritems);
+out_unlock:
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return 1;
+}
+
+/*
* push some data in the path leaf to the left, trying to free up at
* least data_size bytes. returns zero if the push worked, nonzero otherwise
*/
-static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, int data_size,
- int empty)
+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int data_size,
+ int empty, struct extent_buffer *left,
+ int free_space, int right_nritems)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *right = path->nodes[0];
- struct extent_buffer *left;
int slot;
int i;
- int free_space;
int push_space = 0;
int push_items = 0;
struct btrfs_item *item;
u32 old_left_nritems;
- u32 right_nritems;
u32 nr;
int ret = 0;
int wret;
@@ -2553,41 +2562,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
u32 old_left_item_size;
slot = path->slots[1];
- if (slot == 0)
- return 1;
- if (!path->nodes[1])
- return 1;
-
- right_nritems = btrfs_header_nritems(right);
- if (right_nritems == 0)
- return 1;
-
- btrfs_assert_tree_locked(path->nodes[1]);
-
- left = read_node_slot(root, path->nodes[1], slot - 1);
- btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
-
- free_space = btrfs_leaf_free_space(root, left);
- if (free_space < data_size) {
- ret = 1;
- goto out;
- }
-
- /* cow and double check */
- ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left, 0);
- if (ret) {
- /* we hit -ENOSPC, but it isn't fatal here */
- ret = 1;
- goto out;
- }
-
- free_space = btrfs_leaf_free_space(root, left);
- if (free_space < data_size) {
- ret = 1;
- goto out;
- }
if (empty)
nr = right_nritems;
@@ -2755,6 +2729,154 @@ out:
}
/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int data_size,
+ int empty)
+{
+ struct extent_buffer *right = path->nodes[0];
+ struct extent_buffer *left;
+ int slot;
+ int free_space;
+ u32 right_nritems;
+ int ret = 0;
+
+ slot = path->slots[1];
+ if (slot == 0)
+ return 1;
+ if (!path->nodes[1])
+ return 1;
+
+ right_nritems = btrfs_header_nritems(right);
+ if (right_nritems == 0)
+ return 1;
+
+ btrfs_assert_tree_locked(path->nodes[1]);
+
+ left = read_node_slot(root, path->nodes[1], slot - 1);
+ btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
+
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, left,
+ path->nodes[1], slot - 1, &left);
+ if (ret) {
+ /* we hit -ENOSPC, but it isn't fatal here */
+ ret = 1;
+ goto out;
+ }
+
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ return __push_leaf_left(trans, root, path, data_size,
+ empty, left, free_space, right_nritems);
+out:
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int copy_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *l,
+ struct extent_buffer *right,
+ int slot, int mid, int nritems)
+{
+ int data_copy_size;
+ int rt_data_off;
+ int i;
+ int ret = 0;
+ int wret;
+ struct btrfs_disk_key disk_key;
+
+ nritems = nritems - mid;
+ btrfs_set_header_nritems(right, nritems);
+ data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+
+ copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(mid),
+ nritems * sizeof(struct btrfs_item));
+
+ copy_extent_buffer(right, l,
+ btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+ data_copy_size, btrfs_leaf_data(l) +
+ leaf_data_end(root, l), data_copy_size);
+
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+ btrfs_item_end_nr(l, mid);
+
+ for (i = 0; i < nritems; i++) {
+ struct btrfs_item *item = btrfs_item_nr(right, i);
+ u32 ioff;
+
+ if (!right->map_token) {
+ map_extent_buffer(right, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &right->map_token, &right->kaddr,
+ &right->map_start, &right->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(right, item);
+ btrfs_set_item_offset(right, item, ioff + rt_data_off);
+ }
+
+ if (right->map_token) {
+ unmap_extent_buffer(right, right->map_token, KM_USER1);
+ right->map_token = NULL;
+ }
+
+ btrfs_set_header_nritems(l, mid);
+ ret = 0;
+ btrfs_item_key(right, &disk_key, 0);
+ wret = insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
+ if (wret)
+ ret = wret;
+
+ btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(l);
+ BUG_ON(path->slots[0] != slot);
+
+ ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+ BUG_ON(ret);
+
+ if (mid <= slot) {
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] -= mid;
+ path->slots[1] += 1;
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+
+ BUG_ON(path->slots[0] < 0);
+
+ return ret;
+}
+
+/*
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
*
@@ -2771,17 +2893,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int mid;
int slot;
struct extent_buffer *right;
- int data_copy_size;
- int rt_data_off;
- int i;
int ret = 0;
int wret;
int double_split;
int num_doubles = 0;
- struct btrfs_disk_key disk_key;
/* first try to make some room by pushing left and right */
- if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+ if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
+ !trans->transaction->delayed_refs.flushing) {
wret = push_leaf_right(trans, root, path, data_size, 0);
if (wret < 0)
return wret;
@@ -2830,11 +2949,14 @@ again:
write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
(unsigned long)btrfs_header_chunk_tree_uuid(right),
BTRFS_UUID_SIZE);
+
if (mid <= slot) {
if (nritems == 1 ||
leaf_space_used(l, mid, nritems - mid) + data_size >
BTRFS_LEAF_DATA_SIZE(root)) {
if (slot >= nritems) {
+ struct btrfs_disk_key disk_key;
+
btrfs_cpu_key_to_disk(&disk_key, ins_key);
btrfs_set_header_nritems(right, 0);
wret = insert_ptr(trans, root, path,
@@ -2862,6 +2984,8 @@ again:
if (leaf_space_used(l, 0, mid) + data_size >
BTRFS_LEAF_DATA_SIZE(root)) {
if (!extend && data_size && slot == 0) {
+ struct btrfs_disk_key disk_key;
+
btrfs_cpu_key_to_disk(&disk_key, ins_key);
btrfs_set_header_nritems(right, 0);
wret = insert_ptr(trans, root, path,
@@ -2894,76 +3018,16 @@ again:
}
}
}
- nritems = nritems - mid;
- btrfs_set_header_nritems(right, nritems);
- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
-
- copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
- btrfs_item_nr_offset(mid),
- nritems * sizeof(struct btrfs_item));
-
- copy_extent_buffer(right, l,
- btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
- data_copy_size, btrfs_leaf_data(l) +
- leaf_data_end(root, l), data_copy_size);
-
- rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
- btrfs_item_end_nr(l, mid);
-
- for (i = 0; i < nritems; i++) {
- struct btrfs_item *item = btrfs_item_nr(right, i);
- u32 ioff;
-
- if (!right->map_token) {
- map_extent_buffer(right, (unsigned long)item,
- sizeof(struct btrfs_item),
- &right->map_token, &right->kaddr,
- &right->map_start, &right->map_len,
- KM_USER1);
- }
-
- ioff = btrfs_item_offset(right, item);
- btrfs_set_item_offset(right, item, ioff + rt_data_off);
- }
-
- if (right->map_token) {
- unmap_extent_buffer(right, right->map_token, KM_USER1);
- right->map_token = NULL;
- }
- btrfs_set_header_nritems(l, mid);
- ret = 0;
- btrfs_item_key(right, &disk_key, 0);
- wret = insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1);
- if (wret)
- ret = wret;
-
- btrfs_mark_buffer_dirty(right);
- btrfs_mark_buffer_dirty(l);
- BUG_ON(path->slots[0] != slot);
-
- ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+ ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
BUG_ON(ret);
- if (mid <= slot) {
- btrfs_tree_unlock(path->nodes[0]);
- free_extent_buffer(path->nodes[0]);
- path->nodes[0] = right;
- path->slots[0] -= mid;
- path->slots[1] += 1;
- } else {
- btrfs_tree_unlock(right);
- free_extent_buffer(right);
- }
-
- BUG_ON(path->slots[0] < 0);
-
if (double_split) {
BUG_ON(num_doubles != 0);
num_doubles++;
goto again;
}
+
return ret;
}
@@ -3021,26 +3085,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
return -EAGAIN;
}
+ btrfs_set_path_blocking(path);
ret = split_leaf(trans, root, &orig_key, path,
sizeof(struct btrfs_item), 1);
path->keep_locks = 0;
BUG_ON(ret);
+ btrfs_unlock_up_safe(path, 1);
+ leaf = path->nodes[0];
+ BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+
+split:
/*
* make sure any changes to the path from split_leaf leave it
* in a blocking state
*/
btrfs_set_path_blocking(path);
- leaf = path->nodes[0];
- BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
-
-split:
item = btrfs_item_nr(leaf, path->slots[0]);
orig_offset = btrfs_item_offset(leaf, item);
item_size = btrfs_item_size(leaf, item);
-
buf = kmalloc(item_size, GFP_NOFS);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
path->slots[0]), item_size);
@@ -3445,39 +3510,27 @@ out:
}
/*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
+ * this is a helper for btrfs_insert_empty_items, the main goal here is
+ * to save stack depth by doing the bulk of the work in a function
+ * that doesn't call btrfs_search_slot
*/
-int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+static noinline_for_stack int
+setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr)
{
- struct extent_buffer *leaf;
struct btrfs_item *item;
- int ret = 0;
- int slot;
- int slot_orig;
int i;
u32 nritems;
- u32 total_size = 0;
- u32 total_data = 0;
unsigned int data_end;
struct btrfs_disk_key disk_key;
+ int ret;
+ struct extent_buffer *leaf;
+ int slot;
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
-
- total_size = total_data + (nr * sizeof(struct btrfs_item));
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
- if (ret == 0)
- return -EEXIST;
- if (ret < 0)
- goto out;
-
- slot_orig = path->slots[0];
leaf = path->nodes[0];
+ slot = path->slots[0];
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(root, leaf);
@@ -3489,9 +3542,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
BUG();
}
- slot = path->slots[0];
- BUG_ON(slot < 0);
-
if (slot != nritems) {
unsigned int old_data = btrfs_item_end_nr(leaf, slot);
@@ -3547,21 +3597,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
data_end -= data_size[i];
btrfs_set_item_size(leaf, item, data_size[i]);
}
+
btrfs_set_header_nritems(leaf, nritems + nr);
- btrfs_mark_buffer_dirty(leaf);
ret = 0;
if (slot == 0) {
+ struct btrfs_disk_key disk_key;
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
ret = fixup_low_keys(trans, root, path, &disk_key, 1);
}
+ btrfs_unlock_up_safe(path, 1);
+ btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(root, leaf) < 0) {
btrfs_print_leaf(root, leaf);
BUG();
}
+ return ret;
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ int nr)
+{
+ struct extent_buffer *leaf;
+ int ret = 0;
+ int slot;
+ int i;
+ u32 total_size = 0;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ if (ret == 0)
+ return -EEXIST;
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ BUG_ON(slot < 0);
+
+ ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+ total_data, total_size, nr);
+
out:
- btrfs_unlock_up_safe(path, 1);
return ret;
}
@@ -3749,7 +3838,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/* delete the leaf if it is mostly empty */
- if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+ if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
+ !trans->transaction->delayed_refs.flushing) {
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
@@ -3757,6 +3847,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
slot = path->slots[1];
extent_buffer_get(leaf);
+ btrfs_set_path_blocking(path);
wret = push_leaf_left(trans, root, path, 1, 1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
@@ -4042,28 +4133,44 @@ next:
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
int slot;
- int level = 1;
+ int level;
struct extent_buffer *c;
- struct extent_buffer *next = NULL;
+ struct extent_buffer *next;
struct btrfs_key key;
u32 nritems;
int ret;
+ int old_spinning = path->leave_spinning;
+ int force_blocking = 0;
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
return 1;
- btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+ /*
+ * we take the blocks in an order that upsets lockdep. Using
+ * blocking mode is the only way around it.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ force_blocking = 1;
+#endif
+ btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+again:
+ level = 1;
+ next = NULL;
btrfs_release_path(root, path);
+
path->keep_locks = 1;
+
+ if (!force_blocking)
+ path->leave_spinning = 1;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
path->keep_locks = 0;
if (ret < 0)
return ret;
- btrfs_set_path_blocking(path);
nritems = btrfs_header_nritems(path->nodes[0]);
/*
* by releasing the path above we dropped all our locks. A balance
@@ -4073,19 +4180,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
*/
if (nritems > 0 && path->slots[0] < nritems - 1) {
path->slots[0]++;
+ ret = 0;
goto done;
}
while (level < BTRFS_MAX_LEVEL) {
- if (!path->nodes[level])
- return 1;
+ if (!path->nodes[level]) {
+ ret = 1;
+ goto done;
+ }
slot = path->slots[level] + 1;
c = path->nodes[level];
if (slot >= btrfs_header_nritems(c)) {
level++;
- if (level == BTRFS_MAX_LEVEL)
- return 1;
+ if (level == BTRFS_MAX_LEVEL) {
+ ret = 1;
+ goto done;
+ }
continue;
}
@@ -4094,16 +4206,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
free_extent_buffer(next);
}
- /* the path was set to blocking above */
- if (level == 1 && (path->locks[1] || path->skip_locking) &&
- path->reada)
- reada_for_search(root, path, level, slot, 0);
+ next = c;
+ ret = read_block_for_search(NULL, root, path, &next, level,
+ slot, &key);
+ if (ret == -EAGAIN)
+ goto again;
- next = read_node_slot(root, c, slot);
if (!path->skip_locking) {
- btrfs_assert_tree_locked(c);
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ ret = btrfs_try_spin_lock(next);
+ if (!ret) {
+ btrfs_set_path_blocking(path);
+ btrfs_tree_lock(next);
+ if (!force_blocking)
+ btrfs_clear_path_blocking(path, next);
+ }
+ if (force_blocking)
+ btrfs_set_lock_blocking(next);
}
break;
}
@@ -4113,27 +4231,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
c = path->nodes[level];
if (path->locks[level])
btrfs_tree_unlock(c);
+
free_extent_buffer(c);
path->nodes[level] = next;
path->slots[level] = 0;
if (!path->skip_locking)
path->locks[level] = 1;
+
if (!level)
break;
- btrfs_set_path_blocking(path);
- if (level == 1 && path->locks[1] && path->reada)
- reada_for_search(root, path, level, slot, 0);
- next = read_node_slot(root, next, 0);
+ ret = read_block_for_search(NULL, root, path, &next, level,
+ 0, &key);
+ if (ret == -EAGAIN)
+ goto again;
+
if (!path->skip_locking) {
btrfs_assert_tree_locked(path->nodes[level]);
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ ret = btrfs_try_spin_lock(next);
+ if (!ret) {
+ btrfs_set_path_blocking(path);
+ btrfs_tree_lock(next);
+ if (!force_blocking)
+ btrfs_clear_path_blocking(path, next);
+ }
+ if (force_blocking)
+ btrfs_set_lock_blocking(next);
}
}
+ ret = 0;
done:
unlock_up(path, 0, 1);
- return 0;
+ path->leave_spinning = old_spinning;
+ if (!old_spinning)
+ btrfs_set_path_blocking(path);
+
+ return ret;
}
/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e1d4e30e9d8..4414a5d9983a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
#define BTRFS_MAX_LEVEL 8
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list. That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
+
/* holds pointers to all of the tree roots */
#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -136,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
#define BTRFS_FT_MAX 9
/*
- * the key defines the order in the tree, and so it also defines (optimal)
- * block layout. objectid corresonds to the inode number. The flags
- * tells us things about the object, and is a kind of stream selector.
- * so for a given inode, keys with flags of 1 might refer to the inode
- * data, flags of 2 may point to file data in the btree and flags == 3
- * may point to extents.
+ * The key defines the order in the tree, and so it also defines (optimal)
+ * block layout.
+ *
+ * objectid corresponds to the inode number.
+ *
+ * type tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with type of 1 might refer to the inode data,
+ * type of 2 may point to file data in the btree and type == 3 may point to
+ * extents.
*
* offset is the starting byte offset for this key in the stream.
*
@@ -193,7 +203,7 @@ struct btrfs_dev_item {
/*
* starting byte of this partition on the device,
- * to allowr for stripe alignment in the future
+ * to allow for stripe alignment in the future
*/
__le64 start_offset;
@@ -401,15 +411,16 @@ struct btrfs_path {
int locks[BTRFS_MAX_LEVEL];
int reada;
/* keep some upper locks as we walk down */
- int keep_locks;
- int skip_locking;
int lowest_level;
/*
* set by btrfs_split_item, tells search_slot to keep all locks
* and to force calls to keep space in the nodes
*/
- int search_for_split;
+ unsigned int search_for_split:1;
+ unsigned int keep_locks:1;
+ unsigned int skip_locking:1;
+ unsigned int leave_spinning:1;
};
/*
@@ -625,18 +636,35 @@ struct btrfs_space_info {
struct rw_semaphore groups_sem;
};
-struct btrfs_free_space {
- struct rb_node bytes_index;
- struct rb_node offset_index;
- u64 offset;
- u64 bytes;
+/*
+ * free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes. They are used for all metadata
+ * allocations and data allocations in ssd mode.
+ */
+struct btrfs_free_cluster {
+ spinlock_t lock;
+ spinlock_t refill_lock;
+ struct rb_root root;
+
+ /* largest extent in this cluster */
+ u64 max_size;
+
+ /* first extent starting offset */
+ u64 window_start;
+
+ struct btrfs_block_group_cache *block_group;
+ /*
+ * when a cluster is allocated from a block group, we put the
+ * cluster onto a list in the block group so that it can
+ * be freed before the block group is freed.
+ */
+ struct list_head block_group_list;
};
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
spinlock_t lock;
- struct mutex alloc_mutex;
struct mutex cache_mutex;
u64 pinned;
u64 reserved;
@@ -648,6 +676,7 @@ struct btrfs_block_group_cache {
struct btrfs_space_info *space_info;
/* free space cache stuff */
+ spinlock_t tree_lock;
struct rb_root free_space_bytes;
struct rb_root free_space_offset;
@@ -659,6 +688,11 @@ struct btrfs_block_group_cache {
/* usage count */
atomic_t count;
+
+ /* List of struct btrfs_free_clusters for this block group.
+ * Today it will only have one thing on it, but that may change
+ */
+ struct list_head cluster_list;
};
struct btrfs_leaf_ref_tree {
@@ -688,15 +722,18 @@ struct btrfs_fs_info {
struct rb_root block_group_cache_tree;
struct extent_io_tree pinned_extents;
- struct extent_io_tree pending_del;
- struct extent_io_tree extent_ins;
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
u64 generation;
u64 last_trans_committed;
- u64 last_trans_new_blockgroup;
+
+ /*
+ * this is updated to the current trans every time a full commit
+ * is required instead of the faster short fsync log commits
+ */
+ u64 last_trans_log_full_commit;
u64 open_ioctl_trans;
unsigned long mount_opt;
u64 max_extent;
@@ -717,12 +754,20 @@ struct btrfs_fs_info {
struct mutex tree_log_mutex;
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
- struct mutex extent_ins_mutex;
- struct mutex pinned_mutex;
struct mutex chunk_mutex;
struct mutex drop_mutex;
struct mutex volume_mutex;
struct mutex tree_reloc_mutex;
+
+ /*
+ * this protects the ordered operations list only while we are
+ * processing all of the entries on it. This way we make
+ * sure the commit code doesn't find the list temporarily empty
+ * because another function happens to be doing non-waiting preflush
+ * before jumping into the main commit.
+ */
+ struct mutex ordered_operations_mutex;
+
struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
@@ -737,10 +782,29 @@ struct btrfs_fs_info {
* ordered extents
*/
spinlock_t ordered_extent_lock;
+
+ /*
+ * all of the data=ordered extents pending writeback
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
struct list_head ordered_extents;
+
+ /*
+ * all of the inodes that have delalloc bytes. It is possible for
+ * this list to be empty even when there is still dirty data=ordered
+ * extents waiting to finish IO.
+ */
struct list_head delalloc_inodes;
/*
+ * special rename and truncate targets that must be on disk before
+ * we're allowed to commit. This is basically the ext3 style
+ * data=ordered list.
+ */
+ struct list_head ordered_operations;
+
+ /*
* there is a pool of worker threads for checksumming during writes
* and a pool for checksumming after reads. This is because readers
* can run with FS locks held, and the writers may be waiting for
@@ -781,6 +845,11 @@ struct btrfs_fs_info {
atomic_t throttle_gen;
u64 total_pinned;
+
+ /* protected by the delalloc lock, used to keep from writing
+ * metadata until there is a nice batch
+ */
+ u64 dirty_metadata_bytes;
struct list_head dirty_cowonly_roots;
struct btrfs_fs_devices *fs_devices;
@@ -795,8 +864,12 @@ struct btrfs_fs_info {
spinlock_t delalloc_lock;
spinlock_t new_trans_lock;
u64 delalloc_bytes;
- u64 last_alloc;
- u64 last_data_alloc;
+
+ /* data_alloc_cluster is only used in ssd mode */
+ struct btrfs_free_cluster data_alloc_cluster;
+
+ /* all metadata allocations go through this cluster */
+ struct btrfs_free_cluster meta_alloc_cluster;
spinlock_t ref_cache_lock;
u64 total_ref_cache_size;
@@ -808,6 +881,9 @@ struct btrfs_fs_info {
u64 metadata_alloc_profile;
u64 system_alloc_profile;
+ unsigned data_chunk_allocations;
+ unsigned metadata_ratio;
+
void *bdev_holder;
};
@@ -888,7 +964,6 @@ struct btrfs_root {
};
/*
-
* inode items have the data typically returned from stat and store other
* info about object characteristics. There is one for every file and dir in
* the FS
@@ -919,7 +994,7 @@ struct btrfs_root {
#define BTRFS_EXTENT_CSUM_KEY 128
/*
- * root items point to tree roots. There are typically in the root
+ * root items point to tree roots. They are typically in the root
* tree used by the super block to find all the other trees
*/
#define BTRFS_ROOT_ITEM_KEY 132
@@ -966,6 +1041,8 @@ struct btrfs_root {
#define BTRFS_MOUNT_SSD (1 << 3)
#define BTRFS_MOUNT_DEGRADED (1 << 4)
#define BTRFS_MOUNT_COMPRESS (1 << 5)
+#define BTRFS_MOUNT_NOTREELOG (1 << 6)
+#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1704,18 +1781,16 @@ static inline struct dentry *fdentry(struct file *file)
}
/* extent-tree.c */
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u32 *refs);
int btrfs_update_pinned_extents(struct btrfs_root *root,
u64 bytenr, u64 num, int pin);
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *leaf);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid, u64 bytenr);
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
struct btrfs_block_group_cache *btrfs_lookup_block_group(
struct btrfs_fs_info *info,
@@ -1777,7 +1852,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 ref_generation,
u64 owner_objectid);
int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
+ struct btrfs_root *root, u64 bytenr, u64 num_bytes,
u64 orig_parent, u64 parent,
u64 root_objectid, u64 ref_generation,
u64 owner_objectid);
@@ -1838,7 +1913,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret, u64 prealloc_dest);
+ struct extent_buffer **cow_ret);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2060,7 +2135,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
unsigned long btrfs_force_ra(struct address_space *mapping,
struct file_ra_state *ra, struct file *file,
pgoff_t offset, pgoff_t last_index);
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_delete_inode(struct inode *inode);
void btrfs_put_inode(struct inode *inode);
@@ -2102,7 +2177,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
extern struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
- u64 start, u64 end, u64 inline_limit, u64 *hint_block);
+ u64 start, u64 end, u64 locked_end,
+ u64 inline_limit, u64 *hint_block);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 start, u64 end);
@@ -2133,21 +2209,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
int btrfs_init_acl(struct inode *inode, struct inode *dir);
int btrfs_acl_chmod(struct inode *inode);
-/* free-space-cache.c */
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
- u64 bytenr, u64 size);
-int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes);
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
- u64 bytenr, u64 size);
-int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes);
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
- *block_group);
-struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
- *block_group, u64 offset,
- u64 bytes);
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
- u64 bytes);
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000000..d6c01c096a40
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,668 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/sort.h>
+#include "ctree.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+
+/*
+ * delayed back reference update tracking. For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing. This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ *
+ * Right now this code is only used for reference counted trees, but
+ * the long term goal is to get rid of the similar code for delayed
+ * extent tree modifications.
+ */
+
+/*
+ * entries in the rb tree are ordered by the byte number of the extent
+ * and by the byte number of the parent block.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 parent)
+{
+ if (bytenr < ref->bytenr)
+ return -1;
+ if (bytenr > ref->bytenr)
+ return 1;
+ if (parent < ref->parent)
+ return -1;
+ if (parent > ref->parent)
+ return 1;
+ return 0;
+}
+
+/*
+ * insert a new ref into the rbtree. This returns any existing refs
+ * for the same (bytenr,parent) tuple, or NULL if the new node was properly
+ * inserted.
+ */
+static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+ u64 bytenr, u64 parent,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_delayed_ref_node *entry;
+ int cmp;
+
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+ rb_node);
+
+ cmp = comp_entry(entry, bytenr, parent);
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/*
+ * find an entry based on (bytenr,parent). This returns the delayed
+ * ref if it was able to find one, or NULL if nothing was in that spot
+ */
+static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
+ u64 bytenr, u64 parent,
+ struct btrfs_delayed_ref_node **last)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_delayed_ref_node *entry;
+ int cmp;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ WARN_ON(!entry->in_tree);
+ if (last)
+ *last = entry;
+
+ cmp = comp_entry(entry, bytenr, parent);
+ if (cmp < 0)
+ n = n->rb_left;
+ else if (cmp > 0)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ assert_spin_locked(&delayed_refs->lock);
+ if (mutex_trylock(&head->mutex))
+ return 0;
+
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ mutex_lock(&head->mutex);
+ spin_lock(&delayed_refs->lock);
+ if (!head->node.in_tree) {
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ return -EAGAIN;
+ }
+ btrfs_put_delayed_ref(&head->node);
+ return 0;
+}
+
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+ struct list_head *cluster, u64 start)
+{
+ int count = 0;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *node;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *head;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ if (start == 0) {
+ node = rb_first(&delayed_refs->root);
+ } else {
+ ref = NULL;
+ tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+ if (ref) {
+ struct btrfs_delayed_ref_node *tmp;
+
+ node = rb_prev(&ref->rb_node);
+ while (node) {
+ tmp = rb_entry(node,
+ struct btrfs_delayed_ref_node,
+ rb_node);
+ if (tmp->bytenr < start)
+ break;
+ ref = tmp;
+ node = rb_prev(&ref->rb_node);
+ }
+ node = &ref->rb_node;
+ } else
+ node = rb_first(&delayed_refs->root);
+ }
+again:
+ while (node && count < 32) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ head = btrfs_delayed_node_to_head(ref);
+ if (list_empty(&head->cluster)) {
+ list_add_tail(&head->cluster, cluster);
+ delayed_refs->run_delayed_start =
+ head->node.bytenr;
+ count++;
+
+ WARN_ON(delayed_refs->num_heads_ready == 0);
+ delayed_refs->num_heads_ready--;
+ } else if (count) {
+ /* the goal of the clustering is to find extents
+ * that are likely to end up in the same extent
+ * leaf on disk. So, we don't want them spread
+ * all over the tree. Stop now if we've hit
+ * a head that was already in use
+ */
+ break;
+ }
+ }
+ node = rb_next(node);
+ }
+ if (count) {
+ return 0;
+ } else if (start) {
+ /*
+ * we've gone to the end of the rbtree without finding any
+ * clusters. start from the beginning and try again
+ */
+ start = 0;
+ node = rb_first(&delayed_refs->root);
+ goto again;
+ }
+ return 1;
+}
+
+/*
+ * This checks to see if there are any delayed refs in the
+ * btree for a given bytenr. It returns one if it finds any
+ * and zero otherwise.
+ *
+ * If it only finds a head node, it returns 0.
+ *
+ * The idea is to use this when deciding if you can safely delete an
+ * extent from the extent allocation tree. There may be a pending
+ * ref in the rbtree that adds or removes references, so as long as this
+ * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
+ * allocation tree.
+ */
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *prev_node;
+ int ret = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+ if (ref) {
+ prev_node = rb_prev(&ref->rb_node);
+ if (!prev_node)
+ goto out;
+ ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (ref->bytenr == bytenr)
+ ret = 1;
+ }
+out:
+ spin_unlock(&delayed_refs->lock);
+ return ret;
+}
+
+/*
+ * helper function to lookup reference count
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. This way you
+ * can check to see what the reference count would be if all of the
+ * delayed refs are processed.
+ */
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u32 *refs)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+ u32 num_refs;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ delayed_refs = &trans->transaction->delayed_refs;
+again:
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ } else {
+ num_refs = 0;
+ ret = 0;
+ }
+
+ spin_lock(&delayed_refs->lock);
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+ if (ref) {
+ head = btrfs_delayed_node_to_head(ref);
+ if (mutex_trylock(&head->mutex)) {
+ num_refs += ref->ref_mod;
+ mutex_unlock(&head->mutex);
+ *refs = num_refs;
+ goto out;
+ }
+
+ atomic_inc(&ref->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(root->fs_info->extent_root, path);
+
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(ref);
+ goto again;
+ } else {
+ *refs = num_refs;
+ }
+out:
+ spin_unlock(&delayed_refs->lock);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper function to update an extent delayed ref in the
+ * rbtree. existing and update must both have the same
+ * bytenr and parent
+ *
+ * This may free existing if the update cancels out whatever
+ * operation it was doing.
+ */
+static noinline void
+update_existing_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_node *existing,
+ struct btrfs_delayed_ref_node *update)
+{
+ struct btrfs_delayed_ref *existing_ref;
+ struct btrfs_delayed_ref *ref;
+
+ existing_ref = btrfs_delayed_node_to_ref(existing);
+ ref = btrfs_delayed_node_to_ref(update);
+
+ if (ref->pin)
+ existing_ref->pin = 1;
+
+ if (ref->action != existing_ref->action) {
+ /*
+ * this is effectively undoing either an add or a
+ * drop. We decrement the ref_mod, and if it goes
+ * down to zero we just delete the entry without
+ * every changing the extent allocation tree.
+ */
+ existing->ref_mod--;
+ if (existing->ref_mod == 0) {
+ rb_erase(&existing->rb_node,
+ &delayed_refs->root);
+ existing->in_tree = 0;
+ btrfs_put_delayed_ref(existing);
+ delayed_refs->num_entries--;
+ if (trans->delayed_ref_updates)
+ trans->delayed_ref_updates--;
+ }
+ } else {
+ if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
+ /* if we're adding refs, make sure all the
+ * details match up. The extent could
+ * have been totally freed and reallocated
+ * by a different owner before the delayed
+ * ref entries were removed.
+ */
+ existing_ref->owner_objectid = ref->owner_objectid;
+ existing_ref->generation = ref->generation;
+ existing_ref->root = ref->root;
+ existing->num_bytes = update->num_bytes;
+ }
+ /*
+ * the action on the existing ref matches
+ * the action on the ref we're trying to add.
+ * Bump the ref_mod by one so the backref that
+ * is eventually added/removed has the correct
+ * reference count
+ */
+ existing->ref_mod += update->ref_mod;
+ }
+}
+
+/*
+ * helper function to update the accounting in the head ref
+ * existing and update must have the same bytenr
+ */
+static noinline void
+update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+ struct btrfs_delayed_ref_node *update)
+{
+ struct btrfs_delayed_ref_head *existing_ref;
+ struct btrfs_delayed_ref_head *ref;
+
+ existing_ref = btrfs_delayed_node_to_head(existing);
+ ref = btrfs_delayed_node_to_head(update);
+
+ if (ref->must_insert_reserved) {
+ /* if the extent was freed and then
+ * reallocated before the delayed ref
+ * entries were processed, we can end up
+ * with an existing head ref without
+ * the must_insert_reserved flag set.
+ * Set it again here
+ */
+ existing_ref->must_insert_reserved = ref->must_insert_reserved;
+
+ /*
+ * update the num_bytes so we make sure the accounting
+ * is done correctly
+ */
+ existing->num_bytes = update->num_bytes;
+
+ }
+
+ /*
+ * update the reference mod on the head to reflect this new operation
+ */
+ existing->ref_mod += update->ref_mod;
+}
+
+/*
+ * helper function to actually insert a delayed ref into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count in the head node and properly dealing
+ * with updating existing nodes as new modifications are queued.
+ */
+static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid, int action,
+ int pin)
+{
+ struct btrfs_delayed_ref_node *existing;
+ struct btrfs_delayed_ref *full_ref;
+ struct btrfs_delayed_ref_head *head_ref = NULL;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int count_mod = 1;
+ int must_insert_reserved = 0;
+
+ /*
+ * the head node stores the sum of all the mods, so dropping a ref
+ * should drop the sum in the head node by one.
+ */
+ if (parent == (u64)-1) {
+ if (action == BTRFS_DROP_DELAYED_REF)
+ count_mod = -1;
+ else if (action == BTRFS_UPDATE_DELAYED_HEAD)
+ count_mod = 0;
+ }
+
+ /*
+ * BTRFS_ADD_DELAYED_EXTENT means that we need to update
+ * the reserved accounting when the extent is finally added, or
+ * if a later modification deletes the delayed ref without ever
+ * inserting the extent into the extent allocation tree.
+ * ref->must_insert_reserved is the flag used to record
+ * that accounting mods are required.
+ *
+ * Once we record must_insert_reserved, switch the action to
+ * BTRFS_ADD_DELAYED_REF because other special casing is not required.
+ */
+ if (action == BTRFS_ADD_DELAYED_EXTENT) {
+ must_insert_reserved = 1;
+ action = BTRFS_ADD_DELAYED_REF;
+ } else {
+ must_insert_reserved = 0;
+ }
+
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ /* first set the basic ref node struct up */
+ atomic_set(&ref->refs, 1);
+ ref->bytenr = bytenr;
+ ref->parent = parent;
+ ref->ref_mod = count_mod;
+ ref->in_tree = 1;
+ ref->num_bytes = num_bytes;
+
+ if (btrfs_delayed_ref_is_head(ref)) {
+ head_ref = btrfs_delayed_node_to_head(ref);
+ head_ref->must_insert_reserved = must_insert_reserved;
+ INIT_LIST_HEAD(&head_ref->cluster);
+ mutex_init(&head_ref->mutex);
+ } else {
+ full_ref = btrfs_delayed_node_to_ref(ref);
+ full_ref->root = ref_root;
+ full_ref->generation = ref_generation;
+ full_ref->owner_objectid = owner_objectid;
+ full_ref->pin = pin;
+ full_ref->action = action;
+ }
+
+ existing = tree_insert(&delayed_refs->root, bytenr,
+ parent, &ref->rb_node);
+
+ if (existing) {
+ if (btrfs_delayed_ref_is_head(ref))
+ update_existing_head_ref(existing, ref);
+ else
+ update_existing_ref(trans, delayed_refs, existing, ref);
+
+ /*
+ * we've updated the existing ref, free the newly
+ * allocated ref
+ */
+ kfree(ref);
+ } else {
+ if (btrfs_delayed_ref_is_head(ref)) {
+ delayed_refs->num_heads++;
+ delayed_refs->num_heads_ready++;
+ }
+ delayed_refs->num_entries++;
+ trans->delayed_ref_updates++;
+ }
+ return 0;
+}
+
+/*
+ * add a delayed ref to the tree. This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ */
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid, int action,
+ int pin)
+{
+ struct btrfs_delayed_ref *ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ /*
+ * the parent = 0 case comes from cases where we don't actually
+ * know the parent yet. It will get updated later via a add/drop
+ * pair.
+ */
+ if (parent == 0)
+ parent = bytenr;
+
+ head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ if (!head_ref) {
+ kfree(ref);
+ return -ENOMEM;
+ }
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+ (u64)-1, 0, 0, 0, action, pin);
+ BUG_ON(ret);
+
+ ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+ parent, ref_root, ref_generation,
+ owner_objectid, action, pin);
+ BUG_ON(ret);
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+
+/*
+ * this does a simple search for the head node for a given extent.
+ * It must be called with the delayed ref spinlock held, and it returns
+ * the head node if any where found, or NULL if not.
+ */
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+ if (ref)
+ return btrfs_delayed_node_to_head(ref);
+ return NULL;
+}
+
+/*
+ * add a delayed ref to the tree. This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ *
+ * The main point of this call is to add and remove a backreference in a single
+ * shot, taking the lock only once, and only searching for the head node once.
+ *
+ * It is the same as doing a ref add and delete in two separate calls.
+ */
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 orig_parent,
+ u64 parent, u64 orig_ref_root, u64 ref_root,
+ u64 orig_ref_generation, u64 ref_generation,
+ u64 owner_objectid, int pin)
+{
+ struct btrfs_delayed_ref *ref;
+ struct btrfs_delayed_ref *old_ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
+ if (!old_ref) {
+ kfree(ref);
+ return -ENOMEM;
+ }
+
+ /*
+ * the parent = 0 case comes from cases where we don't actually
+ * know the parent yet. It will get updated later via a add/drop
+ * pair.
+ */
+ if (parent == 0)
+ parent = bytenr;
+ if (orig_parent == 0)
+ orig_parent = bytenr;
+
+ head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ if (!head_ref) {
+ kfree(ref);
+ kfree(old_ref);
+ return -ENOMEM;
+ }
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+ (u64)-1, 0, 0, 0,
+ BTRFS_UPDATE_DELAYED_HEAD, 0);
+ BUG_ON(ret);
+
+ ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+ parent, ref_root, ref_generation,
+ owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
+ BUG_ON(ret);
+
+ ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
+ orig_parent, orig_ref_root,
+ orig_ref_generation, owner_objectid,
+ BTRFS_DROP_DELAYED_REF, pin);
+ BUG_ON(ret);
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000000..3bec2ff0b15c
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __DELAYED_REF__
+#define __DELAYED_REF__
+
+/* these are the possible values of struct btrfs_delayed_ref->action */
+#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
+#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
+#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+
+struct btrfs_delayed_ref_node {
+ struct rb_node rb_node;
+
+ /* the starting bytenr of the extent */
+ u64 bytenr;
+
+ /* the parent our backref will point to */
+ u64 parent;
+
+ /* the size of the extent */
+ u64 num_bytes;
+
+ /* ref count on this data structure */
+ atomic_t refs;
+
+ /*
+ * how many refs is this entry adding or deleting. For
+ * head refs, this may be a negative number because it is keeping
+ * track of the total mods done to the reference count.
+ * For individual refs, this will always be a positive number
+ *
+ * It may be more than one, since it is possible for a single
+ * parent to have more than one ref on an extent
+ */
+ int ref_mod;
+
+ /* is this node still in the rbtree? */
+ unsigned int in_tree:1;
+};
+
+/*
+ * the head refs are used to hold a lock on a given extent, which allows us
+ * to make sure that only one process is running the delayed refs
+ * at a time for a single extent. They also store the sum of all the
+ * reference count modifications we've queued up.
+ */
+struct btrfs_delayed_ref_head {
+ struct btrfs_delayed_ref_node node;
+
+ /*
+ * the mutex is held while running the refs, and it is also
+ * held when checking the sum of reference modifications.
+ */
+ struct mutex mutex;
+
+ struct list_head cluster;
+
+ /*
+ * when a new extent is allocated, it is just reserved in memory
+ * The actual extent isn't inserted into the extent allocation tree
+ * until the delayed ref is processed. must_insert_reserved is
+ * used to flag a delayed ref so the accounting can be updated
+ * when a full insert is done.
+ *
+ * It is possible the extent will be freed before it is ever
+ * inserted into the extent allocation tree. In this case
+ * we need to update the in ram accounting to properly reflect
+ * the free has happened.
+ */
+ unsigned int must_insert_reserved:1;
+};
+
+struct btrfs_delayed_ref {
+ struct btrfs_delayed_ref_node node;
+
+ /* the root objectid our ref will point to */
+ u64 root;
+
+ /* the generation for the backref */
+ u64 generation;
+
+ /* owner_objectid of the backref */
+ u64 owner_objectid;
+
+ /* operation done by this entry in the rbtree */
+ u8 action;
+
+ /* if pin == 1, when the extent is freed it will be pinned until
+ * transaction commit
+ */
+ unsigned int pin:1;
+};
+
+struct btrfs_delayed_ref_root {
+ struct rb_root root;
+
+ /* this spin lock protects the rbtree and the entries inside */
+ spinlock_t lock;
+
+ /* how many delayed ref updates we've queued, used by the
+ * throttling code
+ */
+ unsigned long num_entries;
+
+ /* total number of head nodes in tree */
+ unsigned long num_heads;
+
+ /* total number of head nodes ready for processing */
+ unsigned long num_heads_ready;
+
+ /*
+ * set when the tree is flushing before a transaction commit,
+ * used by the throttling code to decide if new updates need
+ * to be run right away
+ */
+ int flushing;
+
+ u64 run_delayed_start;
+};
+
+static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+ WARN_ON(atomic_read(&ref->refs) == 0);
+ if (atomic_dec_and_test(&ref->refs)) {
+ WARN_ON(ref->in_tree);
+ kfree(ref);
+ }
+}
+
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid, int action,
+ int pin);
+
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u32 *refs);
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 orig_parent,
+ u64 parent, u64 orig_ref_root, u64 ref_root,
+ u64 orig_ref_generation, u64 ref_generation,
+ u64 owner_objectid, int pin);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head);
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+ struct list_head *cluster, u64 search_start);
+/*
+ * a node might live in a head or a regular ref, this lets you
+ * test for the proper type to use.
+ */
+static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
+{
+ return node->parent == (u64)-1;
+}
+
+/*
+ * helper functions to cast a node into its container
+ */
+static inline struct btrfs_delayed_ref *
+btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_ref, node);
+
+}
+
+static inline struct btrfs_delayed_ref_head *
+btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(!btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_ref_head, node);
+
+}
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7d..1d70236ba00c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
key.objectid = dir;
btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
key.offset = btrfs_name_hash(name, name_len);
+
path = btrfs_alloc_path();
+ path->leave_spinning = 1;
+
data_size = sizeof(*dir_item) + name_len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6ec80c0fc869..0ff16d3331da 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
#include "locking.h"
#include "ref-cache.h"
#include "tree-log.h"
+#include "free-space-cache.h"
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
@@ -231,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- printk(KERN_INFO "btrfs: %s checksum verify failed "
- "on %llu wanted %X found %X level %d\n",
- root->fs_info->sb->s_id,
- buf->start, val, found, btrfs_header_level(buf));
+ if (printk_ratelimit()) {
+ printk(KERN_INFO "btrfs: %s checksum verify "
+ "failed on %llu wanted %X found %X "
+ "level %d\n",
+ root->fs_info->sb->s_id,
+ (unsigned long long)buf->start, val, found,
+ btrfs_header_level(buf));
+ }
if (result != (char *)&inline_result)
kfree(result);
return 1;
@@ -267,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 0;
goto out;
}
- printk("parent transid verify failed on %llu wanted %llu found %llu\n",
- (unsigned long long)eb->start,
- (unsigned long long)parent_transid,
- (unsigned long long)btrfs_header_generation(eb));
+ if (printk_ratelimit()) {
+ printk("parent transid verify failed on %llu wanted %llu "
+ "found %llu\n",
+ (unsigned long long)eb->start,
+ (unsigned long long)parent_transid,
+ (unsigned long long)btrfs_header_generation(eb));
+ }
ret = 1;
clear_extent_buffer_uptodate(io_tree, eb);
out:
@@ -414,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
- printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
- (unsigned long long)found_start,
- (unsigned long long)eb->start);
+ if (printk_ratelimit()) {
+ printk(KERN_INFO "btrfs bad tree block start "
+ "%llu %llu\n",
+ (unsigned long long)found_start,
+ (unsigned long long)eb->start);
+ }
ret = -EIO;
goto err;
}
@@ -428,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
goto err;
}
if (check_tree_block_fsid(root, eb)) {
- printk(KERN_INFO "btrfs bad fsid on block %llu\n",
- (unsigned long long)eb->start);
+ if (printk_ratelimit()) {
+ printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+ (unsigned long long)eb->start);
+ }
ret = -EIO;
goto err;
}
@@ -578,19 +591,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->bio_flags = bio_flags;
atomic_inc(&fs_info->nr_async_submits);
+
+ if (rw & (1 << BIO_RW_SYNCIO))
+ btrfs_set_work_high_prio(&async->work);
+
btrfs_queue_worker(&fs_info->workers, &async->work);
-#if 0
- int limit = btrfs_async_submit_limit(fs_info);
- if (atomic_read(&fs_info->nr_async_submits) > limit) {
- wait_event_timeout(fs_info->async_submit_wait,
- (atomic_read(&fs_info->nr_async_submits) < limit),
- HZ/10);
- wait_event_timeout(fs_info->async_submit_wait,
- (atomic_read(&fs_info->nr_async_bios) < limit),
- HZ/10);
- }
-#endif
while (atomic_read(&fs_info->async_submit_draining) &&
atomic_read(&fs_info->nr_async_submits)) {
wait_event(fs_info->async_submit_wait,
@@ -655,6 +661,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
mirror_num, 0);
}
+
/*
* kthread helpers are used to submit writes so that checksumming
* can happen in parallel across all CPUs
@@ -668,14 +675,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
static int btree_writepage(struct page *page, struct writeback_control *wbc)
{
struct extent_io_tree *tree;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ struct extent_buffer *eb;
+ int was_dirty;
+
tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (!(current->flags & PF_MEMALLOC)) {
+ return extent_write_full_page(tree, page,
+ btree_get_extent, wbc);
+ }
- if (current->flags & PF_MEMALLOC) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
+ redirty_page_for_writepage(wbc, page);
+ eb = btrfs_find_tree_block(root, page_offset(page),
+ PAGE_CACHE_SIZE);
+ WARN_ON(!eb);
+
+ was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+ if (!was_dirty) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
+ spin_unlock(&root->fs_info->delalloc_lock);
}
- return extent_write_full_page(tree, page, btree_get_extent, wbc);
+ free_extent_buffer(eb);
+
+ unlock_page(page);
+ return 0;
}
static int btree_writepages(struct address_space *mapping,
@@ -684,15 +708,15 @@ static int btree_writepages(struct address_space *mapping,
struct extent_io_tree *tree;
tree = &BTRFS_I(mapping->host)->io_tree;
if (wbc->sync_mode == WB_SYNC_NONE) {
+ struct btrfs_root *root = BTRFS_I(mapping->host)->root;
u64 num_dirty;
- u64 start = 0;
unsigned long thresh = 32 * 1024 * 1024;
if (wbc->for_kupdate)
return 0;
- num_dirty = count_range_bits(tree, &start, (u64)-1,
- thresh, EXTENT_DIRTY);
+ /* this is a bit racy, but that's ok */
+ num_dirty = root->fs_info->dirty_metadata_bytes;
if (num_dirty < thresh)
return 0;
}
@@ -747,27 +771,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
}
}
-#if 0
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct buffer_head *bh;
- struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- struct buffer_head *head;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, root->fs_info->sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
- }
- head = page_buffers(page);
- bh = head;
- do {
- if (buffer_dirty(bh))
- csum_tree_block(root, bh, 0);
- bh = bh->b_this_page;
- } while (bh != head);
- return block_write_full_page(page, btree_get_block, wbc);
-}
-#endif
-
static struct address_space_operations btree_aops = {
.readpage = btree_readpage,
.writepage = btree_writepage,
@@ -859,9 +862,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
root->fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
- /* ugh, clear_extent_buffer_dirty can be expensive */
- btrfs_set_lock_blocking(buf);
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (root->fs_info->dirty_metadata_bytes >= buf->len)
+ root->fs_info->dirty_metadata_bytes -= buf->len;
+ else
+ WARN_ON(1);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+ /* ugh, clear_extent_buffer_dirty needs to lock the page */
+ btrfs_set_lock_blocking(buf);
clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
buf);
}
@@ -1247,11 +1258,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
int ret = 0;
struct btrfs_device *device;
struct backing_dev_info *bdi;
-#if 0
- if ((bdi_bits & (1 << BDI_write_congested)) &&
- btrfs_congested_async(info, 0))
- return 1;
-#endif
+
list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
if (!device->bdev)
continue;
@@ -1387,8 +1394,6 @@ static int bio_ready_for_csum(struct bio *bio)
ret = extent_range_uptodate(io_tree, start + length,
start + buf_len - 1);
- if (ret == 1)
- return ret;
return ret;
}
@@ -1471,12 +1476,6 @@ static int transaction_kthread(void *arg)
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
- if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
- printk(KERN_INFO "btrfs: total reference cache "
- "size %llu\n",
- root->fs_info->total_ref_cache_size);
- }
-
mutex_lock(&root->fs_info->trans_mutex);
cur = root->fs_info->running_transaction;
if (!cur) {
@@ -1493,6 +1492,7 @@ static int transaction_kthread(void *arg)
mutex_unlock(&root->fs_info->trans_mutex);
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
+
sleep:
wake_up_process(root->fs_info->cleaner_kthread);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1552,6 +1552,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->hashers);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+ INIT_LIST_HEAD(&fs_info->ordered_operations);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->new_trans_lock);
spin_lock_init(&fs_info->ref_cache_lock);
@@ -1579,6 +1580,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->btree_inode = new_inode(sb);
fs_info->btree_inode->i_ino = 1;
fs_info->btree_inode->i_nlink = 1;
+ fs_info->metadata_ratio = 8;
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
@@ -1611,10 +1613,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
extent_io_tree_init(&fs_info->pinned_extents,
fs_info->btree_inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(&fs_info->pending_del,
- fs_info->btree_inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(&fs_info->extent_ins,
- fs_info->btree_inode->i_mapping, GFP_NOFS);
fs_info->do_barriers = 1;
INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1627,15 +1625,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
insert_inode_hash(fs_info->btree_inode);
mutex_init(&fs_info->trans_mutex);
+ mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
mutex_init(&fs_info->drop_mutex);
- mutex_init(&fs_info->extent_ins_mutex);
- mutex_init(&fs_info->pinned_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
mutex_init(&fs_info->tree_reloc_mutex);
+
+ btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+ btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
init_waitqueue_head(&fs_info->transaction_throttle);
init_waitqueue_head(&fs_info->transaction_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
@@ -1670,7 +1671,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (features) {
printk(KERN_ERR "BTRFS: couldn't mount because of "
"unsupported optional features (%Lx).\n",
- features);
+ (unsigned long long)features);
err = -EINVAL;
goto fail_iput;
}
@@ -1680,7 +1681,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!(sb->s_flags & MS_RDONLY) && features) {
printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
"unsupported option features (%Lx).\n",
- features);
+ (unsigned long long)features);
err = -EINVAL;
goto fail_iput;
}
@@ -2076,10 +2077,10 @@ static int write_dev_supers(struct btrfs_device *device,
device->barriers = 0;
get_bh(bh);
lock_buffer(bh);
- ret = submit_bh(WRITE, bh);
+ ret = submit_bh(WRITE_SYNC, bh);
}
} else {
- ret = submit_bh(WRITE, bh);
+ ret = submit_bh(WRITE_SYNC, bh);
}
if (!ret && wait) {
@@ -2272,7 +2273,7 @@ int close_ctree(struct btrfs_root *root)
if (fs_info->delalloc_bytes) {
printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
- fs_info->delalloc_bytes);
+ (unsigned long long)fs_info->delalloc_bytes);
}
if (fs_info->total_ref_cache_size) {
printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
@@ -2309,16 +2310,6 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
-#if 0
- while (!list_empty(&fs_info->hashers)) {
- struct btrfs_hasher *hasher;
- hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
- hashers);
- list_del(&hasher->hashers);
- crypto_free_hash(&fs_info->hash_tfm);
- kfree(hasher);
- }
-#endif
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2358,8 +2349,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
u64 transid = btrfs_header_generation(buf);
struct inode *btree_inode = root->fs_info->btree_inode;
-
- btrfs_set_lock_blocking(buf);
+ int was_dirty;
btrfs_assert_tree_locked(buf);
if (transid != root->fs_info->generation) {
@@ -2370,7 +2360,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
(unsigned long long)root->fs_info->generation);
WARN_ON(1);
}
- set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+ was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+ buf);
+ if (!was_dirty) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ root->fs_info->dirty_metadata_bytes += buf->len;
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
}
void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2410,6 +2406,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
int btree_lock_page_hook(struct page *page)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_buffer *eb;
unsigned long len;
@@ -2425,6 +2422,16 @@ int btree_lock_page_hook(struct page *page)
btrfs_tree_lock(eb);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (root->fs_info->dirty_metadata_bytes >= eb->len)
+ root->fs_info->dirty_metadata_bytes -= eb->len;
+ else
+ WARN_ON(1);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95029db227be..c958ecbc1916 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
int wait_on_tree_block_writeback(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fefe83ad2059..e4966444811b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
#include "volumes.h"
#include "locking.h"
#include "ref-cache.h"
+#include "free-space-cache.h"
#define PENDING_EXTENT_INSERT 0
#define PENDING_EXTENT_DELETE 1
@@ -49,17 +50,23 @@ struct pending_extent_op {
int del;
};
-static int finish_current_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all);
-static int del_pending_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int is_data);
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner, struct btrfs_key *ins,
+ int ref_mod);
+static int update_reserved_extents(struct btrfs_root *root,
+ u64 bytenr, u64 num, int reserve);
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc,
int mark_free);
+static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid, int pin,
+ int ref_to_drop);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -160,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
u64 extent_start, extent_end, size;
int ret;
- mutex_lock(&info->pinned_mutex);
while (start < end) {
ret = find_first_extent_bit(&info->pinned_extents, start,
&extent_start, &extent_end,
@@ -186,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
ret = btrfs_add_free_space(block_group, start, size);
BUG_ON(ret);
}
- mutex_unlock(&info->pinned_mutex);
return 0;
}
@@ -285,8 +290,8 @@ next:
block_group->key.objectid +
block_group->key.offset);
- remove_sb_from_cache(root, block_group);
block_group->cached = 1;
+ remove_sb_from_cache(root, block_group);
ret = 0;
err:
btrfs_free_path(path);
@@ -320,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
return cache;
}
-static inline void put_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
if (atomic_dec_and_test(&cache->count))
kfree(cache);
@@ -393,12 +398,12 @@ again:
div_factor(cache->key.offset, factor)) {
group_start = cache->key.objectid;
spin_unlock(&cache->lock);
- put_block_group(cache);
+ btrfs_put_block_group(cache);
goto found;
}
}
spin_unlock(&cache->lock);
- put_block_group(cache);
+ btrfs_put_block_group(cache);
cond_resched();
}
if (!wrapped) {
@@ -554,262 +559,13 @@ out:
return ret;
}
-/*
- * updates all the backrefs that are pending on update_list for the
- * extent_root
- */
-static noinline int update_backrefs(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_path *path,
- struct list_head *update_list)
-{
- struct btrfs_key key;
- struct btrfs_extent_ref *ref;
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct pending_extent_op *op;
- struct extent_buffer *leaf;
- int ret = 0;
- struct list_head *cur = update_list->next;
- u64 ref_objectid;
- u64 ref_root = extent_root->root_key.objectid;
-
- op = list_entry(cur, struct pending_extent_op, list);
-
-search:
- key.objectid = op->bytenr;
- key.type = BTRFS_EXTENT_REF_KEY;
- key.offset = op->orig_parent;
-
- ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
- BUG_ON(ret);
-
- leaf = path->nodes[0];
-
-loop:
- ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-
- ref_objectid = btrfs_ref_objectid(leaf, ref);
-
- if (btrfs_ref_root(leaf, ref) != ref_root ||
- btrfs_ref_generation(leaf, ref) != op->orig_generation ||
- (ref_objectid != op->level &&
- ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
- printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
- "root %llu, owner %u\n",
- (unsigned long long)op->bytenr,
- (unsigned long long)op->orig_parent,
- (unsigned long long)ref_root, op->level);
- btrfs_print_leaf(extent_root, leaf);
- BUG();
- }
-
- key.objectid = op->bytenr;
- key.offset = op->parent;
- key.type = BTRFS_EXTENT_REF_KEY;
- ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
- BUG_ON(ret);
- ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
- btrfs_set_ref_generation(leaf, ref, op->generation);
-
- cur = cur->next;
-
- list_del_init(&op->list);
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1, GFP_NOFS);
- kfree(op);
-
- if (cur == update_list) {
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(extent_root, path);
- goto out;
- }
-
- op = list_entry(cur, struct pending_extent_op, list);
-
- path->slots[0]++;
- while (path->slots[0] < btrfs_header_nritems(leaf)) {
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid == op->bytenr &&
- key.type == BTRFS_EXTENT_REF_KEY)
- goto loop;
- path->slots[0]++;
- }
-
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(extent_root, path);
- goto search;
-
-out:
- return 0;
-}
-
-static noinline int insert_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_path *path,
- struct list_head *insert_list, int nr)
-{
- struct btrfs_key *keys;
- u32 *data_size;
- struct pending_extent_op *op;
- struct extent_buffer *leaf;
- struct list_head *cur = insert_list->next;
- struct btrfs_fs_info *info = extent_root->fs_info;
- u64 ref_root = extent_root->root_key.objectid;
- int i = 0, last = 0, ret;
- int total = nr * 2;
-
- if (!nr)
- return 0;
-
- keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
- if (!keys)
- return -ENOMEM;
-
- data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
- if (!data_size) {
- kfree(keys);
- return -ENOMEM;
- }
-
- list_for_each_entry(op, insert_list, list) {
- keys[i].objectid = op->bytenr;
- keys[i].offset = op->num_bytes;
- keys[i].type = BTRFS_EXTENT_ITEM_KEY;
- data_size[i] = sizeof(struct btrfs_extent_item);
- i++;
-
- keys[i].objectid = op->bytenr;
- keys[i].offset = op->parent;
- keys[i].type = BTRFS_EXTENT_REF_KEY;
- data_size[i] = sizeof(struct btrfs_extent_ref);
- i++;
- }
-
- op = list_entry(cur, struct pending_extent_op, list);
- i = 0;
- while (i < total) {
- int c;
- ret = btrfs_insert_some_items(trans, extent_root, path,
- keys+i, data_size+i, total-i);
- BUG_ON(ret < 0);
-
- if (last && ret > 1)
- BUG();
-
- leaf = path->nodes[0];
- for (c = 0; c < ret; c++) {
- int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
-
- /*
- * if the first item we inserted was a backref, then
- * the EXTENT_ITEM will be the odd c's, else it will
- * be the even c's
- */
- if ((ref_first && (c % 2)) ||
- (!ref_first && !(c % 2))) {
- struct btrfs_extent_item *itm;
-
- itm = btrfs_item_ptr(leaf, path->slots[0] + c,
- struct btrfs_extent_item);
- btrfs_set_extent_refs(path->nodes[0], itm, 1);
- op->del++;
- } else {
- struct btrfs_extent_ref *ref;
-
- ref = btrfs_item_ptr(leaf, path->slots[0] + c,
- struct btrfs_extent_ref);
- btrfs_set_ref_root(leaf, ref, ref_root);
- btrfs_set_ref_generation(leaf, ref,
- op->generation);
- btrfs_set_ref_objectid(leaf, ref, op->level);
- btrfs_set_ref_num_refs(leaf, ref, 1);
- op->del++;
- }
-
- /*
- * using del to see when its ok to free up the
- * pending_extent_op. In the case where we insert the
- * last item on the list in order to help do batching
- * we need to not free the extent op until we actually
- * insert the extent_item
- */
- if (op->del == 2) {
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1,
- GFP_NOFS);
- cur = cur->next;
- list_del_init(&op->list);
- kfree(op);
- if (cur != insert_list)
- op = list_entry(cur,
- struct pending_extent_op,
- list);
- }
- }
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(extent_root, path);
-
- /*
- * Ok backref's and items usually go right next to eachother,
- * but if we could only insert 1 item that means that we
- * inserted on the end of a leaf, and we have no idea what may
- * be on the next leaf so we just play it safe. In order to
- * try and help this case we insert the last thing on our
- * insert list so hopefully it will end up being the last
- * thing on the leaf and everything else will be before it,
- * which will let us insert a whole bunch of items at the same
- * time.
- */
- if (ret == 1 && !last && (i + ret < total)) {
- /*
- * last: where we will pick up the next time around
- * i: our current key to insert, will be total - 1
- * cur: the current op we are screwing with
- * op: duh
- */
- last = i + ret;
- i = total - 1;
- cur = insert_list->prev;
- op = list_entry(cur, struct pending_extent_op, list);
- } else if (last) {
- /*
- * ok we successfully inserted the last item on the
- * list, lets reset everything
- *
- * i: our current key to insert, so where we left off
- * last time
- * last: done with this
- * cur: the op we are messing with
- * op: duh
- * total: since we inserted the last key, we need to
- * decrement total so we dont overflow
- */
- i = last;
- last = 0;
- total--;
- if (i < total) {
- cur = insert_list->next;
- op = list_entry(cur, struct pending_extent_op,
- list);
- }
- } else {
- i += ret;
- }
-
- cond_resched();
- }
- ret = 0;
- kfree(keys);
- kfree(data_size);
- return ret;
-}
-
static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
u64 bytenr, u64 parent,
u64 ref_root, u64 ref_generation,
- u64 owner_objectid)
+ u64 owner_objectid,
+ int refs_to_add)
{
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -829,9 +585,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
btrfs_set_ref_root(leaf, ref, ref_root);
btrfs_set_ref_generation(leaf, ref, ref_generation);
btrfs_set_ref_objectid(leaf, ref, owner_objectid);
- btrfs_set_ref_num_refs(leaf, ref, 1);
+ btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
} else if (ret == -EEXIST) {
u64 existing_owner;
+
BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -845,7 +602,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
num_refs = btrfs_ref_num_refs(leaf, ref);
BUG_ON(num_refs == 0);
- btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+ btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
existing_owner = btrfs_ref_objectid(leaf, ref);
if (existing_owner != owner_objectid &&
@@ -857,6 +614,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
} else {
goto out;
}
+ btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
btrfs_release_path(root, path);
@@ -865,7 +623,8 @@ out:
static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ int refs_to_drop)
{
struct extent_buffer *leaf;
struct btrfs_extent_ref *ref;
@@ -875,8 +634,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
num_refs = btrfs_ref_num_refs(leaf, ref);
- BUG_ON(num_refs == 0);
- num_refs -= 1;
+ BUG_ON(num_refs < refs_to_drop);
+ num_refs -= refs_to_drop;
if (num_refs == 0) {
ret = btrfs_del_item(trans, root, path);
} else {
@@ -927,332 +686,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
#endif
}
-static noinline int free_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct list_head *del_list)
-{
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_path *path;
- struct btrfs_key key, found_key;
- struct extent_buffer *leaf;
- struct list_head *cur;
- struct pending_extent_op *op;
- struct btrfs_extent_item *ei;
- int ret, num_to_del, extent_slot = 0, found_extent = 0;
- u32 refs;
- u64 bytes_freed = 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = 1;
-
-search:
- /* search for the backref for the current ref we want to delete */
- cur = del_list->next;
- op = list_entry(cur, struct pending_extent_op, list);
- ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
- op->orig_parent,
- extent_root->root_key.objectid,
- op->orig_generation, op->level, 1);
- if (ret) {
- printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
- "root %llu gen %llu owner %u\n",
- (unsigned long long)op->bytenr,
- (unsigned long long)extent_root->root_key.objectid,
- (unsigned long long)op->orig_generation, op->level);
- btrfs_print_leaf(extent_root, path->nodes[0]);
- WARN_ON(1);
- goto out;
- }
-
- extent_slot = path->slots[0];
- num_to_del = 1;
- found_extent = 0;
-
- /*
- * if we aren't the first item on the leaf we can move back one and see
- * if our ref is right next to our extent item
- */
- if (likely(extent_slot)) {
- extent_slot--;
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- extent_slot);
- if (found_key.objectid == op->bytenr &&
- found_key.type == BTRFS_EXTENT_ITEM_KEY &&
- found_key.offset == op->num_bytes) {
- num_to_del++;
- found_extent = 1;
- }
- }
-
- /*
- * if we didn't find the extent we need to delete the backref and then
- * search for the extent item key so we can update its ref count
- */
- if (!found_extent) {
- key.objectid = op->bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = op->num_bytes;
-
- ret = remove_extent_backref(trans, extent_root, path);
- BUG_ON(ret);
- btrfs_release_path(extent_root, path);
- ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
- BUG_ON(ret);
- extent_slot = path->slots[0];
- }
-
- /* this is where we update the ref count for the extent */
- leaf = path->nodes[0];
- ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
- refs = btrfs_extent_refs(leaf, ei);
- BUG_ON(refs == 0);
- refs--;
- btrfs_set_extent_refs(leaf, ei, refs);
-
- btrfs_mark_buffer_dirty(leaf);
-
- /*
- * This extent needs deleting. The reason cur_slot is extent_slot +
- * num_to_del is because extent_slot points to the slot where the extent
- * is, and if the backref was not right next to the extent we will be
- * deleting at least 1 item, and will want to start searching at the
- * slot directly next to extent_slot. However if we did find the
- * backref next to the extent item them we will be deleting at least 2
- * items and will want to start searching directly after the ref slot
- */
- if (!refs) {
- struct list_head *pos, *n, *end;
- int cur_slot = extent_slot+num_to_del;
- u64 super_used;
- u64 root_used;
-
- path->slots[0] = extent_slot;
- bytes_freed = op->num_bytes;
-
- mutex_lock(&info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root, op->bytenr,
- op->num_bytes, op->level >=
- BTRFS_FIRST_FREE_OBJECTID);
- mutex_unlock(&info->pinned_mutex);
- BUG_ON(ret < 0);
- op->del = ret;
-
- /*
- * we need to see if we can delete multiple things at once, so
- * start looping through the list of extents we are wanting to
- * delete and see if their extent/backref's are right next to
- * eachother and the extents only have 1 ref
- */
- for (pos = cur->next; pos != del_list; pos = pos->next) {
- struct pending_extent_op *tmp;
-
- tmp = list_entry(pos, struct pending_extent_op, list);
-
- /* we only want to delete extent+ref at this stage */
- if (cur_slot >= btrfs_header_nritems(leaf) - 1)
- break;
-
- btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
- if (found_key.objectid != tmp->bytenr ||
- found_key.type != BTRFS_EXTENT_ITEM_KEY ||
- found_key.offset != tmp->num_bytes)
- break;
-
- /* check to make sure this extent only has one ref */
- ei = btrfs_item_ptr(leaf, cur_slot,
- struct btrfs_extent_item);
- if (btrfs_extent_refs(leaf, ei) != 1)
- break;
-
- btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
- if (found_key.objectid != tmp->bytenr ||
- found_key.type != BTRFS_EXTENT_REF_KEY ||
- found_key.offset != tmp->orig_parent)
- break;
-
- /*
- * the ref is right next to the extent, we can set the
- * ref count to 0 since we will delete them both now
- */
- btrfs_set_extent_refs(leaf, ei, 0);
-
- /* pin down the bytes for this extent */
- mutex_lock(&info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
- tmp->num_bytes, tmp->level >=
- BTRFS_FIRST_FREE_OBJECTID);
- mutex_unlock(&info->pinned_mutex);
- BUG_ON(ret < 0);
-
- /*
- * use the del field to tell if we need to go ahead and
- * free up the extent when we delete the item or not.
- */
- tmp->del = ret;
- bytes_freed += tmp->num_bytes;
-
- num_to_del += 2;
- cur_slot += 2;
- }
- end = pos;
-
- /* update the free space counters */
- spin_lock(&info->delalloc_lock);
- super_used = btrfs_super_bytes_used(&info->super_copy);
- btrfs_set_super_bytes_used(&info->super_copy,
- super_used - bytes_freed);
-
- root_used = btrfs_root_used(&extent_root->root_item);
- btrfs_set_root_used(&extent_root->root_item,
- root_used - bytes_freed);
- spin_unlock(&info->delalloc_lock);
-
- /* delete the items */
- ret = btrfs_del_items(trans, extent_root, path,
- path->slots[0], num_to_del);
- BUG_ON(ret);
-
- /*
- * loop through the extents we deleted and do the cleanup work
- * on them
- */
- for (pos = cur, n = pos->next; pos != end;
- pos = n, n = pos->next) {
- struct pending_extent_op *tmp;
- tmp = list_entry(pos, struct pending_extent_op, list);
-
- /*
- * remember tmp->del tells us wether or not we pinned
- * down the extent
- */
- ret = update_block_group(trans, extent_root,
- tmp->bytenr, tmp->num_bytes, 0,
- tmp->del);
- BUG_ON(ret);
-
- list_del_init(&tmp->list);
- unlock_extent(&info->extent_ins, tmp->bytenr,
- tmp->bytenr + tmp->num_bytes - 1,
- GFP_NOFS);
- kfree(tmp);
- }
- } else if (refs && found_extent) {
- /*
- * the ref and extent were right next to eachother, but the
- * extent still has a ref, so just free the backref and keep
- * going
- */
- ret = remove_extent_backref(trans, extent_root, path);
- BUG_ON(ret);
-
- list_del_init(&op->list);
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1, GFP_NOFS);
- kfree(op);
- } else {
- /*
- * the extent has multiple refs and the backref we were looking
- * for was not right next to it, so just unlock and go next,
- * we're good to go
- */
- list_del_init(&op->list);
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1, GFP_NOFS);
- kfree(op);
- }
-
- btrfs_release_path(extent_root, path);
- if (!list_empty(del_list))
- goto search;
-
-out:
- btrfs_free_path(path);
- return ret;
-}
-
static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes,
u64 orig_parent, u64 parent,
u64 orig_root, u64 ref_root,
u64 orig_generation, u64 ref_generation,
u64 owner_objectid)
{
int ret;
- struct btrfs_root *extent_root = root->fs_info->extent_root;
- struct btrfs_path *path;
+ int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
- if (root == root->fs_info->extent_root) {
- struct pending_extent_op *extent_op;
- u64 num_bytes;
-
- BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
- num_bytes = btrfs_level_size(root, (int)owner_objectid);
- mutex_lock(&root->fs_info->extent_ins_mutex);
- if (test_range_bit(&root->fs_info->extent_ins, bytenr,
- bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
- u64 priv;
- ret = get_state_private(&root->fs_info->extent_ins,
- bytenr, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)
- (unsigned long)priv;
- BUG_ON(extent_op->parent != orig_parent);
- BUG_ON(extent_op->generation != orig_generation);
-
- extent_op->parent = parent;
- extent_op->generation = ref_generation;
- } else {
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
-
- extent_op->type = PENDING_BACKREF_UPDATE;
- extent_op->bytenr = bytenr;
- extent_op->num_bytes = num_bytes;
- extent_op->parent = parent;
- extent_op->orig_parent = orig_parent;
- extent_op->generation = ref_generation;
- extent_op->orig_generation = orig_generation;
- extent_op->level = (int)owner_objectid;
- INIT_LIST_HEAD(&extent_op->list);
- extent_op->del = 0;
-
- set_extent_bits(&root->fs_info->extent_ins,
- bytenr, bytenr + num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- set_state_private(&root->fs_info->extent_ins,
- bytenr, (unsigned long)extent_op);
- }
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- return 0;
- }
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- ret = lookup_extent_backref(trans, extent_root, path,
- bytenr, orig_parent, orig_root,
- orig_generation, owner_objectid, 1);
- if (ret)
- goto out;
- ret = remove_extent_backref(trans, extent_root, path);
- if (ret)
- goto out;
- ret = insert_extent_backref(trans, extent_root, path, bytenr,
- parent, ref_root, ref_generation,
- owner_objectid);
+ ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
+ orig_parent, parent, orig_root,
+ ref_root, orig_generation,
+ ref_generation, owner_objectid, pin);
BUG_ON(ret);
- finish_current_insert(trans, extent_root, 0);
- del_pending_extents(trans, extent_root, 0);
-out:
- btrfs_free_path(path);
return ret;
}
int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
- u64 orig_parent, u64 parent,
+ u64 num_bytes, u64 orig_parent, u64 parent,
u64 ref_root, u64 ref_generation,
u64 owner_objectid)
{
@@ -1260,20 +715,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
return 0;
- ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
- parent, ref_root, ref_root,
- ref_generation, ref_generation,
- owner_objectid);
+
+ ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
+ orig_parent, parent, ref_root,
+ ref_root, ref_generation,
+ ref_generation, owner_objectid);
return ret;
}
-
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes,
u64 orig_parent, u64 parent,
u64 orig_root, u64 ref_root,
u64 orig_generation, u64 ref_generation,
u64 owner_objectid)
{
+ int ret;
+
+ ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
+ ref_generation, owner_objectid,
+ BTRFS_ADD_DELAYED_REF, 0);
+ BUG_ON(ret);
+ return ret;
+}
+
+static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid,
+ int refs_to_add)
+{
struct btrfs_path *path;
int ret;
struct btrfs_key key;
@@ -1286,17 +757,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = 1;
+ path->leave_spinning = 1;
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = (u64)-1;
+ key.offset = num_bytes;
- ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
- 0, 1);
- if (ret < 0)
+ /* first find the extent item and update its reference count */
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+ path, 0, 1);
+ if (ret < 0) {
+ btrfs_set_path_blocking(path);
return ret;
- BUG_ON(ret == 0 || path->slots[0] == 0);
+ }
- path->slots[0]--;
+ if (ret > 0) {
+ WARN_ON(1);
+ btrfs_free_path(path);
+ return -EIO;
+ }
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1310,21 +788,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+
refs = btrfs_extent_refs(l, item);
- btrfs_set_extent_refs(l, item, refs + 1);
+ btrfs_set_extent_refs(l, item, refs + refs_to_add);
+ btrfs_unlock_up_safe(path, 1);
+
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(root->fs_info->extent_root, path);
path->reada = 1;
+ path->leave_spinning = 1;
+
+ /* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
path, bytenr, parent,
ref_root, ref_generation,
- owner_objectid);
+ owner_objectid, refs_to_add);
BUG_ON(ret);
- finish_current_insert(trans, root->fs_info->extent_root, 0);
- del_pending_extents(trans, root->fs_info->extent_root, 0);
-
btrfs_free_path(path);
return 0;
}
@@ -1339,68 +820,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
return 0;
- ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
+
+ ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
0, ref_root, 0, ref_generation,
owner_objectid);
return ret;
}
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int drop_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node)
+{
+ int ret = 0;
+ struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
+
+ BUG_ON(node->ref_mod == 0);
+ ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
+ node->parent, ref->root, ref->generation,
+ ref->owner_objectid, ref->pin, node->ref_mod);
+
+ return ret;
+}
+
+/* helper function to actually process a single delayed ref entry */
+static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node,
+ int insert_reserved)
{
- u64 start;
- u64 end;
int ret;
+ struct btrfs_delayed_ref *ref;
- while(1) {
- finish_current_insert(trans, root->fs_info->extent_root, 1);
- del_pending_extents(trans, root->fs_info->extent_root, 1);
+ if (node->parent == (u64)-1) {
+ struct btrfs_delayed_ref_head *head;
+ /*
+ * we've hit the end of the chain and we were supposed
+ * to insert this extent into the tree. But, it got
+ * deleted before we ever needed to insert it, so all
+ * we have to do is clean up the accounting
+ */
+ if (insert_reserved) {
+ update_reserved_extents(root, node->bytenr,
+ node->num_bytes, 0);
+ }
+ head = btrfs_delayed_node_to_head(node);
+ mutex_unlock(&head->mutex);
+ return 0;
+ }
- /* is there more work to do? */
- ret = find_first_extent_bit(&root->fs_info->pending_del,
- 0, &start, &end, EXTENT_WRITEBACK);
- if (!ret)
- continue;
- ret = find_first_extent_bit(&root->fs_info->extent_ins,
- 0, &start, &end, EXTENT_WRITEBACK);
- if (!ret)
- continue;
- break;
+ ref = btrfs_delayed_node_to_ref(node);
+ if (ref->action == BTRFS_ADD_DELAYED_REF) {
+ if (insert_reserved) {
+ struct btrfs_key ins;
+
+ ins.objectid = node->bytenr;
+ ins.offset = node->num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+ /* record the full extent allocation */
+ ret = __btrfs_alloc_reserved_extent(trans, root,
+ node->parent, ref->root,
+ ref->generation, ref->owner_objectid,
+ &ins, node->ref_mod);
+ update_reserved_extents(root, node->bytenr,
+ node->num_bytes, 0);
+ } else {
+ /* just add one backref */
+ ret = add_extent_ref(trans, root, node->bytenr,
+ node->num_bytes,
+ node->parent, ref->root, ref->generation,
+ ref->owner_objectid, node->ref_mod);
+ }
+ BUG_ON(ret);
+ } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+ WARN_ON(insert_reserved);
+ ret = drop_delayed_ref(trans, root, node);
}
return 0;
}
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u32 *refs)
+static noinline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
- struct btrfs_path *path;
+ struct rb_node *node;
+ struct btrfs_delayed_ref_node *ref;
+ int action = BTRFS_ADD_DELAYED_REF;
+again:
+ /*
+ * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+ * this prevents ref count from going down to zero when
+ * there still are pending delayed ref.
+ */
+ node = rb_prev(&head->node.rb_node);
+ while (1) {
+ if (!node)
+ break;
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (ref->bytenr != head->node.bytenr)
+ break;
+ if (btrfs_delayed_node_to_ref(ref)->action == action)
+ return ref;
+ node = rb_prev(node);
+ }
+ if (action == BTRFS_ADD_DELAYED_REF) {
+ action = BTRFS_DROP_DELAYED_REF;
+ goto again;
+ }
+ return NULL;
+}
+
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct list_head *cluster)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
int ret;
- struct btrfs_key key;
- struct extent_buffer *l;
- struct btrfs_extent_item *item;
+ int count = 0;
+ int must_insert_reserved = 0;
- WARN_ON(num_bytes < root->sectorsize);
- path = btrfs_alloc_path();
- path->reada = 1;
- key.objectid = bytenr;
- key.offset = num_bytes;
- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
- ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
- 0, 0);
- if (ret < 0)
- goto out;
- if (ret != 0) {
- btrfs_print_leaf(root, path->nodes[0]);
- printk(KERN_INFO "btrfs failed to find block number %llu\n",
- (unsigned long long)bytenr);
- BUG();
+ delayed_refs = &trans->transaction->delayed_refs;
+ while (1) {
+ if (!locked_ref) {
+ /* pick a new head ref from the cluster list */
+ if (list_empty(cluster))
+ break;
+
+ locked_ref = list_entry(cluster->next,
+ struct btrfs_delayed_ref_head, cluster);
+
+ /* grab the lock that says we are going to process
+ * all the refs for this head */
+ ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+ /*
+ * we may have dropped the spin lock to get the head
+ * mutex lock, and that might have given someone else
+ * time to free the head. If that's true, it has been
+ * removed from our list and we can move on.
+ */
+ if (ret == -EAGAIN) {
+ locked_ref = NULL;
+ count++;
+ continue;
+ }
+ }
+
+ /*
+ * record the must insert reserved flag before we
+ * drop the spin lock.
+ */
+ must_insert_reserved = locked_ref->must_insert_reserved;
+ locked_ref->must_insert_reserved = 0;
+
+ /*
+ * locked_ref is the head node, so we have to go one
+ * node back for any delayed ref updates
+ */
+ ref = select_delayed_ref(locked_ref);
+ if (!ref) {
+ /* All delayed refs have been processed, Go ahead
+ * and send the head node to run_one_delayed_ref,
+ * so that any accounting fixes can happen
+ */
+ ref = &locked_ref->node;
+ list_del_init(&locked_ref->cluster);
+ locked_ref = NULL;
+ }
+
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ delayed_refs->num_entries--;
+ spin_unlock(&delayed_refs->lock);
+
+ ret = run_one_delayed_ref(trans, root, ref,
+ must_insert_reserved);
+ BUG_ON(ret);
+ btrfs_put_delayed_ref(ref);
+
+ count++;
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+ return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far. count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct list_head cluster;
+ int ret;
+ int run_all = count == (unsigned long)-1;
+ int run_most = 0;
+
+ if (root == root->fs_info->extent_root)
+ root = root->fs_info->tree_root;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ INIT_LIST_HEAD(&cluster);
+again:
+ spin_lock(&delayed_refs->lock);
+ if (count == 0) {
+ count = delayed_refs->num_entries * 2;
+ run_most = 1;
+ }
+ while (1) {
+ if (!(run_all || run_most) &&
+ delayed_refs->num_heads_ready < 64)
+ break;
+
+ /*
+ * go find something we can process in the rbtree. We start at
+ * the beginning of the tree, and then build a cluster
+ * of refs to process starting at the first one we are able to
+ * lock
+ */
+ ret = btrfs_find_ref_cluster(trans, &cluster,
+ delayed_refs->run_delayed_start);
+ if (ret)
+ break;
+
+ ret = run_clustered_refs(trans, root, &cluster);
+ BUG_ON(ret < 0);
+
+ count -= min_t(unsigned long, ret, count);
+
+ if (count == 0)
+ break;
+ }
+
+ if (run_all) {
+ node = rb_first(&delayed_refs->root);
+ if (!node)
+ goto out;
+ count = (unsigned long)-1;
+
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ struct btrfs_delayed_ref_head *head;
+
+ head = btrfs_delayed_node_to_head(ref);
+ atomic_inc(&ref->refs);
+
+ spin_unlock(&delayed_refs->lock);
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+
+ btrfs_put_delayed_ref(ref);
+ cond_resched();
+ goto again;
+ }
+ node = rb_next(node);
+ }
+ spin_unlock(&delayed_refs->lock);
+ schedule_timeout(1);
+ goto again;
}
- l = path->nodes[0];
- item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
- *refs = btrfs_extent_refs(l, item);
out:
- btrfs_free_path(path);
+ spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -1624,7 +1315,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
int refi = 0;
int slot;
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
- u64, u64, u64, u64, u64, u64, u64, u64);
+ u64, u64, u64, u64, u64, u64, u64, u64, u64);
ref_root = btrfs_header_owner(buf);
ref_generation = btrfs_header_generation(buf);
@@ -1696,12 +1387,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, slot);
+ fi = btrfs_item_ptr(buf, slot,
+ struct btrfs_file_extent_item);
+
+ bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (bytenr == 0)
+ continue;
ret = process_func(trans, root, bytenr,
- orig_buf->start, buf->start,
- orig_root, ref_root,
- orig_generation, ref_generation,
- key.objectid);
+ btrfs_file_extent_disk_num_bytes(buf, fi),
+ orig_buf->start, buf->start,
+ orig_root, ref_root,
+ orig_generation, ref_generation,
+ key.objectid);
if (ret) {
faili = slot;
@@ -1709,7 +1407,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
goto fail;
}
} else {
- ret = process_func(trans, root, bytenr,
+ ret = process_func(trans, root, bytenr, buf->len,
orig_buf->start, buf->start,
orig_root, ref_root,
orig_generation, ref_generation,
@@ -1786,17 +1484,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
if (bytenr == 0)
continue;
ret = __btrfs_update_extent_ref(trans, root, bytenr,
- orig_buf->start, buf->start,
- orig_root, ref_root,
- orig_generation, ref_generation,
- key.objectid);
+ btrfs_file_extent_disk_num_bytes(buf, fi),
+ orig_buf->start, buf->start,
+ orig_root, ref_root, orig_generation,
+ ref_generation, key.objectid);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, slot);
ret = __btrfs_update_extent_ref(trans, root, bytenr,
- orig_buf->start, buf->start,
- orig_root, ref_root,
+ buf->len, orig_buf->start,
+ buf->start, orig_root, ref_root,
orig_generation, ref_generation,
level - 1);
if (ret)
@@ -1815,7 +1513,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *cache)
{
int ret;
- int pending_ret;
struct btrfs_root *extent_root = root->fs_info->extent_root;
unsigned long bi;
struct extent_buffer *leaf;
@@ -1831,12 +1528,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(extent_root, path);
fail:
- finish_current_insert(trans, extent_root, 0);
- pending_ret = del_pending_extents(trans, extent_root, 0);
if (ret)
return ret;
- if (pending_ret)
- return pending_ret;
return 0;
}
@@ -1900,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
if (!block_group || block_group->ro)
readonly = 1;
if (block_group)
- put_block_group(block_group);
+ btrfs_put_block_group(block_group);
return readonly;
}
@@ -2151,10 +1844,14 @@ again:
printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
", %llu bytes_used, %llu bytes_reserved, "
"%llu bytes_pinned, %llu bytes_readonly, %llu may use"
- "%llu total\n", bytes, data_sinfo->bytes_delalloc,
- data_sinfo->bytes_used, data_sinfo->bytes_reserved,
- data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
- data_sinfo->bytes_may_use, data_sinfo->total_bytes);
+ "%llu total\n", (unsigned long long)bytes,
+ (unsigned long long)data_sinfo->bytes_delalloc,
+ (unsigned long long)data_sinfo->bytes_used,
+ (unsigned long long)data_sinfo->bytes_reserved,
+ (unsigned long long)data_sinfo->bytes_pinned,
+ (unsigned long long)data_sinfo->bytes_readonly,
+ (unsigned long long)data_sinfo->bytes_may_use,
+ (unsigned long long)data_sinfo->total_bytes);
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
@@ -2225,15 +1922,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
spin_unlock(&info->lock);
}
+static void force_metadata_allocation(struct btrfs_fs_info *info)
+{
+ struct list_head *head = &info->space_info;
+ struct btrfs_space_info *found;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(found, head, list) {
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ found->force_alloc = 1;
+ }
+ rcu_read_unlock();
+}
+
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force)
{
struct btrfs_space_info *space_info;
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
u64 thresh;
int ret = 0;
- mutex_lock(&extent_root->fs_info->chunk_mutex);
+ mutex_lock(&fs_info->chunk_mutex);
flags = btrfs_reduce_alloc_profile(extent_root, flags);
@@ -2265,6 +1976,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
}
spin_unlock(&space_info->lock);
+ /*
+ * if we're doing a data chunk, go ahead and make sure that
+ * we keep a reasonable number of metadata chunks allocated in the
+ * FS as well.
+ */
+ if (flags & BTRFS_BLOCK_GROUP_DATA) {
+ fs_info->data_chunk_allocations++;
+ if (!(fs_info->data_chunk_allocations %
+ fs_info->metadata_ratio))
+ force_metadata_allocation(fs_info);
+ }
+
ret = btrfs_alloc_chunk(trans, extent_root, flags);
if (ret)
space_info->full = 1;
@@ -2324,7 +2047,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
WARN_ON(ret);
}
}
- put_block_group(cache);
+ btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
}
@@ -2341,7 +2064,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
return 0;
bytenr = cache->key.objectid;
- put_block_group(cache);
+ btrfs_put_block_group(cache);
return bytenr;
}
@@ -2353,7 +2076,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *fs_info = root->fs_info;
- WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
if (pin) {
set_extent_dirty(&fs_info->pinned_extents,
bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2361,6 +2083,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
clear_extent_dirty(&fs_info->pinned_extents,
bytenr, bytenr + num - 1, GFP_NOFS);
}
+
while (num > 0) {
cache = btrfs_lookup_block_group(fs_info, bytenr);
BUG_ON(!cache);
@@ -2385,7 +2108,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
if (cache->cached)
btrfs_add_free_space(cache, bytenr, len);
}
- put_block_group(cache);
+ btrfs_put_block_group(cache);
bytenr += len;
num -= len;
}
@@ -2416,7 +2139,7 @@ static int update_reserved_extents(struct btrfs_root *root,
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- put_block_group(cache);
+ btrfs_put_block_group(cache);
bytenr += len;
num -= len;
}
@@ -2431,7 +2154,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
int ret;
- mutex_lock(&root->fs_info->pinned_mutex);
while (1) {
ret = find_first_extent_bit(pinned_extents, last,
&start, &end, EXTENT_DIRTY);
@@ -2440,7 +2162,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
set_extent_dirty(copy, start, end, GFP_NOFS);
last = end + 1;
}
- mutex_unlock(&root->fs_info->pinned_mutex);
return 0;
}
@@ -2452,7 +2173,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
u64 end;
int ret;
- mutex_lock(&root->fs_info->pinned_mutex);
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY);
@@ -2461,209 +2181,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
ret = btrfs_discard_extent(root, start, end + 1 - start);
+ /* unlocks the pinned mutex */
btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
- if (need_resched()) {
- mutex_unlock(&root->fs_info->pinned_mutex);
- cond_resched();
- mutex_lock(&root->fs_info->pinned_mutex);
- }
+ cond_resched();
}
- mutex_unlock(&root->fs_info->pinned_mutex);
return ret;
}
-static int finish_current_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all)
-{
- u64 start;
- u64 end;
- u64 priv;
- u64 search = 0;
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_path *path;
- struct pending_extent_op *extent_op, *tmp;
- struct list_head insert_list, update_list;
- int ret;
- int num_inserts = 0, max_inserts, restart = 0;
-
- path = btrfs_alloc_path();
- INIT_LIST_HEAD(&insert_list);
- INIT_LIST_HEAD(&update_list);
-
- max_inserts = extent_root->leafsize /
- (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
- sizeof(struct btrfs_extent_ref) +
- sizeof(struct btrfs_extent_item));
-again:
- mutex_lock(&info->extent_ins_mutex);
- while (1) {
- ret = find_first_extent_bit(&info->extent_ins, search, &start,
- &end, EXTENT_WRITEBACK);
- if (ret) {
- if (restart && !num_inserts &&
- list_empty(&update_list)) {
- restart = 0;
- search = 0;
- continue;
- }
- break;
- }
-
- ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
- if (!ret) {
- if (all)
- restart = 1;
- search = end + 1;
- if (need_resched()) {
- mutex_unlock(&info->extent_ins_mutex);
- cond_resched();
- mutex_lock(&info->extent_ins_mutex);
- }
- continue;
- }
-
- ret = get_state_private(&info->extent_ins, start, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)(unsigned long) priv;
-
- if (extent_op->type == PENDING_EXTENT_INSERT) {
- num_inserts++;
- list_add_tail(&extent_op->list, &insert_list);
- search = end + 1;
- if (num_inserts == max_inserts) {
- restart = 1;
- break;
- }
- } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
- list_add_tail(&extent_op->list, &update_list);
- search = end + 1;
- } else {
- BUG();
- }
- }
-
- /*
- * process the update list, clear the writeback bit for it, and if
- * somebody marked this thing for deletion then just unlock it and be
- * done, the free_extents will handle it
- */
- list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
- clear_extent_bits(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- if (extent_op->del) {
- list_del_init(&extent_op->list);
- unlock_extent(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes
- - 1, GFP_NOFS);
- kfree(extent_op);
- }
- }
- mutex_unlock(&info->extent_ins_mutex);
-
- /*
- * still have things left on the update list, go ahead an update
- * everything
- */
- if (!list_empty(&update_list)) {
- ret = update_backrefs(trans, extent_root, path, &update_list);
- BUG_ON(ret);
-
- /* we may have COW'ed new blocks, so lets start over */
- if (all)
- restart = 1;
- }
-
- /*
- * if no inserts need to be done, but we skipped some extents and we
- * need to make sure everything is cleaned then reset everything and
- * go back to the beginning
- */
- if (!num_inserts && restart) {
- search = 0;
- restart = 0;
- INIT_LIST_HEAD(&update_list);
- INIT_LIST_HEAD(&insert_list);
- goto again;
- } else if (!num_inserts) {
- goto out;
- }
-
- /*
- * process the insert extents list. Again if we are deleting this
- * extent, then just unlock it, pin down the bytes if need be, and be
- * done with it. Saves us from having to actually insert the extent
- * into the tree and then subsequently come along and delete it
- */
- mutex_lock(&info->extent_ins_mutex);
- list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
- clear_extent_bits(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- if (extent_op->del) {
- u64 used;
- list_del_init(&extent_op->list);
- unlock_extent(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes
- - 1, GFP_NOFS);
-
- mutex_lock(&extent_root->fs_info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root,
- extent_op->bytenr,
- extent_op->num_bytes, 0);
- mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
- spin_lock(&info->delalloc_lock);
- used = btrfs_super_bytes_used(&info->super_copy);
- btrfs_set_super_bytes_used(&info->super_copy,
- used - extent_op->num_bytes);
- used = btrfs_root_used(&extent_root->root_item);
- btrfs_set_root_used(&extent_root->root_item,
- used - extent_op->num_bytes);
- spin_unlock(&info->delalloc_lock);
-
- ret = update_block_group(trans, extent_root,
- extent_op->bytenr,
- extent_op->num_bytes,
- 0, ret > 0);
- BUG_ON(ret);
- kfree(extent_op);
- num_inserts--;
- }
- }
- mutex_unlock(&info->extent_ins_mutex);
-
- ret = insert_extents(trans, extent_root, path, &insert_list,
- num_inserts);
- BUG_ON(ret);
-
- /*
- * if restart is set for whatever reason we need to go back and start
- * searching through the pending list again.
- *
- * We just inserted some extents, which could have resulted in new
- * blocks being allocated, which would result in new blocks needing
- * updates, so if all is set we _must_ restart to get the updated
- * blocks.
- */
- if (restart || all) {
- INIT_LIST_HEAD(&insert_list);
- INIT_LIST_HEAD(&update_list);
- search = 0;
- restart = 0;
- num_inserts = 0;
- goto again;
- }
-out:
- btrfs_free_path(path);
- return 0;
-}
-
static int pin_down_bytes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int is_data)
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes, int is_data,
+ struct extent_buffer **must_clean)
{
int err = 0;
struct extent_buffer *buf;
@@ -2686,17 +2217,18 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
u64 header_transid = btrfs_header_generation(buf);
if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
header_owner != BTRFS_TREE_RELOC_OBJECTID &&
+ header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
header_transid == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- clean_tree_block(NULL, root, buf);
- btrfs_tree_unlock(buf);
- free_extent_buffer(buf);
+ *must_clean = buf;
return 1;
}
btrfs_tree_unlock(buf);
}
free_extent_buffer(buf);
pinit:
+ btrfs_set_path_blocking(path);
+ /* unlocks the pinned mutex */
btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
BUG_ON(err < 0);
@@ -2710,7 +2242,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 ref_generation,
- u64 owner_objectid, int pin, int mark_free)
+ u64 owner_objectid, int pin, int mark_free,
+ int refs_to_drop)
{
struct btrfs_path *path;
struct btrfs_key key;
@@ -2732,6 +2265,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = 1;
+ path->leave_spinning = 1;
ret = lookup_extent_backref(trans, extent_root, path,
bytenr, parent, root_objectid,
ref_generation, owner_objectid, 1);
@@ -2753,9 +2287,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
break;
}
if (!found_extent) {
- ret = remove_extent_backref(trans, extent_root, path);
+ ret = remove_extent_backref(trans, extent_root, path,
+ refs_to_drop);
BUG_ON(ret);
btrfs_release_path(extent_root, path);
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, extent_root,
&key, path, -1, 1);
if (ret) {
@@ -2771,8 +2307,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
btrfs_print_leaf(extent_root, path->nodes[0]);
WARN_ON(1);
printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
- "root %llu gen %llu owner %llu\n",
+ "parent %llu root %llu gen %llu owner %llu\n",
(unsigned long long)bytenr,
+ (unsigned long long)parent,
(unsigned long long)root_objectid,
(unsigned long long)ref_generation,
(unsigned long long)owner_objectid);
@@ -2782,17 +2319,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
ei = btrfs_item_ptr(leaf, extent_slot,
struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
- BUG_ON(refs == 0);
- refs -= 1;
- btrfs_set_extent_refs(leaf, ei, refs);
+ /*
+ * we're not allowed to delete the extent item if there
+ * are other delayed ref updates pending
+ */
+
+ BUG_ON(refs < refs_to_drop);
+ refs -= refs_to_drop;
+ btrfs_set_extent_refs(leaf, ei, refs);
btrfs_mark_buffer_dirty(leaf);
- if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+ if (refs == 0 && found_extent &&
+ path->slots[0] == extent_slot + 1) {
struct btrfs_extent_ref *ref;
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_ref);
- BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
+ BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
/* if the back ref and the extent are next to each other
* they get deleted below in one shot
*/
@@ -2800,11 +2343,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
num_to_del = 2;
} else if (found_extent) {
/* otherwise delete the extent back ref */
- ret = remove_extent_backref(trans, extent_root, path);
+ ret = remove_extent_backref(trans, extent_root, path,
+ refs_to_drop);
BUG_ON(ret);
/* if refs are 0, we need to setup the path for deletion */
if (refs == 0) {
btrfs_release_path(extent_root, path);
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, extent_root, &key, path,
-1, 1);
BUG_ON(ret);
@@ -2814,16 +2359,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
if (refs == 0) {
u64 super_used;
u64 root_used;
+ struct extent_buffer *must_clean = NULL;
if (pin) {
- mutex_lock(&root->fs_info->pinned_mutex);
- ret = pin_down_bytes(trans, root, bytenr, num_bytes,
- owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
- mutex_unlock(&root->fs_info->pinned_mutex);
+ ret = pin_down_bytes(trans, root, path,
+ bytenr, num_bytes,
+ owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
+ &must_clean);
if (ret > 0)
mark_free = 1;
BUG_ON(ret < 0);
}
+
/* block accounting for super block */
spin_lock(&info->delalloc_lock);
super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2835,14 +2382,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item,
root_used - num_bytes);
spin_unlock(&info->delalloc_lock);
+
+ /*
+ * it is going to be very rare for someone to be waiting
+ * on the block we're freeing. del_items might need to
+ * schedule, so rather than get fancy, just force it
+ * to blocking here
+ */
+ if (must_clean)
+ btrfs_set_lock_blocking(must_clean);
+
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
BUG_ON(ret);
btrfs_release_path(extent_root, path);
+ if (must_clean) {
+ clean_tree_block(NULL, root, must_clean);
+ btrfs_tree_unlock(must_clean);
+ free_extent_buffer(must_clean);
+ }
+
if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
BUG_ON(ret);
+ } else {
+ invalidate_mapping_pages(info->btree_inode->i_mapping,
+ bytenr >> PAGE_CACHE_SHIFT,
+ (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
}
ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2850,218 +2417,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
btrfs_free_path(path);
- finish_current_insert(trans, extent_root, 0);
return ret;
}
/*
- * find all the blocks marked as pending in the radix tree and remove
- * them from the extent map
+ * remove an extent from the root, returns 0 on success
*/
-static int del_pending_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all)
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid, int pin,
+ int refs_to_drop)
{
- int ret;
- int err = 0;
- u64 start;
- u64 end;
- u64 priv;
- u64 search = 0;
- int nr = 0, skipped = 0;
- struct extent_io_tree *pending_del;
- struct extent_io_tree *extent_ins;
- struct pending_extent_op *extent_op;
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct list_head delete_list;
-
- INIT_LIST_HEAD(&delete_list);
- extent_ins = &extent_root->fs_info->extent_ins;
- pending_del = &extent_root->fs_info->pending_del;
-
-again:
- mutex_lock(&info->extent_ins_mutex);
- while (1) {
- ret = find_first_extent_bit(pending_del, search, &start, &end,
- EXTENT_WRITEBACK);
- if (ret) {
- if (all && skipped && !nr) {
- search = 0;
- skipped = 0;
- continue;
- }
- mutex_unlock(&info->extent_ins_mutex);
- break;
- }
-
- ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
- if (!ret) {
- search = end+1;
- skipped = 1;
-
- if (need_resched()) {
- mutex_unlock(&info->extent_ins_mutex);
- cond_resched();
- mutex_lock(&info->extent_ins_mutex);
- }
-
- continue;
- }
- BUG_ON(ret < 0);
-
- ret = get_state_private(pending_del, start, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)(unsigned long)priv;
-
- clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
- GFP_NOFS);
- if (!test_range_bit(extent_ins, start, end,
- EXTENT_WRITEBACK, 0)) {
- list_add_tail(&extent_op->list, &delete_list);
- nr++;
- } else {
- kfree(extent_op);
-
- ret = get_state_private(&info->extent_ins, start,
- &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)
- (unsigned long)priv;
-
- clear_extent_bits(&info->extent_ins, start, end,
- EXTENT_WRITEBACK, GFP_NOFS);
-
- if (extent_op->type == PENDING_BACKREF_UPDATE) {
- list_add_tail(&extent_op->list, &delete_list);
- search = end + 1;
- nr++;
- continue;
- }
-
- mutex_lock(&extent_root->fs_info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root, start,
- end + 1 - start, 0);
- mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
- ret = update_block_group(trans, extent_root, start,
- end + 1 - start, 0, ret > 0);
-
- unlock_extent(extent_ins, start, end, GFP_NOFS);
- BUG_ON(ret);
- kfree(extent_op);
- }
- if (ret)
- err = ret;
-
- search = end + 1;
-
- if (need_resched()) {
- mutex_unlock(&info->extent_ins_mutex);
- cond_resched();
- mutex_lock(&info->extent_ins_mutex);
- }
- }
+ WARN_ON(num_bytes < root->sectorsize);
- if (nr) {
- ret = free_extents(trans, extent_root, &delete_list);
- BUG_ON(ret);
- }
+ /*
+ * if metadata always pin
+ * if data pin when any transaction has committed this
+ */
+ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ ref_generation != trans->transid)
+ pin = 1;
- if (all && skipped) {
- INIT_LIST_HEAD(&delete_list);
- search = 0;
- nr = 0;
- goto again;
- }
+ if (ref_generation != trans->transid)
+ pin = 1;
- if (!err)
- finish_current_insert(trans, extent_root, 0);
- return err;
+ return __free_extent(trans, root, bytenr, num_bytes, parent,
+ root_objectid, ref_generation,
+ owner_objectid, pin, pin == 0, refs_to_drop);
}
/*
- * remove an extent from the root, returns 0 on success
+ * when we free an extent, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well. This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
*/
-static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 ref_generation,
- u64 owner_objectid, int pin)
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr)
{
- struct btrfs_root *extent_root = root->fs_info->extent_root;
- int pending_ret;
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct rb_node *node;
int ret;
- WARN_ON(num_bytes < root->sectorsize);
- if (root == extent_root) {
- struct pending_extent_op *extent_op = NULL;
-
- mutex_lock(&root->fs_info->extent_ins_mutex);
- if (test_range_bit(&root->fs_info->extent_ins, bytenr,
- bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
- u64 priv;
- ret = get_state_private(&root->fs_info->extent_ins,
- bytenr, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)
- (unsigned long)priv;
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (!head)
+ goto out;
- extent_op->del = 1;
- if (extent_op->type == PENDING_EXTENT_INSERT) {
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- return 0;
- }
- }
+ node = rb_prev(&head->node.rb_node);
+ if (!node)
+ goto out;
- if (extent_op) {
- ref_generation = extent_op->orig_generation;
- parent = extent_op->orig_parent;
- }
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
-
- extent_op->type = PENDING_EXTENT_DELETE;
- extent_op->bytenr = bytenr;
- extent_op->num_bytes = num_bytes;
- extent_op->parent = parent;
- extent_op->orig_parent = parent;
- extent_op->generation = ref_generation;
- extent_op->orig_generation = ref_generation;
- extent_op->level = (int)owner_objectid;
- INIT_LIST_HEAD(&extent_op->list);
- extent_op->del = 0;
-
- set_extent_bits(&root->fs_info->pending_del,
- bytenr, bytenr + num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- set_state_private(&root->fs_info->pending_del,
- bytenr, (unsigned long)extent_op);
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- return 0;
- }
- /* if metadata always pin */
- if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
- if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
- mutex_lock(&root->fs_info->pinned_mutex);
- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
- mutex_unlock(&root->fs_info->pinned_mutex);
- update_reserved_extents(root, bytenr, num_bytes, 0);
- return 0;
- }
- pin = 1;
- }
+ /* there are still entries for this ref, we can't drop it */
+ if (ref->bytenr == bytenr)
+ goto out;
- /* if data pin when any transaction has committed this */
- if (ref_generation != trans->transid)
- pin = 1;
+ /*
+ * waiting for the lock here would deadlock. If someone else has it
+ * locked they are already in the process of dropping it anyway
+ */
+ if (!mutex_trylock(&head->mutex))
+ goto out;
- ret = __free_extent(trans, root, bytenr, num_bytes, parent,
- root_objectid, ref_generation,
- owner_objectid, pin, pin == 0);
+ /*
+ * at this point we have a head with no other entries. Go
+ * ahead and process it.
+ */
+ head->node.in_tree = 0;
+ rb_erase(&head->node.rb_node, &delayed_refs->root);
+
+ delayed_refs->num_entries--;
- finish_current_insert(trans, root->fs_info->extent_root, 0);
- pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
- return ret ? ret : pending_ret;
+ /*
+ * we don't take a ref on the node because we're removing it from the
+ * tree, so we just steal the ref the tree was holding.
+ */
+ delayed_refs->num_heads--;
+ if (list_empty(&head->cluster))
+ delayed_refs->num_heads_ready--;
+
+ list_del_init(&head->cluster);
+ spin_unlock(&delayed_refs->lock);
+
+ ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+ &head->node, head->must_insert_reserved);
+ BUG_ON(ret);
+ btrfs_put_delayed_ref(&head->node);
+ return 0;
+out:
+ spin_unlock(&delayed_refs->lock);
+ return 0;
}
int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -3072,9 +2524,28 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
{
int ret;
- ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
- root_objectid, ref_generation,
- owner_objectid, pin);
+ /*
+ * tree log blocks never actually go into the extent allocation
+ * tree, just update pinning info and exit early.
+ *
+ * data extents referenced by the tree log do need to have
+ * their reference counts bumped.
+ */
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
+ owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+ /* unlocks the pinned mutex */
+ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+ update_reserved_extents(root, bytenr, num_bytes, 0);
+ ret = 0;
+ } else {
+ ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
+ root_objectid, ref_generation,
+ owner_objectid,
+ BTRFS_DROP_DELAYED_REF, 1);
+ BUG_ON(ret);
+ ret = check_ref_cleanup(trans, root, bytenr);
+ BUG_ON(ret);
+ }
return ret;
}
@@ -3103,228 +2574,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
- u64 total_needed = num_bytes;
- u64 *last_ptr = NULL;
- u64 last_wanted = 0;
+ struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
- int chunk_alloc_done = 0;
int empty_cluster = 2 * 1024 * 1024;
int allowed_chunk_alloc = 0;
- struct list_head *head = NULL, *cur = NULL;
- int loop = 0;
- int extra_loop = 0;
struct btrfs_space_info *space_info;
+ int last_ptr_loop = 0;
+ int loop = 0;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
ins->objectid = 0;
ins->offset = 0;
+ space_info = __find_space_info(root->fs_info, data);
+
if (orig_root->ref_cows || empty_size)
allowed_chunk_alloc = 1;
if (data & BTRFS_BLOCK_GROUP_METADATA) {
- last_ptr = &root->fs_info->last_alloc;
+ last_ptr = &root->fs_info->meta_alloc_cluster;
if (!btrfs_test_opt(root, SSD))
empty_cluster = 64 * 1024;
}
- if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
- last_ptr = &root->fs_info->last_data_alloc;
+ if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+ last_ptr = &root->fs_info->data_alloc_cluster;
+ }
if (last_ptr) {
- if (*last_ptr) {
- hint_byte = *last_ptr;
- last_wanted = *last_ptr;
- } else
- empty_size += empty_cluster;
- } else {
- empty_cluster = 0;
+ spin_lock(&last_ptr->lock);
+ if (last_ptr->block_group)
+ hint_byte = last_ptr->window_start;
+ spin_unlock(&last_ptr->lock);
}
+
search_start = max(search_start, first_logical_byte(root, 0));
search_start = max(search_start, hint_byte);
- if (last_wanted && search_start != last_wanted) {
- last_wanted = 0;
- empty_size += empty_cluster;
+ if (!last_ptr) {
+ empty_cluster = 0;
+ loop = 1;
}
- total_needed += empty_size;
- block_group = btrfs_lookup_block_group(root->fs_info, search_start);
- if (!block_group)
- block_group = btrfs_lookup_first_block_group(root->fs_info,
- search_start);
- space_info = __find_space_info(root->fs_info, data);
+ if (search_start == hint_byte) {
+ block_group = btrfs_lookup_block_group(root->fs_info,
+ search_start);
+ if (block_group && block_group_bits(block_group, data)) {
+ down_read(&space_info->groups_sem);
+ goto have_block_group;
+ } else if (block_group) {
+ btrfs_put_block_group(block_group);
+ }
+ }
+search:
down_read(&space_info->groups_sem);
- while (1) {
- struct btrfs_free_space *free_space;
- /*
- * the only way this happens if our hint points to a block
- * group thats not of the proper type, while looping this
- * should never happen
- */
- if (empty_size)
- extra_loop = 1;
+ list_for_each_entry(block_group, &space_info->block_groups, list) {
+ u64 offset;
- if (!block_group)
- goto new_group_no_lock;
+ atomic_inc(&block_group->count);
+ search_start = block_group->key.objectid;
+have_block_group:
if (unlikely(!block_group->cached)) {
mutex_lock(&block_group->cache_mutex);
ret = cache_block_group(root, block_group);
mutex_unlock(&block_group->cache_mutex);
- if (ret)
+ if (ret) {
+ btrfs_put_block_group(block_group);
break;
+ }
}
- mutex_lock(&block_group->alloc_mutex);
- if (unlikely(!block_group_bits(block_group, data)))
- goto new_group;
-
if (unlikely(block_group->ro))
- goto new_group;
+ goto loop;
- free_space = btrfs_find_free_space(block_group, search_start,
- total_needed);
- if (free_space) {
- u64 start = block_group->key.objectid;
- u64 end = block_group->key.objectid +
- block_group->key.offset;
+ if (last_ptr) {
+ /*
+ * the refill lock keeps out other
+ * people trying to start a new cluster
+ */
+ spin_lock(&last_ptr->refill_lock);
+ offset = btrfs_alloc_from_cluster(block_group, last_ptr,
+ num_bytes, search_start);
+ if (offset) {
+ /* we have a block, we're done */
+ spin_unlock(&last_ptr->refill_lock);
+ goto checks;
+ }
- search_start = stripe_align(root, free_space->offset);
+ spin_lock(&last_ptr->lock);
+ /*
+ * whoops, this cluster doesn't actually point to
+ * this block group. Get a ref on the block
+ * group is does point to and try again
+ */
+ if (!last_ptr_loop && last_ptr->block_group &&
+ last_ptr->block_group != block_group) {
+
+ btrfs_put_block_group(block_group);
+ block_group = last_ptr->block_group;
+ atomic_inc(&block_group->count);
+ spin_unlock(&last_ptr->lock);
+ spin_unlock(&last_ptr->refill_lock);
+
+ last_ptr_loop = 1;
+ search_start = block_group->key.objectid;
+ goto have_block_group;
+ }
+ spin_unlock(&last_ptr->lock);
- /* move on to the next group */
- if (search_start + num_bytes >= search_end)
- goto new_group;
+ /*
+ * this cluster didn't work out, free it and
+ * start over
+ */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
- /* move on to the next group */
- if (search_start + num_bytes > end)
- goto new_group;
+ last_ptr_loop = 0;
- if (last_wanted && search_start != last_wanted) {
- total_needed += empty_cluster;
- empty_size += empty_cluster;
- last_wanted = 0;
+ /* allocate a cluster in this block group */
+ ret = btrfs_find_space_cluster(trans,
+ block_group, last_ptr,
+ offset, num_bytes,
+ empty_cluster + empty_size);
+ if (ret == 0) {
/*
- * if search_start is still in this block group
- * then we just re-search this block group
+ * now pull our allocation out of this
+ * cluster
*/
- if (search_start >= start &&
- search_start < end) {
- mutex_unlock(&block_group->alloc_mutex);
- continue;
+ offset = btrfs_alloc_from_cluster(block_group,
+ last_ptr, num_bytes,
+ search_start);
+ if (offset) {
+ /* we found one, proceed */
+ spin_unlock(&last_ptr->refill_lock);
+ goto checks;
}
-
- /* else we go to the next block group */
- goto new_group;
}
-
- if (exclude_nr > 0 &&
- (search_start + num_bytes > exclude_start &&
- search_start < exclude_start + exclude_nr)) {
- search_start = exclude_start + exclude_nr;
- /*
- * if search_start is still in this block group
- * then we just re-search this block group
- */
- if (search_start >= start &&
- search_start < end) {
- mutex_unlock(&block_group->alloc_mutex);
- last_wanted = 0;
- continue;
- }
-
- /* else we go to the next block group */
- goto new_group;
+ /*
+ * at this point we either didn't find a cluster
+ * or we weren't able to allocate a block from our
+ * cluster. Free the cluster we've been trying
+ * to use, and go to the next block group
+ */
+ if (loop < 2) {
+ btrfs_return_cluster_to_free_space(NULL,
+ last_ptr);
+ spin_unlock(&last_ptr->refill_lock);
+ goto loop;
}
+ spin_unlock(&last_ptr->refill_lock);
+ }
- ins->objectid = search_start;
- ins->offset = num_bytes;
+ offset = btrfs_find_space_for_alloc(block_group, search_start,
+ num_bytes, empty_size);
+ if (!offset)
+ goto loop;
+checks:
+ search_start = stripe_align(root, offset);
- btrfs_remove_free_space_lock(block_group, search_start,
- num_bytes);
- /* we are all good, lets return */
- mutex_unlock(&block_group->alloc_mutex);
- break;
+ /* move on to the next group */
+ if (search_start + num_bytes >= search_end) {
+ btrfs_add_free_space(block_group, offset, num_bytes);
+ goto loop;
}
-new_group:
- mutex_unlock(&block_group->alloc_mutex);
- put_block_group(block_group);
- block_group = NULL;
-new_group_no_lock:
- /* don't try to compare new allocations against the
- * last allocation any more
- */
- last_wanted = 0;
- /*
- * Here's how this works.
- * loop == 0: we were searching a block group via a hint
- * and didn't find anything, so we start at
- * the head of the block groups and keep searching
- * loop == 1: we're searching through all of the block groups
- * if we hit the head again we have searched
- * all of the block groups for this space and we
- * need to try and allocate, if we cant error out.
- * loop == 2: we allocated more space and are looping through
- * all of the block groups again.
- */
- if (loop == 0) {
- head = &space_info->block_groups;
- cur = head->next;
- loop++;
- } else if (loop == 1 && cur == head) {
- int keep_going;
-
- /* at this point we give up on the empty_size
- * allocations and just try to allocate the min
- * space.
- *
- * The extra_loop field was set if an empty_size
- * allocation was attempted above, and if this
- * is try we need to try the loop again without
- * the additional empty_size.
+ /* move on to the next group */
+ if (search_start + num_bytes >
+ block_group->key.objectid + block_group->key.offset) {
+ btrfs_add_free_space(block_group, offset, num_bytes);
+ goto loop;
+ }
+
+ if (exclude_nr > 0 &&
+ (search_start + num_bytes > exclude_start &&
+ search_start < exclude_start + exclude_nr)) {
+ search_start = exclude_start + exclude_nr;
+
+ btrfs_add_free_space(block_group, offset, num_bytes);
+ /*
+ * if search_start is still in this block group
+ * then we just re-search this block group
*/
- total_needed -= empty_size;
- empty_size = 0;
- keep_going = extra_loop;
- loop++;
+ if (search_start >= block_group->key.objectid &&
+ search_start < (block_group->key.objectid +
+ block_group->key.offset))
+ goto have_block_group;
+ goto loop;
+ }
- if (allowed_chunk_alloc && !chunk_alloc_done) {
- up_read(&space_info->groups_sem);
- ret = do_chunk_alloc(trans, root, num_bytes +
- 2 * 1024 * 1024, data, 1);
- down_read(&space_info->groups_sem);
- if (ret < 0)
- goto loop_check;
- head = &space_info->block_groups;
- /*
- * we've allocated a new chunk, keep
- * trying
- */
- keep_going = 1;
- chunk_alloc_done = 1;
- } else if (!allowed_chunk_alloc) {
- space_info->force_alloc = 1;
- }
-loop_check:
- if (keep_going) {
- cur = head->next;
- extra_loop = 0;
- } else {
- break;
- }
- } else if (cur == head) {
- break;
+ ins->objectid = search_start;
+ ins->offset = num_bytes;
+
+ if (offset < search_start)
+ btrfs_add_free_space(block_group, offset,
+ search_start - offset);
+ BUG_ON(offset > search_start);
+
+ /* we are all good, lets return */
+ break;
+loop:
+ btrfs_put_block_group(block_group);
+ }
+ up_read(&space_info->groups_sem);
+
+ /* loop == 0, try to find a clustered alloc in every block group
+ * loop == 1, try again after forcing a chunk allocation
+ * loop == 2, set empty_size and empty_cluster to 0 and try again
+ */
+ if (!ins->objectid && loop < 3 &&
+ (empty_size || empty_cluster || allowed_chunk_alloc)) {
+ if (loop >= 2) {
+ empty_size = 0;
+ empty_cluster = 0;
}
- block_group = list_entry(cur, struct btrfs_block_group_cache,
- list);
- atomic_inc(&block_group->count);
+ if (allowed_chunk_alloc) {
+ ret = do_chunk_alloc(trans, root, num_bytes +
+ 2 * 1024 * 1024, data, 1);
+ allowed_chunk_alloc = 0;
+ } else {
+ space_info->force_alloc = 1;
+ }
- search_start = block_group->key.objectid;
- cur = cur->next;
+ if (loop < 3) {
+ loop++;
+ goto search;
+ }
+ ret = -ENOSPC;
+ } else if (!ins->objectid) {
+ ret = -ENOSPC;
}
/* we found what we needed */
@@ -3332,21 +2812,10 @@ loop_check:
if (!(data & BTRFS_BLOCK_GROUP_DATA))
trans->block_group = block_group->key.objectid;
- if (last_ptr)
- *last_ptr = ins->objectid + ins->offset;
+ btrfs_put_block_group(block_group);
ret = 0;
- } else if (!ret) {
- printk(KERN_ERR "btrfs searching for %llu bytes, "
- "num_bytes %llu, loop %d, allowed_alloc %d\n",
- (unsigned long long)total_needed,
- (unsigned long long)num_bytes,
- loop, allowed_chunk_alloc);
- ret = -ENOSPC;
}
- if (block_group)
- put_block_group(block_group);
- up_read(&space_info->groups_sem);
return ret;
}
@@ -3359,9 +2828,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
info->bytes_pinned - info->bytes_reserved),
(info->full) ? "" : "not ");
printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
- " may_use=%llu, used=%llu\n", info->total_bytes,
- info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
- info->bytes_used);
+ " may_use=%llu, used=%llu\n",
+ (unsigned long long)info->total_bytes,
+ (unsigned long long)info->bytes_pinned,
+ (unsigned long long)info->bytes_delalloc,
+ (unsigned long long)info->bytes_may_use,
+ (unsigned long long)info->bytes_used);
down_read(&info->groups_sem);
list_for_each_entry(cache, &info->block_groups, list) {
@@ -3451,7 +2923,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
ret = btrfs_discard_extent(root, start, len);
btrfs_add_free_space(cache, start, len);
- put_block_group(cache);
+ btrfs_put_block_group(cache);
update_reserved_extents(root, start, len, 0);
return ret;
@@ -3475,10 +2947,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 parent,
u64 root_objectid, u64 ref_generation,
- u64 owner, struct btrfs_key *ins)
+ u64 owner, struct btrfs_key *ins,
+ int ref_mod)
{
int ret;
- int pending_ret;
u64 super_used;
u64 root_used;
u64 num_bytes = ins->offset;
@@ -3503,33 +2975,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item, root_used + num_bytes);
spin_unlock(&info->delalloc_lock);
- if (root == extent_root) {
- struct pending_extent_op *extent_op;
-
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
-
- extent_op->type = PENDING_EXTENT_INSERT;
- extent_op->bytenr = ins->objectid;
- extent_op->num_bytes = ins->offset;
- extent_op->parent = parent;
- extent_op->orig_parent = 0;
- extent_op->generation = ref_generation;
- extent_op->orig_generation = 0;
- extent_op->level = (int)owner;
- INIT_LIST_HEAD(&extent_op->list);
- extent_op->del = 0;
-
- mutex_lock(&root->fs_info->extent_ins_mutex);
- set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
- ins->objectid + ins->offset - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- set_state_private(&root->fs_info->extent_ins,
- ins->objectid, (unsigned long)extent_op);
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- goto update_block;
- }
-
memcpy(&keys[0], ins, sizeof(*ins));
keys[1].objectid = ins->objectid;
keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3540,37 +2985,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
sizes, 2);
BUG_ON(ret);
extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
- btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+ btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
struct btrfs_extent_ref);
btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
btrfs_set_ref_objectid(path->nodes[0], ref, owner);
- btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
+ btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
btrfs_mark_buffer_dirty(path->nodes[0]);
trans->alloc_exclude_start = 0;
trans->alloc_exclude_nr = 0;
btrfs_free_path(path);
- finish_current_insert(trans, extent_root, 0);
- pending_ret = del_pending_extents(trans, extent_root, 0);
if (ret)
goto out;
- if (pending_ret) {
- ret = pending_ret;
- goto out;
- }
-update_block:
ret = update_block_group(trans, root, ins->objectid,
ins->offset, 1, 0);
if (ret) {
@@ -3592,9 +3031,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
return 0;
- ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
- ref_generation, owner, ins);
- update_reserved_extents(root, ins->objectid, ins->offset, 0);
+
+ ret = btrfs_add_delayed_ref(trans, ins->objectid,
+ ins->offset, parent, root_objectid,
+ ref_generation, owner,
+ BTRFS_ADD_DELAYED_EXTENT, 0);
+ BUG_ON(ret);
return ret;
}
@@ -3619,9 +3061,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
ret = btrfs_remove_free_space(block_group, ins->objectid,
ins->offset);
BUG_ON(ret);
- put_block_group(block_group);
+ btrfs_put_block_group(block_group);
ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
- ref_generation, owner, ins);
+ ref_generation, owner, ins, 1);
return ret;
}
@@ -3640,20 +3082,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
u64 search_end, struct btrfs_key *ins, u64 data)
{
int ret;
-
ret = __btrfs_reserve_extent(trans, root, num_bytes,
min_alloc_size, empty_size, hint_byte,
search_end, ins, data);
BUG_ON(ret);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
- ret = __btrfs_alloc_reserved_extent(trans, root, parent,
- root_objectid, ref_generation,
- owner_objectid, ins);
+ ret = btrfs_add_delayed_ref(trans, ins->objectid,
+ ins->offset, parent, root_objectid,
+ ref_generation, owner_objectid,
+ BTRFS_ADD_DELAYED_EXTENT, 0);
BUG_ON(ret);
-
- } else {
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
}
+ update_reserved_extents(root, ins->objectid, ins->offset, 1);
return ret;
}
@@ -3789,7 +3229,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- ret = __btrfs_free_extent(trans, root, disk_bytenr,
+ ret = btrfs_free_extent(trans, root, disk_bytenr,
btrfs_file_extent_disk_num_bytes(leaf, fi),
leaf->start, leaf_owner, leaf_generation,
key.objectid, 0);
@@ -3829,7 +3269,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
*/
for (i = 0; i < ref->nritems; i++) {
info = ref->extents + sorted[i].slot;
- ret = __btrfs_free_extent(trans, root, info->bytenr,
+ ret = btrfs_free_extent(trans, root, info->bytenr,
info->num_bytes, ref->bytenr,
ref->owner, ref->generation,
info->objectid, 0);
@@ -3846,12 +3286,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
return 0;
}
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 start,
u64 len, u32 *refs)
{
int ret;
- ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
+ ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
BUG_ON(ret);
#if 0 /* some debugging code in case we see problems here */
@@ -3959,7 +3400,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
* we just decrement it below and don't update any
* of the refs the leaf points to.
*/
- ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+ ret = drop_snap_lookup_refcount(trans, root, bytenr,
+ blocksize, &refs);
BUG_ON(ret);
if (refs != 1)
continue;
@@ -4010,7 +3452,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
*/
for (i = 0; i < refi; i++) {
bytenr = sorted[i].bytenr;
- ret = __btrfs_free_extent(trans, root, bytenr,
+ ret = btrfs_free_extent(trans, root, bytenr,
blocksize, eb->start,
root_owner, root_gen, 0, 1);
BUG_ON(ret);
@@ -4053,7 +3495,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
- ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
+ ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
path->nodes[*level]->len, &refs);
BUG_ON(ret);
if (refs > 1)
@@ -4104,7 +3546,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
blocksize = btrfs_level_size(root, *level - 1);
- ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+ ret = drop_snap_lookup_refcount(trans, root, bytenr,
+ blocksize, &refs);
BUG_ON(ret);
/*
@@ -4119,7 +3562,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
root_gen = btrfs_header_generation(parent);
path->slots[*level]++;
- ret = __btrfs_free_extent(trans, root, bytenr,
+ ret = btrfs_free_extent(trans, root, bytenr,
blocksize, parent->start,
root_owner, root_gen,
*level - 1, 1);
@@ -4165,7 +3608,7 @@ out:
* cleanup and free the reference on the last node
* we processed
*/
- ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize,
parent->start, root_owner, root_gen,
*level, 1);
free_extent_buffer(path->nodes[*level]);
@@ -4354,6 +3797,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
struct btrfs_path *path;
int i;
int orig_level;
+ int update_count;
struct btrfs_root_item *root_item = &root->root_item;
WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4395,6 +3839,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
}
}
while (1) {
+ unsigned long update;
wret = walk_down_tree(trans, root, path, &level);
if (wret > 0)
break;
@@ -4407,12 +3852,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
break;
if (wret < 0)
ret = wret;
- if (trans->transaction->in_commit) {
+ if (trans->transaction->in_commit ||
+ trans->transaction->delayed_refs.flushing) {
ret = -EAGAIN;
break;
}
atomic_inc(&root->fs_info->throttle_gen);
wake_up(&root->fs_info->transaction_throttle);
+ for (update_count = 0; update_count < 16; update_count++) {
+ update = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (update)
+ btrfs_run_delayed_refs(trans, root, update);
+ else
+ break;
+ }
}
for (i = 0; i <= orig_level; i++) {
if (path->nodes[i]) {
@@ -5457,6 +4911,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
root->root_key.objectid,
trans->transid, key.objectid);
BUG_ON(ret);
+
ret = btrfs_free_extent(trans, root,
bytenr, num_bytes, leaf->start,
btrfs_header_owner(leaf),
@@ -5768,9 +5223,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
ref_path, NULL, NULL);
BUG_ON(ret);
- if (root == root->fs_info->extent_root)
- btrfs_extent_post_op(trans, root);
-
return 0;
}
@@ -6038,6 +5490,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_inode(trans, root, path, objectid);
if (ret)
goto out;
@@ -6208,6 +5661,9 @@ again:
btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
mutex_unlock(&root->fs_info->cleaner_mutex);
+ trans = btrfs_start_transaction(info->tree_root, 1);
+ btrfs_commit_transaction(trans, info->tree_root);
+
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -6294,7 +5750,7 @@ next:
WARN_ON(block_group->reserved > 0);
WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
spin_unlock(&block_group->lock);
- put_block_group(block_group);
+ btrfs_put_block_group(block_group);
ret = 0;
out:
btrfs_free_path(path);
@@ -6421,9 +5877,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
- mutex_init(&cache->alloc_mutex);
+ spin_lock_init(&cache->tree_lock);
mutex_init(&cache->cache_mutex);
INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
read_extent_buffer(leaf, &cache->item,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(cache->item));
@@ -6466,7 +5923,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
extent_root = root->fs_info->extent_root;
- root->fs_info->last_trans_new_blockgroup = trans->transid;
+ root->fs_info->last_trans_log_full_commit = trans->transid;
cache = kzalloc(sizeof(*cache), GFP_NOFS);
if (!cache)
@@ -6477,9 +5934,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
- mutex_init(&cache->alloc_mutex);
+ spin_lock_init(&cache->tree_lock);
mutex_init(&cache->cache_mutex);
INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
btrfs_set_block_group_used(&cache->item, bytes_used);
btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -6500,9 +5958,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
sizeof(cache->item));
BUG_ON(ret);
- finish_current_insert(trans, extent_root, 0);
- ret = del_pending_extents(trans, extent_root, 0);
- BUG_ON(ret);
set_avail_alloc_bits(extent_root->fs_info, type);
return 0;
@@ -6542,8 +5997,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->space_info->lock);
block_group->space_info->full = 0;
- put_block_group(block_group);
- put_block_group(block_group);
+ btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29e6069..fe9eb990e443 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,12 +17,6 @@
#include "ctree.h"
#include "btrfs_inode.h"
-/* temporary define until extent_map moves out of btrfs */
-struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
- unsigned long extra_flags,
- void (*ctor)(void *, struct kmem_cache *,
- unsigned long));
-
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -50,20 +44,23 @@ struct extent_page_data {
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
- int extent_locked;
+ unsigned int extent_locked:1;
+
+ /* tells the submit_bio code to use a WRITE_SYNC */
+ unsigned int sync_io:1;
};
int __init extent_io_init(void)
{
- extent_state_cache = btrfs_cache_create("extent_state",
- sizeof(struct extent_state), 0,
- NULL);
+ extent_state_cache = kmem_cache_create("extent_state",
+ sizeof(struct extent_state), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
- extent_buffer_cache = btrfs_cache_create("extent_buffers",
- sizeof(struct extent_buffer), 0,
- NULL);
+ extent_buffer_cache = kmem_cache_create("extent_buffers",
+ sizeof(struct extent_buffer), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
goto free_state_cache;
return 0;
@@ -1404,69 +1401,6 @@ out:
return total_bytes;
}
-#if 0
-/*
- * helper function to lock both pages and extents in the tree.
- * pages must be locked first.
- */
-static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
-{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
- struct page *page;
- int err;
-
- while (index <= end_index) {
- page = grab_cache_page(tree->mapping, index);
- if (!page) {
- err = -ENOMEM;
- goto failed;
- }
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- goto failed;
- }
- index++;
- }
- lock_extent(tree, start, end, GFP_NOFS);
- return 0;
-
-failed:
- /*
- * we failed above in getting the page at 'index', so we undo here
- * up to but not including the page at 'index'
- */
- end_index = index;
- index = start >> PAGE_CACHE_SHIFT;
- while (index < end_index) {
- page = find_get_page(tree->mapping, index);
- unlock_page(page);
- page_cache_release(page);
- index++;
- }
- return err;
-}
-
-/*
- * helper function to unlock both pages and extents in the tree.
- */
-static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
-{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
- struct page *page;
-
- while (index <= end_index) {
- page = find_get_page(tree->mapping, index);
- unlock_page(page);
- page_cache_release(page);
- index++;
- }
- unlock_extent(tree, start, end, GFP_NOFS);
- return 0;
-}
-#endif
-
/*
* set the private field for a given byte offset in the tree. If there isn't
* an extent_state there already, this does nothing.
@@ -2101,6 +2035,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
return ret;
}
+static noinline void update_nr_written(struct page *page,
+ struct writeback_control *wbc,
+ unsigned long nr_written)
+{
+ wbc->nr_to_write -= nr_written;
+ if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+ wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+ page->mapping->writeback_index = page->index + nr_written;
+}
+
/*
* the writepage semantics are similar to regular writepage. extent
* records are inserted to lock ranges in the tree, and as dirty areas
@@ -2136,8 +2080,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
u64 delalloc_end;
int page_started;
int compressed;
+ int write_flags;
unsigned long nr_written = 0;
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ write_flags = WRITE_SYNC_PLUG;
+ else
+ write_flags = WRITE;
+
WARN_ON(!PageLocked(page));
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
@@ -2164,6 +2114,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
delalloc_end = 0;
page_started = 0;
if (!epd->extent_locked) {
+ /*
+ * make sure the wbc mapping index is at least updated
+ * to this page.
+ */
+ update_nr_written(page, wbc, 0);
+
while (delalloc_end < page_end) {
nr_delalloc = find_lock_delalloc_range(inode, tree,
page,
@@ -2185,7 +2141,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
*/
if (page_started) {
ret = 0;
- goto update_nr_written;
+ /*
+ * we've unlocked the page, so we can't update
+ * the mapping's writeback index, just update
+ * nr_to_write.
+ */
+ wbc->nr_to_write -= nr_written;
+ goto done_unlocked;
}
}
lock_extent(tree, start, page_end, GFP_NOFS);
@@ -2198,13 +2160,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (ret == -EAGAIN) {
unlock_extent(tree, start, page_end, GFP_NOFS);
redirty_page_for_writepage(wbc, page);
+ update_nr_written(page, wbc, nr_written);
unlock_page(page);
ret = 0;
- goto update_nr_written;
+ goto done_unlocked;
}
}
- nr_written++;
+ /*
+ * we don't want to touch the inode after unlocking the page,
+ * so we update the mapping writeback index now
+ */
+ update_nr_written(page, wbc, nr_written + 1);
end = page_end;
if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
@@ -2314,9 +2281,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
(unsigned long long)end);
}
- ret = submit_extent_page(WRITE, tree, page, sector,
- iosize, pg_offset, bdev,
- &epd->bio, max_nr,
+ ret = submit_extent_page(write_flags, tree, page,
+ sector, iosize, pg_offset,
+ bdev, &epd->bio, max_nr,
end_bio_extent_writepage,
0, 0, 0);
if (ret)
@@ -2336,11 +2303,8 @@ done:
unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
unlock_page(page);
-update_nr_written:
- wbc->nr_to_write -= nr_written;
- if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
- wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
- page->mapping->writeback_index = page->index + nr_written;
+done_unlocked:
+
return 0;
}
@@ -2460,15 +2424,23 @@ retry:
return ret;
}
-static noinline void flush_write_bio(void *data)
+static void flush_epd_write_bio(struct extent_page_data *epd)
{
- struct extent_page_data *epd = data;
if (epd->bio) {
- submit_one_bio(WRITE, epd->bio, 0, 0);
+ if (epd->sync_io)
+ submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
+ else
+ submit_one_bio(WRITE, epd->bio, 0, 0);
epd->bio = NULL;
}
}
+static noinline void flush_write_bio(void *data)
+{
+ struct extent_page_data *epd = data;
+ flush_epd_write_bio(epd);
+}
+
int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent,
struct writeback_control *wbc)
@@ -2480,23 +2452,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
.tree = tree,
.get_extent = get_extent,
.extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
struct writeback_control wbc_writepages = {
.bdi = wbc->bdi,
- .sync_mode = WB_SYNC_NONE,
+ .sync_mode = wbc->sync_mode,
.older_than_this = NULL,
.nr_to_write = 64,
.range_start = page_offset(page) + PAGE_CACHE_SIZE,
.range_end = (loff_t)-1,
};
-
ret = __extent_writepage(page, wbc, &epd);
extent_write_cache_pages(tree, mapping, &wbc_writepages,
__extent_writepage, &epd, flush_write_bio);
- if (epd.bio)
- submit_one_bio(WRITE, epd.bio, 0, 0);
+ flush_epd_write_bio(&epd);
return ret;
}
@@ -2515,6 +2486,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
.tree = tree,
.get_extent = get_extent,
.extent_locked = 1,
+ .sync_io = mode == WB_SYNC_ALL,
};
struct writeback_control wbc_writepages = {
.bdi = inode->i_mapping->backing_dev_info,
@@ -2540,8 +2512,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
start += PAGE_CACHE_SIZE;
}
- if (epd.bio)
- submit_one_bio(WRITE, epd.bio, 0, 0);
+ flush_epd_write_bio(&epd);
return ret;
}
@@ -2556,13 +2527,13 @@ int extent_writepages(struct extent_io_tree *tree,
.tree = tree,
.get_extent = get_extent,
.extent_locked = 0,
+ .sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
ret = extent_write_cache_pages(tree, mapping, wbc,
__extent_writepage, &epd,
flush_write_bio);
- if (epd.bio)
- submit_one_bio(WRITE, epd.bio, 0, 0);
+ flush_epd_write_bio(&epd);
return ret;
}
@@ -2884,25 +2855,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
disko = 0;
flags = 0;
- switch (em->block_start) {
- case EXTENT_MAP_LAST_BYTE:
+ if (em->block_start == EXTENT_MAP_LAST_BYTE) {
end = 1;
flags |= FIEMAP_EXTENT_LAST;
- break;
- case EXTENT_MAP_HOLE:
+ } else if (em->block_start == EXTENT_MAP_HOLE) {
flags |= FIEMAP_EXTENT_UNWRITTEN;
- break;
- case EXTENT_MAP_INLINE:
+ } else if (em->block_start == EXTENT_MAP_INLINE) {
flags |= (FIEMAP_EXTENT_DATA_INLINE |
FIEMAP_EXTENT_NOT_ALIGNED);
- break;
- case EXTENT_MAP_DELALLOC:
+ } else if (em->block_start == EXTENT_MAP_DELALLOC) {
flags |= (FIEMAP_EXTENT_DELALLOC |
FIEMAP_EXTENT_UNKNOWN);
- break;
- default:
+ } else {
disko = em->block_start;
- break;
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
@@ -3124,20 +3089,15 @@ void free_extent_buffer(struct extent_buffer *eb)
int clear_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb)
{
- int set;
unsigned long i;
unsigned long num_pages;
struct page *page;
- u64 start = eb->start;
- u64 end = start + eb->len - 1;
-
- set = clear_extent_dirty(tree, start, end, GFP_NOFS);
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if (!set && !PageDirty(page))
+ if (!PageDirty(page))
continue;
lock_page(page);
@@ -3146,22 +3106,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
else
set_page_private(page, EXTENT_PAGE_PRIVATE);
- /*
- * if we're on the last page or the first page and the
- * block isn't aligned on a page boundary, do extra checks
- * to make sure we don't clean page that is partially dirty
- */
- if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
- ((i == num_pages - 1) &&
- ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
- start = (u64)page->index << PAGE_CACHE_SHIFT;
- end = start + PAGE_CACHE_SIZE - 1;
- if (test_range_bit(tree, start, end,
- EXTENT_DIRTY, 0)) {
- unlock_page(page);
- continue;
- }
- }
clear_page_dirty_for_io(page);
spin_lock_irq(&page->mapping->tree_lock);
if (!PageDirty(page)) {
@@ -3187,29 +3131,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
{
unsigned long i;
unsigned long num_pages;
+ int was_dirty = 0;
+ was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
- for (i = 0; i < num_pages; i++) {
- struct page *page = extent_buffer_page(eb, i);
- /* writepage may need to do something special for the
- * first page, we have to make sure page->private is
- * properly set. releasepage may drop page->private
- * on us if the page isn't already dirty.
- */
- lock_page(page);
- if (i == 0) {
- set_page_extent_head(page, eb->len);
- } else if (PagePrivate(page) &&
- page->private != EXTENT_PAGE_PRIVATE) {
- set_page_extent_mapped(page);
- }
+ for (i = 0; i < num_pages; i++)
__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
- set_extent_dirty(tree, page_offset(page),
- page_offset(page) + PAGE_CACHE_SIZE - 1,
- GFP_NOFS);
- unlock_page(page);
- }
- return 0;
+ return was_dirty;
}
int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3717,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
ret = 0;
goto out;
}
+ if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ ret = 0;
+ goto out;
+ }
/* at this point we can safely release the extent buffer */
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf6..5bc20abf3f3d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
/* these are bit numbers for test/set bit */
#define EXTENT_BUFFER_UPTODATE 0
#define EXTENT_BUFFER_BLOCKING 1
+#define EXTENT_BUFFER_DIRTY 2
/*
* page->private values. Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb);
int set_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb);
+int test_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
int set_extent_buffer_uptodate(struct extent_io_tree *tree,
struct extent_buffer *eb);
int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 50da69da20ce..30c9365861e6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -6,19 +6,14 @@
#include <linux/hardirq.h>
#include "extent_map.h"
-/* temporary define until extent_map moves out of btrfs */
-struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
- unsigned long extra_flags,
- void (*ctor)(void *, struct kmem_cache *,
- unsigned long));
static struct kmem_cache *extent_map_cache;
int __init extent_map_init(void)
{
- extent_map_cache = btrfs_cache_create("extent_map",
- sizeof(struct extent_map), 0,
- NULL);
+ extent_map_cache = kmem_cache_create("extent_map",
+ sizeof(struct extent_map), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!extent_map_cache)
return -ENOMEM;
return 0;
@@ -43,7 +38,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
tree->map.rb_node = NULL;
spin_lock_init(&tree->lock);
}
-EXPORT_SYMBOL(extent_map_tree_init);
/**
* alloc_extent_map - allocate new extent map structure
@@ -64,7 +58,6 @@ struct extent_map *alloc_extent_map(gfp_t mask)
atomic_set(&em->refs, 1);
return em;
}
-EXPORT_SYMBOL(alloc_extent_map);
/**
* free_extent_map - drop reference count of an extent_map
@@ -83,7 +76,6 @@ void free_extent_map(struct extent_map *em)
kmem_cache_free(extent_map_cache, em);
}
}
-EXPORT_SYMBOL(free_extent_map);
static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
struct rb_node *node)
@@ -234,7 +226,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
rb = tree_insert(&tree->map, em->start, &em->rb_node);
if (rb) {
ret = -EEXIST;
- free_extent_map(merge);
goto out;
}
atomic_inc(&em->refs);
@@ -265,7 +256,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
out:
return ret;
}
-EXPORT_SYMBOL(add_extent_mapping);
/* simple helper to do math around the end of an extent, handling wrap */
static u64 range_end(u64 start, u64 len)
@@ -327,7 +317,6 @@ found:
out:
return em;
}
-EXPORT_SYMBOL(lookup_extent_mapping);
/**
* remove_extent_mapping - removes an extent_map from the extent tree
@@ -347,4 +336,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
em->in_tree = 0;
return ret;
}
-EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd1..9b99886562d0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
file_key.offset = pos;
btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
sizeof(*item));
if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
} else {
ins_size = csum_size;
}
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
ins_size);
+ path->leave_spinning = 0;
if (ret < 0)
goto fail_unlock;
if (ret != 0) {
@@ -776,7 +780,6 @@ found:
item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
btrfs_item_size_nr(leaf, path->slots[0]));
eb_token = NULL;
- cond_resched();
next_sector:
if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
eb_token = NULL;
}
btrfs_mark_buffer_dirty(path->nodes[0]);
- cond_resched();
if (total_bytes < sums->len) {
btrfs_release_path(root, path);
+ cond_resched();
goto again;
}
out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc78954861b3..1d51dc38bb49 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
return 0;
}
-int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
-{
- return 0;
-#if 0
- struct btrfs_path *path;
- struct btrfs_key found_key;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *extent;
- u64 last_offset = 0;
- int nritems;
- int slot;
- int found_type;
- int ret;
- int err = 0;
- u64 extent_end = 0;
-
- path = btrfs_alloc_path();
- ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
- last_offset, 0);
- while (1) {
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- goto out;
- nritems = btrfs_header_nritems(path->nodes[0]);
- }
- slot = path->slots[0];
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
- if (found_key.objectid != inode->i_ino)
- break;
- if (found_key.type != BTRFS_EXTENT_DATA_KEY)
- goto out;
-
- if (found_key.offset < last_offset) {
- WARN_ON(1);
- btrfs_print_leaf(root, leaf);
- printk(KERN_ERR "inode %lu found offset %llu "
- "expected %llu\n", inode->i_ino,
- (unsigned long long)found_key.offset,
- (unsigned long long)last_offset);
- err = 1;
- goto out;
- }
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(leaf, extent);
- if (found_type == BTRFS_FILE_EXTENT_REG) {
- extent_end = found_key.offset +
- btrfs_file_extent_num_bytes(leaf, extent);
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- struct btrfs_item *item;
- item = btrfs_item_nr(leaf, slot);
- extent_end = found_key.offset +
- btrfs_file_extent_inline_len(leaf, extent);
- extent_end = (extent_end + root->sectorsize - 1) &
- ~((u64)root->sectorsize - 1);
- }
- last_offset = extent_end;
- path->slots[0]++;
- }
- if (0 && last_offset < inode->i_size) {
- WARN_ON(1);
- btrfs_print_leaf(root, leaf);
- printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
- inode->i_ino, (unsigned long long)last_offset,
- (unsigned long long)inode->i_size);
- err = 1;
-
- }
-out:
- btrfs_free_path(path);
- return err;
-#endif
-}
-
/*
* this is very complex, but the basic idea is to drop all extents
* in the range start - end. hint_block is filled in with a block number
@@ -363,15 +286,16 @@ out:
*/
noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
- u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
+ u64 start, u64 end, u64 locked_end,
+ u64 inline_limit, u64 *hint_byte)
{
u64 extent_end = 0;
- u64 locked_end = end;
u64 search_start = start;
u64 leaf_start;
u64 ram_bytes = 0;
u64 orig_parent = 0;
u64 disk_bytenr = 0;
+ u64 orig_locked_end = locked_end;
u8 compression;
u8 encryption;
u16 other_encoding = 0;
@@ -606,6 +530,7 @@ next_slot:
btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
btrfs_release_path(root, path);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*extent));
BUG_ON(ret);
@@ -639,17 +564,22 @@ next_slot:
ram_bytes);
btrfs_set_file_extent_type(leaf, extent, found_type);
+ btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_set_lock_blocking(path->nodes[0]);
if (disk_bytenr != 0) {
ret = btrfs_update_extent_ref(trans, root,
- disk_bytenr, orig_parent,
+ disk_bytenr,
+ le64_to_cpu(old.disk_num_bytes),
+ orig_parent,
leaf->start,
root->root_key.objectid,
trans->transid, ins.objectid);
BUG_ON(ret);
}
+ path->leave_spinning = 0;
btrfs_release_path(root, path);
if (disk_bytenr != 0)
inode_add_bytes(inode, extent_end - end);
@@ -678,11 +608,10 @@ next_slot:
}
out:
btrfs_free_path(path);
- if (locked_end > end) {
- unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
- GFP_NOFS);
+ if (locked_end > orig_locked_end) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
+ locked_end - 1, GFP_NOFS);
}
- btrfs_check_file(root, inode);
return ret;
}
@@ -824,7 +753,7 @@ again:
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
BUG_ON(ret);
- goto done;
+ goto release;
} else if (split == start) {
if (locked_end < extent_end) {
ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
@@ -912,7 +841,7 @@ again:
btrfs_set_file_extent_other_encoding(leaf, fi, 0);
if (orig_parent != leaf->start) {
- ret = btrfs_update_extent_ref(trans, root, bytenr,
+ ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
orig_parent, leaf->start,
root->root_key.objectid,
trans->transid, inode->i_ino);
@@ -920,6 +849,8 @@ again:
}
done:
btrfs_mark_buffer_dirty(leaf);
+
+release:
btrfs_release_path(root, path);
if (split_end && split == start) {
split = end;
@@ -1125,7 +1056,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
if (will_write) {
btrfs_fdatawrite_range(inode->i_mapping, pos,
pos + write_bytes - 1,
- WB_SYNC_NONE);
+ WB_SYNC_ALL);
} else {
balance_dirty_pages_ratelimited_nr(inode->i_mapping,
num_pages);
@@ -1155,6 +1086,20 @@ out_nolock:
page_cache_release(pinned[1]);
*ppos = pos;
+ /*
+ * we want to make sure fsync finds this change
+ * but we haven't joined a transaction running right now.
+ *
+ * Later on, someone is sure to update the inode and get the
+ * real transid recorded.
+ *
+ * We set last_trans now to the fs_info generation + 1,
+ * this will either be one more than the running transaction
+ * or the generation used for the next transaction if there isn't
+ * one running right now.
+ */
+ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+
if (num_written > 0 && will_write) {
struct btrfs_trans_handle *trans;
@@ -1167,8 +1112,11 @@ out_nolock:
ret = btrfs_log_dentry_safe(trans, root,
file->f_dentry);
if (ret == 0) {
- btrfs_sync_log(trans, root);
- btrfs_end_transaction(trans, root);
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ btrfs_end_transaction(trans, root);
+ else
+ btrfs_commit_transaction(trans, root);
} else {
btrfs_commit_transaction(trans, root);
}
@@ -1185,6 +1133,18 @@ out_nolock:
int btrfs_release_file(struct inode *inode, struct file *filp)
{
+ /*
+ * ordered_data_close is set by settattr when we are about to truncate
+ * a file from a non-zero size to a zero size. This tries to
+ * flush down new bytes that may have been written if the
+ * application were using truncate to replace a file in place.
+ */
+ if (BTRFS_I(inode)->ordered_data_close) {
+ BTRFS_I(inode)->ordered_data_close = 0;
+ btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+ if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ filemap_flush(inode->i_mapping);
+ }
if (filp->private_data)
btrfs_ioctl_trans_end(filp);
return 0;
@@ -1260,8 +1220,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (ret > 0) {
ret = btrfs_commit_transaction(trans, root);
} else {
- btrfs_sync_log(trans, root);
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ ret = btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_commit_transaction(trans, root);
}
mutex_lock(&dentry->d_inode->i_mutex);
out:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d1e5f0e84c58..0bc93657b460 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
#include <linux/sched.h>
#include "ctree.h"
+#include "free-space-cache.h"
+#include "transaction.h"
+
+struct btrfs_free_space {
+ struct rb_node bytes_index;
+ struct rb_node offset_index;
+ u64 offset;
+ u64 bytes;
+};
static int tree_insert_offset(struct rb_root *root, u64 offset,
struct rb_node *node)
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
}
/*
- * searches the tree for the given offset. If contains is set we will return
- * the free space that contains the given offset. If contains is not set we
- * will return the free space that starts at or after the given offset and is
- * at least bytes long.
+ * searches the tree for the given offset.
+ *
+ * fuzzy == 1: this is used for allocations where we are given a hint of where
+ * to look for free space. Because the hint may not be completely on an offset
+ * mark, or the hint may no longer point to free space we need to fudge our
+ * results a bit. So we look for free space starting at or after offset with at
+ * least bytes size. We prefer to find as close to the given offset as we can.
+ * Also if the offset is within a free space range, then we will return the free
+ * space that contains the given offset, which means we can return a free space
+ * chunk with an offset before the provided offset.
+ *
+ * fuzzy == 0: this is just a normal tree search. Give us the free space that
+ * starts at the given offset which is at least bytes size, and if its not there
+ * return NULL.
*/
static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
u64 offset, u64 bytes,
- int contains)
+ int fuzzy)
{
struct rb_node *n = root->rb_node;
struct btrfs_free_space *entry, *ret = NULL;
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
entry = rb_entry(n, struct btrfs_free_space, offset_index);
if (offset < entry->offset) {
- if (!contains &&
+ if (fuzzy &&
(!ret || entry->offset < ret->offset) &&
(bytes <= entry->bytes))
ret = entry;
n = n->rb_left;
} else if (offset > entry->offset) {
- if ((entry->offset + entry->bytes - 1) >= offset &&
+ if (fuzzy &&
+ (entry->offset + entry->bytes - 1) >= offset &&
bytes <= entry->bytes) {
ret = entry;
break;
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
int ret = 0;
+ BUG_ON(!info->bytes);
ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
&info->offset_index);
if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
return ret;
}
-static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes)
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes)
{
struct btrfs_free_space *right_info;
struct btrfs_free_space *left_info;
struct btrfs_free_space *info = NULL;
- struct btrfs_free_space *alloc_info;
int ret = 0;
- alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
- if (!alloc_info)
+ info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+ if (!info)
return -ENOMEM;
+ info->offset = offset;
+ info->bytes = bytes;
+
+ spin_lock(&block_group->tree_lock);
+
/*
* first we want to see if there is free space adjacent to the range we
* are adding, if there is remove that struct and add a new one to
* cover the entire range
*/
right_info = tree_search_offset(&block_group->free_space_offset,
- offset+bytes, 0, 1);
+ offset+bytes, 0, 0);
left_info = tree_search_offset(&block_group->free_space_offset,
offset-1, 0, 1);
- if (right_info && right_info->offset == offset+bytes) {
+ if (right_info) {
unlink_free_space(block_group, right_info);
- info = right_info;
- info->offset = offset;
- info->bytes += bytes;
- } else if (right_info && right_info->offset != offset+bytes) {
- printk(KERN_ERR "btrfs adding space in the middle of an "
- "existing free space area. existing: "
- "offset=%llu, bytes=%llu. new: offset=%llu, "
- "bytes=%llu\n", (unsigned long long)right_info->offset,
- (unsigned long long)right_info->bytes,
- (unsigned long long)offset,
- (unsigned long long)bytes);
- BUG();
+ info->bytes += right_info->bytes;
+ kfree(right_info);
}
- if (left_info) {
+ if (left_info && left_info->offset + left_info->bytes == offset) {
unlink_free_space(block_group, left_info);
-
- if (unlikely((left_info->offset + left_info->bytes) !=
- offset)) {
- printk(KERN_ERR "btrfs free space to the left "
- "of new free space isn't "
- "quite right. existing: offset=%llu, "
- "bytes=%llu. new: offset=%llu, bytes=%llu\n",
- (unsigned long long)left_info->offset,
- (unsigned long long)left_info->bytes,
- (unsigned long long)offset,
- (unsigned long long)bytes);
- BUG();
- }
-
- if (info) {
- info->offset = left_info->offset;
- info->bytes += left_info->bytes;
- kfree(left_info);
- } else {
- info = left_info;
- info->bytes += bytes;
- }
+ info->offset = left_info->offset;
+ info->bytes += left_info->bytes;
+ kfree(left_info);
}
- if (info) {
- ret = link_free_space(block_group, info);
- if (!ret)
- info = NULL;
- goto out;
- }
-
- info = alloc_info;
- alloc_info = NULL;
- info->offset = offset;
- info->bytes = bytes;
-
ret = link_free_space(block_group, info);
if (ret)
kfree(info);
-out:
+
+ spin_unlock(&block_group->tree_lock);
+
if (ret) {
printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
- if (ret == -EEXIST)
- BUG();
+ BUG_ON(ret == -EEXIST);
}
- kfree(alloc_info);
-
return ret;
}
-static int
-__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes)
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes)
{
struct btrfs_free_space *info;
int ret = 0;
+ spin_lock(&block_group->tree_lock);
+
info = tree_search_offset(&block_group->free_space_offset, offset, 0,
1);
-
if (info && info->offset == offset) {
if (info->bytes < bytes) {
printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
(unsigned long long)bytes);
WARN_ON(1);
ret = -EINVAL;
+ spin_unlock(&block_group->tree_lock);
goto out;
}
unlink_free_space(block_group, info);
if (info->bytes == bytes) {
kfree(info);
+ spin_unlock(&block_group->tree_lock);
goto out;
}
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
info->bytes -= bytes;
ret = link_free_space(block_group, info);
+ spin_unlock(&block_group->tree_lock);
BUG_ON(ret);
} else if (info && info->offset < offset &&
info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,37 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
*/
kfree(info);
}
-
+ spin_unlock(&block_group->tree_lock);
/* step two, insert a new info struct to cover anything
* before the hole
*/
- ret = __btrfs_add_free_space(block_group, old_start,
- offset - old_start);
+ ret = btrfs_add_free_space(block_group, old_start,
+ offset - old_start);
BUG_ON(ret);
} else {
+ spin_unlock(&block_group->tree_lock);
+ if (!info) {
+ printk(KERN_ERR "couldn't find space %llu to free\n",
+ (unsigned long long)offset);
+ printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
+ block_group->cached,
+ (unsigned long long)block_group->key.objectid,
+ (unsigned long long)block_group->key.offset);
+ btrfs_dump_free_space(block_group, bytes);
+ } else if (info) {
+ printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
+ "but wanted offset=%llu bytes=%llu\n",
+ (unsigned long long)info->offset,
+ (unsigned long long)info->bytes,
+ (unsigned long long)offset,
+ (unsigned long long)bytes);
+ }
WARN_ON(1);
}
out:
return ret;
}
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes)
-{
- int ret;
- struct btrfs_free_space *sp;
-
- mutex_lock(&block_group->alloc_mutex);
- ret = __btrfs_add_free_space(block_group, offset, bytes);
- sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
- BUG_ON(!sp);
- mutex_unlock(&block_group->alloc_mutex);
-
- return ret;
-}
-
-int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes)
-{
- int ret;
- struct btrfs_free_space *sp;
-
- ret = __btrfs_add_free_space(block_group, offset, bytes);
- sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
- BUG_ON(!sp);
-
- return ret;
-}
-
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes)
-{
- int ret = 0;
-
- mutex_lock(&block_group->alloc_mutex);
- ret = __btrfs_remove_free_space(block_group, offset, bytes);
- mutex_unlock(&block_group->alloc_mutex);
-
- return ret;
-}
-
-int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes)
-{
- int ret;
-
- ret = __btrfs_remove_free_space(block_group, offset, bytes);
-
- return ret;
-}
-
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
u64 bytes)
{
@@ -408,6 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes)
count++;
+ printk(KERN_ERR "entry offset %llu, bytes %llu\n",
+ (unsigned long long)info->offset,
+ (unsigned long long)info->bytes);
}
printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
"\n", count);
@@ -428,68 +384,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
return ret;
}
+/*
+ * for a given cluster, put all of its extents back into the free
+ * space cache. If the block group passed doesn't match the block group
+ * pointed to by the cluster, someone else raced in and freed the
+ * cluster already. In that case, we just return without changing anything
+ */
+static int
+__btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster)
+{
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+
+ spin_lock(&cluster->lock);
+ if (cluster->block_group != block_group)
+ goto out;
+
+ cluster->window_start = 0;
+ node = rb_first(&cluster->root);
+ while(node) {
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ node = rb_next(&entry->offset_index);
+ rb_erase(&entry->offset_index, &cluster->root);
+ link_free_space(block_group, entry);
+ }
+ list_del_init(&cluster->block_group_list);
+
+ btrfs_put_block_group(cluster->block_group);
+ cluster->block_group = NULL;
+ cluster->root.rb_node = NULL;
+out:
+ spin_unlock(&cluster->lock);
+ return 0;
+}
+
void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
{
struct btrfs_free_space *info;
struct rb_node *node;
+ struct btrfs_free_cluster *cluster;
+ struct btrfs_free_cluster *safe;
+
+ spin_lock(&block_group->tree_lock);
+
+ list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
+ block_group_list) {
+
+ WARN_ON(cluster->block_group != block_group);
+ __btrfs_return_cluster_to_free_space(block_group, cluster);
+ }
- mutex_lock(&block_group->alloc_mutex);
while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
info = rb_entry(node, struct btrfs_free_space, bytes_index);
unlink_free_space(block_group, info);
kfree(info);
if (need_resched()) {
- mutex_unlock(&block_group->alloc_mutex);
+ spin_unlock(&block_group->tree_lock);
cond_resched();
- mutex_lock(&block_group->alloc_mutex);
+ spin_lock(&block_group->tree_lock);
}
}
- mutex_unlock(&block_group->alloc_mutex);
+ spin_unlock(&block_group->tree_lock);
}
-#if 0
-static struct btrfs_free_space *btrfs_find_free_space_offset(struct
- btrfs_block_group_cache
- *block_group, u64 offset,
- u64 bytes)
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes, u64 empty_size)
{
- struct btrfs_free_space *ret;
+ struct btrfs_free_space *entry = NULL;
+ u64 ret = 0;
- mutex_lock(&block_group->alloc_mutex);
- ret = tree_search_offset(&block_group->free_space_offset, offset,
- bytes, 0);
- mutex_unlock(&block_group->alloc_mutex);
+ spin_lock(&block_group->tree_lock);
+ entry = tree_search_offset(&block_group->free_space_offset, offset,
+ bytes + empty_size, 1);
+ if (!entry)
+ entry = tree_search_bytes(&block_group->free_space_bytes,
+ offset, bytes + empty_size);
+ if (entry) {
+ unlink_free_space(block_group, entry);
+ ret = entry->offset;
+ entry->offset += bytes;
+ entry->bytes -= bytes;
+
+ if (!entry->bytes)
+ kfree(entry);
+ else
+ link_free_space(block_group, entry);
+ }
+ spin_unlock(&block_group->tree_lock);
return ret;
}
-static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
- btrfs_block_group_cache
- *block_group, u64 offset,
- u64 bytes)
+/*
+ * given a cluster, put all of its extents back into the free space
+ * cache. If a block group is passed, this function will only free
+ * a cluster that belongs to the passed block group.
+ *
+ * Otherwise, it'll get a reference on the block group pointed to by the
+ * cluster and remove the cluster from it.
+ */
+int btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster)
{
- struct btrfs_free_space *ret;
+ int ret;
- mutex_lock(&block_group->alloc_mutex);
+ /* first, get a safe pointer to the block group */
+ spin_lock(&cluster->lock);
+ if (!block_group) {
+ block_group = cluster->block_group;
+ if (!block_group) {
+ spin_unlock(&cluster->lock);
+ return 0;
+ }
+ } else if (cluster->block_group != block_group) {
+ /* someone else has already freed it don't redo their work */
+ spin_unlock(&cluster->lock);
+ return 0;
+ }
+ atomic_inc(&block_group->count);
+ spin_unlock(&cluster->lock);
- ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
- mutex_unlock(&block_group->alloc_mutex);
+ /* now return any extents the cluster had on it */
+ spin_lock(&block_group->tree_lock);
+ ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+ spin_unlock(&block_group->tree_lock);
+ /* finally drop our ref */
+ btrfs_put_block_group(block_group);
return ret;
}
-#endif
-struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
- *block_group, u64 offset,
- u64 bytes)
+/*
+ * given a cluster, try to allocate 'bytes' from it, returns 0
+ * if it couldn't find anything suitably large, or a logical disk offset
+ * if things worked out
+ */
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster, u64 bytes,
+ u64 min_start)
{
- struct btrfs_free_space *ret = NULL;
+ struct btrfs_free_space *entry = NULL;
+ struct rb_node *node;
+ u64 ret = 0;
- ret = tree_search_offset(&block_group->free_space_offset, offset,
- bytes, 0);
- if (!ret)
- ret = tree_search_bytes(&block_group->free_space_bytes,
- offset, bytes);
+ spin_lock(&cluster->lock);
+ if (bytes > cluster->max_size)
+ goto out;
+ if (cluster->block_group != block_group)
+ goto out;
+
+ node = rb_first(&cluster->root);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+
+ while(1) {
+ if (entry->bytes < bytes || entry->offset < min_start) {
+ struct rb_node *node;
+
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ continue;
+ }
+ ret = entry->offset;
+
+ entry->offset += bytes;
+ entry->bytes -= bytes;
+
+ if (entry->bytes == 0) {
+ rb_erase(&entry->offset_index, &cluster->root);
+ kfree(entry);
+ }
+ break;
+ }
+out:
+ spin_unlock(&cluster->lock);
return ret;
}
+
+/*
+ * here we try to find a cluster of blocks in a block group. The goal
+ * is to find at least bytes free and up to empty_size + bytes free.
+ * We might not find them all in one contiguous area.
+ *
+ * returns zero and sets up cluster if things worked out, otherwise
+ * it returns -enospc
+ */
+int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 empty_size)
+{
+ struct btrfs_free_space *entry = NULL;
+ struct rb_node *node;
+ struct btrfs_free_space *next;
+ struct btrfs_free_space *last;
+ u64 min_bytes;
+ u64 window_start;
+ u64 window_free;
+ u64 max_extent = 0;
+ int total_retries = 0;
+ int ret;
+
+ /* for metadata, allow allocates with more holes */
+ if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ /*
+ * we want to do larger allocations when we are
+ * flushing out the delayed refs, it helps prevent
+ * making more work as we go along.
+ */
+ if (trans->transaction->delayed_refs.flushing)
+ min_bytes = max(bytes, (bytes + empty_size) >> 1);
+ else
+ min_bytes = max(bytes, (bytes + empty_size) >> 4);
+ } else
+ min_bytes = max(bytes, (bytes + empty_size) >> 2);
+
+ spin_lock(&block_group->tree_lock);
+ spin_lock(&cluster->lock);
+
+ /* someone already found a cluster, hooray */
+ if (cluster->block_group) {
+ ret = 0;
+ goto out;
+ }
+again:
+ min_bytes = min(min_bytes, bytes + empty_size);
+ entry = tree_search_bytes(&block_group->free_space_bytes,
+ offset, min_bytes);
+ if (!entry) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ window_start = entry->offset;
+ window_free = entry->bytes;
+ last = entry;
+ max_extent = entry->bytes;
+
+ while(1) {
+ /* out window is just right, lets fill it */
+ if (window_free >= bytes + empty_size)
+ break;
+
+ node = rb_next(&last->offset_index);
+ if (!node) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ next = rb_entry(node, struct btrfs_free_space, offset_index);
+
+ /*
+ * we haven't filled the empty size and the window is
+ * very large. reset and try again
+ */
+ if (next->offset - window_start > (bytes + empty_size) * 2) {
+ entry = next;
+ window_start = entry->offset;
+ window_free = entry->bytes;
+ last = entry;
+ max_extent = 0;
+ total_retries++;
+ if (total_retries % 256 == 0) {
+ if (min_bytes >= (bytes + empty_size)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ /*
+ * grow our allocation a bit, we're not having
+ * much luck
+ */
+ min_bytes *= 2;
+ goto again;
+ }
+ } else {
+ last = next;
+ window_free += next->bytes;
+ if (entry->bytes > max_extent)
+ max_extent = entry->bytes;
+ }
+ }
+
+ cluster->window_start = entry->offset;
+
+ /*
+ * now we've found our entries, pull them out of the free space
+ * cache and put them into the cluster rbtree
+ *
+ * The cluster includes an rbtree, but only uses the offset index
+ * of each free space cache entry.
+ */
+ while(1) {
+ node = rb_next(&entry->offset_index);
+ unlink_free_space(block_group, entry);
+ ret = tree_insert_offset(&cluster->root, entry->offset,
+ &entry->offset_index);
+ BUG_ON(ret);
+
+ if (!node || entry == last)
+ break;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ }
+ ret = 0;
+ cluster->max_size = max_extent;
+ atomic_inc(&block_group->count);
+ list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
+ cluster->block_group = block_group;
+out:
+ spin_unlock(&cluster->lock);
+ spin_unlock(&block_group->tree_lock);
+
+ return ret;
+}
+
+/*
+ * simple code to zero out a cluster
+ */
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
+{
+ spin_lock_init(&cluster->lock);
+ spin_lock_init(&cluster->refill_lock);
+ cluster->root.rb_node = NULL;
+ cluster->max_size = 0;
+ INIT_LIST_HEAD(&cluster->block_group_list);
+ cluster->block_group = NULL;
+}
+
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_FREE_SPACE_CACHE
+#define __BTRFS_FREE_SPACE_CACHE
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytenr, u64 size);
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytenr, u64 size);
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
+ *block_group);
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes, u64 empty_size);
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytes);
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
+int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 empty_size);
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster, u64 bytes,
+ u64 min_start);
+int btrfs_return_cluster_to_free_space(
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster);
+#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a4..6b627c611808 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
+
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index cc7334d833c9..9abbced1123d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
}
path = btrfs_alloc_path();
BUG_ON(!path);
- search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
+ search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
search_key.objectid = search_start;
search_key.type = 0;
search_key.offset = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d4f948bc22a..90c23eb28829 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
-struct kmem_cache *btrfs_bit_radix_cachep;
struct kmem_cache *btrfs_path_cachep;
#define S_SHIFT 12
@@ -134,6 +133,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
btrfs_set_trans_block_group(trans, inode);
key.objectid = inode->i_ino;
@@ -167,9 +167,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_CACHE_SIZE);
- kaddr = kmap(cpage);
+ kaddr = kmap_atomic(cpage, KM_USER0);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap(cpage);
+ kunmap_atomic(kaddr, KM_USER0);
i++;
ptr += cur_size;
@@ -204,7 +204,7 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 start, u64 end,
size_t compressed_size,
@@ -233,7 +233,7 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
}
ret = btrfs_drop_extents(trans, root, inode, start,
- aligned_end, start, &hint_byte);
+ aligned_end, aligned_end, start, &hint_byte);
BUG_ON(ret);
if (isize > actual_end)
@@ -854,11 +854,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
u64 cur_end;
int limit = 10 * 1024 * 1042;
- if (!btrfs_test_opt(root, COMPRESS)) {
- return cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
- }
-
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
EXTENT_DELALLOC, 1, 0, GFP_NOFS);
while (start < end) {
@@ -935,7 +930,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
* If no cow copies or snapshots exist, we write directly to the existing
* blocks on disk
*/
-static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+static noinline int run_delalloc_nocow(struct inode *inode,
+ struct page *locked_page,
u64 start, u64 end, int *page_started, int force,
unsigned long *nr_written)
{
@@ -1133,6 +1129,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
unsigned long *nr_written)
{
int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
if (btrfs_test_flag(inode, NODATACOW))
ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1137,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
else if (btrfs_test_flag(inode, PREALLOC))
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
+ else if (!btrfs_test_opt(root, COMPRESS))
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1);
else
ret = cow_file_range_async(inode, locked_page, start, end,
page_started, nr_written);
-
return ret;
}
@@ -1439,6 +1438,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct inode *inode, u64 file_pos,
u64 disk_bytenr, u64 disk_num_bytes,
u64 num_bytes, u64 ram_bytes,
+ u64 locked_end,
u8 compression, u8 encryption,
u16 other_encoding, int extent_type)
{
@@ -1453,8 +1453,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
+ path->leave_spinning = 1;
ret = btrfs_drop_extents(trans, root, inode, file_pos,
- file_pos + num_bytes, file_pos, &hint);
+ file_pos + num_bytes, locked_end,
+ file_pos, &hint);
BUG_ON(ret);
ins.objectid = inode->i_ino;
@@ -1475,6 +1477,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, fi, compression);
btrfs_set_file_extent_encryption(leaf, fi, encryption);
btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+
+ btrfs_unlock_up_safe(path, 1);
+ btrfs_set_lock_blocking(leaf);
+
btrfs_mark_buffer_dirty(leaf);
inode_add_bytes(inode, num_bytes);
@@ -1487,11 +1493,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
root->root_key.objectid,
trans->transid, inode->i_ino, &ins);
BUG_ON(ret);
-
btrfs_free_path(path);
+
return 0;
}
+/*
+ * helper function for btrfs_finish_ordered_io, this
+ * just reads in some of the csum leaves to prime them into ram
+ * before we start the transaction. It limits the amount of btree
+ * reads required while inside the transaction.
+ */
+static noinline void reada_csum(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_ordered_extent *ordered_extent)
+{
+ struct btrfs_ordered_sum *sum;
+ u64 bytenr;
+
+ sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
+ list);
+ bytenr = sum->sums[0].bytenr;
+
+ /*
+ * we don't care about the results, the point of this search is
+ * just to get the btree leaves into ram
+ */
+ btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
+}
+
/* as ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
@@ -1500,8 +1530,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- struct btrfs_ordered_extent *ordered_extent;
+ struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_path *path;
int compressed = 0;
int ret;
@@ -1509,9 +1540,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
if (!ret)
return 0;
+ /*
+ * before we join the transaction, try to do some of our IO.
+ * This will limit the amount of IO that we have to do with
+ * the transaction running. We're unlikely to need to do any
+ * IO if the file extents are new, the disk_i_size checks
+ * covers the most common case.
+ */
+ if (start < BTRFS_I(inode)->disk_i_size) {
+ path = btrfs_alloc_path();
+ if (path) {
+ ret = btrfs_lookup_file_extent(NULL, root, path,
+ inode->i_ino,
+ start, 0);
+ ordered_extent = btrfs_lookup_ordered_extent(inode,
+ start);
+ if (!list_empty(&ordered_extent->list)) {
+ btrfs_release_path(root, path);
+ reada_csum(root, path, ordered_extent);
+ }
+ btrfs_free_path(path);
+ }
+ }
+
trans = btrfs_join_transaction(root, 1);
- ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+ if (!ordered_extent)
+ ordered_extent = btrfs_lookup_ordered_extent(inode, start);
BUG_ON(!ordered_extent);
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
goto nocow;
@@ -1536,6 +1591,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->disk_len,
ordered_extent->len,
ordered_extent->len,
+ ordered_extent->file_offset +
+ ordered_extent->len,
compressed, 0, 0,
BTRFS_FILE_EXTENT_REG);
BUG_ON(ret);
@@ -1765,10 +1822,12 @@ good:
return 0;
zeroit:
- printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
- "private %llu\n", page->mapping->host->i_ino,
- (unsigned long long)start, csum,
- (unsigned long long)private);
+ if (printk_ratelimit()) {
+ printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
+ "private %llu\n", page->mapping->host->i_ino,
+ (unsigned long long)start, csum,
+ (unsigned long long)private);
+ }
memset(kaddr + offset, 1, end - start + 1);
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
@@ -1957,6 +2016,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
}
/*
+ * very simple check to peek ahead in the leaf looking for xattrs. If we
+ * don't find any xattrs, we know there can't be any acls.
+ *
+ * slot is the slot the inode is in, objectid is the objectid of the inode
+ */
+static noinline int acls_after_inode_item(struct extent_buffer *leaf,
+ int slot, u64 objectid)
+{
+ u32 nritems = btrfs_header_nritems(leaf);
+ struct btrfs_key found_key;
+ int scanned = 0;
+
+ slot++;
+ while (slot < nritems) {
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /* we found a different objectid, there must not be acls */
+ if (found_key.objectid != objectid)
+ return 0;
+
+ /* we found an xattr, assume we've got an acl */
+ if (found_key.type == BTRFS_XATTR_ITEM_KEY)
+ return 1;
+
+ /*
+ * we found a key greater than an xattr key, there can't
+ * be any acls later on
+ */
+ if (found_key.type > BTRFS_XATTR_ITEM_KEY)
+ return 0;
+
+ slot++;
+ scanned++;
+
+ /*
+ * it goes inode, inode backrefs, xattrs, extents,
+ * so if there are a ton of hard links to an inode there can
+ * be a lot of backrefs. Don't waste time searching too hard,
+ * this is just an optimization
+ */
+ if (scanned >= 8)
+ break;
+ }
+ /* we hit the end of the leaf before we found an xattr or
+ * something larger than an xattr. We have to assume the inode
+ * has acls
+ */
+ return 1;
+}
+
+/*
* read an inode from the btree into the in-memory inode
*/
void btrfs_read_locked_inode(struct inode *inode)
@@ -1967,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode)
struct btrfs_timespec *tspec;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key location;
+ int maybe_acls;
u64 alloc_group_block;
u32 rdev;
int ret;
@@ -2013,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode)
alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+ /*
+ * try to precache a NULL acl entry for files that don't have
+ * any xattrs or acls
+ */
+ maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
+ if (!maybe_acls) {
+ BTRFS_I(inode)->i_acl = NULL;
+ BTRFS_I(inode)->i_default_acl = NULL;
+ }
+
BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
alloc_group_block, 0);
btrfs_free_path(path);
@@ -2101,6 +2222,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
+ path->leave_spinning = 1;
ret = btrfs_lookup_inode(trans, root, path,
&BTRFS_I(inode)->location, 1);
if (ret) {
@@ -2147,6 +2269,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto err;
}
+ path->leave_spinning = 1;
di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
name, name_len, -1);
if (IS_ERR(di)) {
@@ -2190,8 +2313,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
inode, dir->i_ino);
BUG_ON(ret != 0 && ret != -ENOENT);
- if (ret != -ENOENT)
- BTRFS_I(dir)->log_dirty_trans = trans->transid;
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
dir, index);
@@ -2224,6 +2345,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, dir);
+
+ btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
@@ -2498,6 +2622,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
key.type = (u8)-1;
search_again:
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto error;
@@ -2644,6 +2769,7 @@ delete:
break;
}
if (found_extent) {
+ btrfs_set_path_blocking(path);
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes,
leaf->start, root_owner,
@@ -2818,6 +2944,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
err = btrfs_drop_extents(trans, root, inode,
cur_offset,
cur_offset + hole_size,
+ block_end,
cur_offset, &hint_byte);
if (err)
break;
@@ -2848,11 +2975,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
- if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
- err = btrfs_cont_expand(inode, attr->ia_size);
- if (err)
- return err;
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ if (attr->ia_size > inode->i_size) {
+ err = btrfs_cont_expand(inode, attr->ia_size);
+ if (err)
+ return err;
+ } else if (inode->i_size > 0 &&
+ attr->ia_size == 0) {
+
+ /* we're truncating a file that used to have good
+ * data down to zero. Make sure it gets into
+ * the ordered flush list so that any new writes
+ * get down to disk quickly.
+ */
+ BTRFS_I(inode)->ordered_data_close = 1;
+ }
}
err = inode_setattr(inode, attr);
@@ -2972,8 +3109,8 @@ static noinline void init_btrfs_i(struct inode *inode)
{
struct btrfs_inode *bi = BTRFS_I(inode);
- bi->i_acl = NULL;
- bi->i_default_acl = NULL;
+ bi->i_acl = BTRFS_ACL_NOT_CACHED;
+ bi->i_default_acl = BTRFS_ACL_NOT_CACHED;
bi->generation = 0;
bi->sequence = 0;
@@ -2984,13 +3121,14 @@ static noinline void init_btrfs_i(struct inode *inode)
bi->disk_i_size = 0;
bi->flags = 0;
bi->index_cnt = (u64)-1;
- bi->log_dirty_trans = 0;
+ bi->last_unlink_trans = 0;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
extent_io_tree_init(&BTRFS_I(inode)->io_tree,
inode->i_mapping, GFP_NOFS);
extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
inode->i_mapping, GFP_NOFS);
INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+ INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
mutex_init(&BTRFS_I(inode)->extent_mutex);
mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3411,8 +3549,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (dir) {
ret = btrfs_set_inode_index(dir, index);
- if (ret)
+ if (ret) {
+ iput(inode);
return ERR_PTR(ret);
+ }
}
/*
* index_cnt is ignored for everything but a dir,
@@ -3449,6 +3589,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
sizes[0] = sizeof(struct btrfs_inode_item);
sizes[1] = name_len + sizeof(*ref);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
if (ret != 0)
goto fail;
@@ -3494,6 +3635,7 @@ fail:
if (dir)
BTRFS_I(dir)->index_cnt--;
btrfs_free_path(path);
+ iput(inode);
return ERR_PTR(ret);
}
@@ -3727,6 +3869,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
nr = trans->blocks_used;
+
+ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
btrfs_end_transaction_throttle(trans, root);
fail:
if (drop_inode) {
@@ -4292,8 +4436,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
*/
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ struct page *page = vmf->page;
struct inode *inode = fdentry(vma->vm_file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -4306,10 +4451,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
u64 page_end;
ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
- if (ret)
+ if (ret) {
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else /* -ENOSPC, -EIO, etc */
+ ret = VM_FAULT_SIGBUS;
goto out;
+ }
- ret = -EINVAL;
+ ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
lock_page(page);
size = i_size_read(inode);
@@ -4357,6 +4507,8 @@ again:
}
ClearPageChecked(page);
set_page_dirty(page);
+
+ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
@@ -4382,6 +4534,27 @@ static void btrfs_truncate(struct inode *inode)
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
trans = btrfs_start_transaction(root, 1);
+
+ /*
+ * setattr is responsible for setting the ordered_data_close flag,
+ * but that is only tested during the last file release. That
+ * could happen well after the next commit, leaving a great big
+ * window where new writes may get lost if someone chooses to write
+ * to this file after truncating to zero
+ *
+ * The inode doesn't have any dirty data here, and so if we commit
+ * this is a noop. If someone immediately starts writing to the inode
+ * it is very likely we'll catch some of their writes in this
+ * transaction, and the commit will find this file on the ordered
+ * data list with good things to send down.
+ *
+ * This is a best effort solution, there is still a window where
+ * using truncate to replace the contents of the file will
+ * end up with a zero length file after a crash.
+ */
+ if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+ btrfs_add_ordered_operation(trans, root, inode);
+
btrfs_set_trans_block_group(trans, inode);
btrfs_i_size_write(inode, inode->i_size);
@@ -4458,12 +4631,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->i_acl = BTRFS_ACL_NOT_CACHED;
ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
INIT_LIST_HEAD(&ei->i_orphan);
+ INIT_LIST_HEAD(&ei->ordered_operations);
return &ei->vfs_inode;
}
void btrfs_destroy_inode(struct inode *inode)
{
struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
WARN_ON(!list_empty(&inode->i_dentry));
WARN_ON(inode->i_data.nrpages);
@@ -4474,13 +4650,24 @@ void btrfs_destroy_inode(struct inode *inode)
BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
posix_acl_release(BTRFS_I(inode)->i_default_acl);
- spin_lock(&BTRFS_I(inode)->root->list_lock);
+ /*
+ * Make sure we're properly removed from the ordered operation
+ * lists.
+ */
+ smp_mb();
+ if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ list_del_init(&BTRFS_I(inode)->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ }
+
+ spin_lock(&root->list_lock);
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
" list\n", inode->i_ino);
dump_stack();
}
- spin_unlock(&BTRFS_I(inode)->root->list_lock);
+ spin_unlock(&root->list_lock);
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4515,47 +4702,36 @@ void btrfs_destroy_cachep(void)
kmem_cache_destroy(btrfs_trans_handle_cachep);
if (btrfs_transaction_cachep)
kmem_cache_destroy(btrfs_transaction_cachep);
- if (btrfs_bit_radix_cachep)
- kmem_cache_destroy(btrfs_bit_radix_cachep);
if (btrfs_path_cachep)
kmem_cache_destroy(btrfs_path_cachep);
}
-struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
- unsigned long extra_flags,
- void (*ctor)(void *))
-{
- return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD | extra_flags), ctor);
-}
-
int btrfs_init_cachep(void)
{
- btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
- sizeof(struct btrfs_inode),
- 0, init_once);
+ btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+ sizeof(struct btrfs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
if (!btrfs_inode_cachep)
goto fail;
- btrfs_trans_handle_cachep =
- btrfs_cache_create("btrfs_trans_handle_cache",
- sizeof(struct btrfs_trans_handle),
- 0, NULL);
+
+ btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+ sizeof(struct btrfs_trans_handle), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_trans_handle_cachep)
goto fail;
- btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
- sizeof(struct btrfs_transaction),
- 0, NULL);
+
+ btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+ sizeof(struct btrfs_transaction), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_transaction_cachep)
goto fail;
- btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
- sizeof(struct btrfs_path),
- 0, NULL);
+
+ btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+ sizeof(struct btrfs_path), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
if (!btrfs_path_cachep)
goto fail;
- btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
- SLAB_DESTROY_BY_RCU, NULL);
- if (!btrfs_bit_radix_cachep)
- goto fail;
+
return 0;
fail:
btrfs_destroy_cachep();
@@ -4605,8 +4781,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_unlock;
+ /*
+ * we're using rename to replace one file with another.
+ * and the replacement file is large. Start IO on it now so
+ * we don't add too much work to the end of the transaction
+ */
+ if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+ new_inode->i_size &&
+ old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ filemap_flush(old_inode->i_mapping);
+
trans = btrfs_start_transaction(root, 1);
+ /*
+ * make sure the inode gets flushed if it is replacing
+ * something.
+ */
+ if (new_inode && new_inode->i_size &&
+ old_inode && S_ISREG(old_inode->i_mode)) {
+ btrfs_add_ordered_operation(trans, root, old_inode);
+ }
+
+ /*
+ * this is an ugly little race, but the rename is required to make
+ * sure that if we crash, the inode is either at the old name
+ * or the new one. pinning the log transaction lets us make sure
+ * we don't allow a log commit to come in after we unlink the
+ * name but before we add the new name back in.
+ */
+ btrfs_pin_log_trans(root);
+
btrfs_set_trans_block_group(trans, new_dir);
btrfs_inc_nlink(old_dentry->d_inode);
@@ -4614,6 +4818,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime;
+ if (old_dentry->d_parent != new_dentry->d_parent)
+ btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+
ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
old_dentry->d_name.name,
old_dentry->d_name.len);
@@ -4645,7 +4852,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_fail;
+ btrfs_log_new_name(trans, old_inode, old_dir,
+ new_dentry->d_parent);
out_fail:
+
+ /* this btrfs_end_log_trans just allows the current
+ * log-sub transaction to complete
+ */
+ btrfs_end_log_trans(root);
btrfs_end_transaction_throttle(trans, root);
out_unlock:
return ret;
@@ -4813,10 +5027,10 @@ out_fail:
return err;
}
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
- u64 alloc_hint, int mode)
+static int prealloc_file_range(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 start, u64 end,
+ u64 locked_end, u64 alloc_hint, int mode)
{
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 alloc_size;
@@ -4824,10 +5038,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
u64 num_bytes = end - start;
int ret = 0;
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(!trans);
- btrfs_set_trans_block_group(trans, inode);
-
while (num_bytes > 0) {
alloc_size = min(num_bytes, root->fs_info->max_extent);
ret = btrfs_reserve_extent(trans, root, alloc_size,
@@ -4840,7 +5050,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
ret = insert_reserved_file_extent(trans, inode,
cur_offset, ins.objectid,
ins.offset, ins.offset,
- ins.offset, 0, 0, 0,
+ ins.offset, locked_end,
+ 0, 0, 0,
BTRFS_FILE_EXTENT_PREALLOC);
BUG_ON(ret);
num_bytes -= ins.offset;
@@ -4858,7 +5069,6 @@ out:
BUG_ON(ret);
}
- btrfs_end_transaction(trans, root);
return ret;
}
@@ -4870,13 +5080,21 @@ static long btrfs_fallocate(struct inode *inode, int mode,
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
+ u64 locked_end;
u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
struct extent_map *em;
+ struct btrfs_trans_handle *trans;
int ret;
alloc_start = offset & ~mask;
alloc_end = (offset + len + mask) & ~mask;
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
mutex_lock(&inode->i_mutex);
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, alloc_start);
@@ -4884,10 +5102,21 @@ static long btrfs_fallocate(struct inode *inode, int mode,
goto out;
}
+ locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
- alloc_end - 1, GFP_NOFS);
+
+ trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
+ if (!trans) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /* the extent lock is ordered inside the running
+ * transaction
+ */
+ lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ GFP_NOFS);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
@@ -4895,7 +5124,13 @@ static long btrfs_fallocate(struct inode *inode, int mode,
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
unlock_extent(&BTRFS_I(inode)->io_tree,
- alloc_start, alloc_end - 1, GFP_NOFS);
+ alloc_start, locked_end, GFP_NOFS);
+ btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+
+ /*
+ * we can't wait on the range with the transaction
+ * running or with the extent lock held
+ */
btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start);
} else {
@@ -4913,8 +5148,9 @@ static long btrfs_fallocate(struct inode *inode, int mode,
last_byte = min(extent_map_end(em), alloc_end);
last_byte = (last_byte + mask) & ~mask;
if (em->block_start == EXTENT_MAP_HOLE) {
- ret = prealloc_file_range(inode, cur_offset,
- last_byte, alloc_hint, mode);
+ ret = prealloc_file_range(trans, inode, cur_offset,
+ last_byte, locked_end + 1,
+ alloc_hint, mode);
if (ret < 0) {
free_extent_map(em);
break;
@@ -4930,8 +5166,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
break;
}
}
- unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
+ unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
GFP_NOFS);
+
+ btrfs_end_transaction(trans, BTRFS_I(inode)->root);
out:
mutex_unlock(&inode->i_mutex);
return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bca729fc80c8..5e94ea6e1cbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
goto out_dput;
if (!IS_POSIXACL(parent->dentry->d_inode))
- mode &= ~current->fs->umask;
+ mode &= ~current_umask();
error = mnt_want_write(parent->mnt);
if (error)
@@ -461,15 +461,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
-
- if (!vol_args)
- return -ENOMEM;
-
- if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
- ret = -EFAULT;
- goto out;
- }
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
namelen = strlen(vol_args->name);
@@ -483,11 +477,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
*devstr = '\0';
devstr = vol_args->name;
devid = simple_strtoull(devstr, &end, 10);
- printk(KERN_INFO "resizing devid %llu\n", devid);
+ printk(KERN_INFO "resizing devid %llu\n",
+ (unsigned long long)devid);
}
device = btrfs_find_device(root, devid, NULL, NULL);
if (!device) {
- printk(KERN_INFO "resizer unable to find device %llu\n", devid);
+ printk(KERN_INFO "resizer unable to find device %llu\n",
+ (unsigned long long)devid);
ret = -EINVAL;
goto out_unlock;
}
@@ -545,7 +541,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
out_unlock:
mutex_unlock(&root->fs_info->volume_mutex);
-out:
kfree(vol_args);
return ret;
}
@@ -565,15 +560,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
- vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
-
- if (!vol_args)
- return -ENOMEM;
-
- if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
- ret = -EFAULT;
- goto out;
- }
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
namelen = strlen(vol_args->name);
@@ -675,19 +664,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
- if (!vol_args)
- return -ENOMEM;
-
- if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
- ret = -EFAULT;
- goto out;
- }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_init_new_device(root, vol_args->name);
-out:
kfree(vol_args);
return ret;
}
@@ -703,19 +686,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
- vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
- if (!vol_args)
- return -ENOMEM;
-
- if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
- ret = -EFAULT;
- goto out;
- }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_rm_device(root, vol_args->name);
-out:
kfree(vol_args);
return ret;
}
@@ -830,7 +807,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
BUG_ON(!trans);
/* punch hole in destination first */
- btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
+ btrfs_drop_extents(trans, root, inode, off, off + len,
+ off + len, 0, &hint_byte);
/* clone data */
key.objectid = src->i_ino;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 47b0a88c12a2..1c36e5cd8f55 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
/*
* unfortunately, many of the places that currently set a lock to blocking
- * don't end up blocking for every long, and often they don't block
- * at all. For a dbench 50 run, if we don't spin one the blocking bit
+ * don't end up blocking for very long, and often they don't block
+ * at all. For a dbench 50 run, if we don't spin on the blocking bit
* at all, the context switch rate can jump up to 400,000/sec or more.
*
* So, we're still stuck with this crummy spin on the blocking bit,
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
static int btrfs_spin_on_block(struct extent_buffer *eb)
{
int i;
+
for (i = 0; i < 512; i++) {
- cpu_relax();
if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
return 1;
if (need_resched())
break;
+ cpu_relax();
}
return 0;
}
@@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
{
int i;
- spin_nested(eb);
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- return 1;
- spin_unlock(&eb->lock);
-
+ if (btrfs_spin_on_block(eb)) {
+ spin_nested(eb);
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ return 1;
+ spin_unlock(&eb->lock);
+ }
/* spin for a bit on the BLOCKING flag */
for (i = 0; i < 2; i++) {
+ cpu_relax();
if (!btrfs_spin_on_block(eb))
break;
@@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
DEFINE_WAIT(wait);
wait.func = btrfs_wake_function;
+ if (!btrfs_spin_on_block(eb))
+ goto sleep;
+
while(1) {
spin_nested(eb);
@@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
* spin for a bit, and if the blocking flag goes away,
* loop around
*/
+ cpu_relax();
if (btrfs_spin_on_block(eb))
continue;
-
+sleep:
prepare_to_wait_exclusive(&eb->lock_wq, &wait,
TASK_UNINTERRUPTIBLE);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0f..d6f0806c682f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
+
+ /*
+ * we have no more ordered extents for this inode and
+ * no dirty pages. We can safely remove it from the
+ * list of ordered extents
+ */
+ if (RB_EMPTY_ROOT(&tree->tree) &&
+ !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+ list_del_init(&BTRFS_I(inode)->ordered_operations);
+ }
spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
}
/*
+ * this is used during transaction commit to write all the inodes
+ * added to the ordered operation list. These files must be fully on
+ * disk before the transaction commits.
+ *
+ * we have two modes here, one is to just start the IO via filemap_flush
+ * and the other is to wait for all the io. When we wait, we have an
+ * extra check to make sure the ordered operation list really is empty
+ * before we return
+ */
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+{
+ struct btrfs_inode *btrfs_inode;
+ struct inode *inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&root->fs_info->ordered_operations_mutex);
+ spin_lock(&root->fs_info->ordered_extent_lock);
+again:
+ list_splice_init(&root->fs_info->ordered_operations, &splice);
+
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ ordered_operations);
+
+ inode = &btrfs_inode->vfs_inode;
+
+ list_del_init(&btrfs_inode->ordered_operations);
+
+ /*
+ * the inode may be getting freed (in sys_unlink path).
+ */
+ inode = igrab(inode);
+
+ if (!wait && inode) {
+ list_add_tail(&BTRFS_I(inode)->ordered_operations,
+ &root->fs_info->ordered_operations);
+ }
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ if (inode) {
+ if (wait)
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ else
+ filemap_flush(inode->i_mapping);
+ iput(inode);
+ }
+
+ cond_resched();
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ }
+ if (wait && !list_empty(&root->fs_info->ordered_operations))
+ goto again;
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+ return 0;
+}
+
+/*
* Used to start IO or wait for a given ordered extent to finish.
*
* If wait is one, this effectively waits on page writeback for all the pages
@@ -417,7 +489,7 @@ again:
/* start IO across the range first to instantiate any delalloc
* extents
*/
- btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+ btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
/* The compression code will leave pages locked but return from
* writepage without setting the page writeback. Starting again
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
return ret;
}
+
+/*
+ * add a given inode to the list of inodes that must be fully on
+ * disk before a transaction commit finishes.
+ *
+ * This basically gives us the ext3 style data=ordered mode, and it is mostly
+ * used to make sure renamed files are fully on disk.
+ *
+ * It is a noop if the inode is already fully on disk.
+ *
+ * If trans is not null, we'll do a friendly check for a transaction that
+ * is already flushing things and force the IO down ourselves.
+ */
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ u64 last_mod;
+
+ last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
+
+ /*
+ * if this file hasn't been changed since the last transaction
+ * commit, we can safely return without doing anything
+ */
+ if (last_mod < root->fs_info->last_trans_committed)
+ return 0;
+
+ /*
+ * the transaction is already committing. Just start the IO and
+ * don't bother with all of this list nonsense
+ */
+ if (trans && root->fs_info->running_transaction->blocked) {
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ return 0;
+ }
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
+ list_add_tail(&BTRFS_I(inode)->ordered_operations,
+ &root->fs_info->ordered_operations);
+ }
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ return 0;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d6..3d31c8827b01 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 19a4daf03ccb..3536bdb2d7cb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -24,6 +24,7 @@
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
+#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
enum {
Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
- Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
+ Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
+ Opt_ratio, Opt_flushoncommit, Opt_err,
};
static match_table_t tokens = {
@@ -83,6 +85,9 @@ static match_table_t tokens = {
{Opt_compress, "compress"},
{Opt_ssd, "ssd"},
{Opt_noacl, "noacl"},
+ {Opt_notreelog, "notreelog"},
+ {Opt_flushoncommit, "flushoncommit"},
+ {Opt_ratio, "metadata_ratio=%d"},
{Opt_err, NULL},
};
@@ -191,7 +196,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
info->max_extent = max_t(u64,
info->max_extent, root->sectorsize);
printk(KERN_INFO "btrfs: max_extent at %llu\n",
- info->max_extent);
+ (unsigned long long)info->max_extent);
}
break;
case Opt_max_inline:
@@ -206,7 +211,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
root->sectorsize);
}
printk(KERN_INFO "btrfs: max_inline at %llu\n",
- info->max_inline);
+ (unsigned long long)info->max_inline);
}
break;
case Opt_alloc_start:
@@ -216,12 +221,29 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
kfree(num);
printk(KERN_INFO
"btrfs: allocations start at %llu\n",
- info->alloc_start);
+ (unsigned long long)info->alloc_start);
}
break;
case Opt_noacl:
root->fs_info->sb->s_flags &= ~MS_POSIXACL;
break;
+ case Opt_notreelog:
+ printk(KERN_INFO "btrfs: disabling tree log\n");
+ btrfs_set_opt(info->mount_opt, NOTREELOG);
+ break;
+ case Opt_flushoncommit:
+ printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
+ btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
+ break;
+ case Opt_ratio:
+ intarg = 0;
+ match_int(&args[0], &intarg);
+ if (intarg) {
+ info->metadata_ratio = intarg;
+ printk(KERN_INFO "btrfs: metadata ratio %d\n",
+ info->metadata_ratio);
+ }
+ break;
default:
break;
}
@@ -363,9 +385,8 @@ fail_close:
int btrfs_sync_fs(struct super_block *sb, int wait)
{
struct btrfs_trans_handle *trans;
- struct btrfs_root *root;
+ struct btrfs_root *root = btrfs_sb(sb);
int ret;
- root = btrfs_sb(sb);
if (sb->s_flags & MS_RDONLY)
return 0;
@@ -385,6 +406,44 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return ret;
}
+static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+ struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
+ struct btrfs_fs_info *info = root->fs_info;
+
+ if (btrfs_test_opt(root, DEGRADED))
+ seq_puts(seq, ",degraded");
+ if (btrfs_test_opt(root, NODATASUM))
+ seq_puts(seq, ",nodatasum");
+ if (btrfs_test_opt(root, NODATACOW))
+ seq_puts(seq, ",nodatacow");
+ if (btrfs_test_opt(root, NOBARRIER))
+ seq_puts(seq, ",nobarrier");
+ if (info->max_extent != (u64)-1)
+ seq_printf(seq, ",max_extent=%llu",
+ (unsigned long long)info->max_extent);
+ if (info->max_inline != 8192 * 1024)
+ seq_printf(seq, ",max_inline=%llu",
+ (unsigned long long)info->max_inline);
+ if (info->alloc_start != 0)
+ seq_printf(seq, ",alloc_start=%llu",
+ (unsigned long long)info->alloc_start);
+ if (info->thread_pool_size != min_t(unsigned long,
+ num_online_cpus() + 2, 8))
+ seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
+ if (btrfs_test_opt(root, COMPRESS))
+ seq_puts(seq, ",compress");
+ if (btrfs_test_opt(root, SSD))
+ seq_puts(seq, ",ssd");
+ if (btrfs_test_opt(root, NOTREELOG))
+ seq_puts(seq, ",no-treelog");
+ if (btrfs_test_opt(root, FLUSHONCOMMIT))
+ seq_puts(seq, ",flush-on-commit");
+ if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
+ seq_puts(seq, ",noacl");
+ return 0;
+}
+
static void btrfs_write_super(struct super_block *sb)
{
sb->s_dirt = 0;
@@ -589,14 +648,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- vol = kmalloc(sizeof(*vol), GFP_KERNEL);
- if (!vol)
- return -ENOMEM;
-
- if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
- ret = -EFAULT;
- goto out;
- }
+ vol = memdup_user((void __user *)arg, sizeof(*vol));
+ if (IS_ERR(vol))
+ return PTR_ERR(vol);
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
@@ -604,7 +658,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
&btrfs_fs_type, &fs_devices);
break;
}
-out:
+
kfree(vol);
return ret;
}
@@ -630,7 +684,7 @@ static struct super_operations btrfs_super_ops = {
.put_super = btrfs_put_super,
.write_super = btrfs_write_super,
.sync_fs = btrfs_sync_fs,
- .show_options = generic_show_options,
+ .show_options = btrfs_show_options,
.write_inode = btrfs_write_inode,
.dirty_inode = btrfs_dirty_inode,
.alloc_inode = btrfs_alloc_inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4112d53d4f4d..01b143605ec1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
GFP_NOFS);
BUG_ON(!cur_trans);
root->fs_info->generation++;
- root->fs_info->last_alloc = 0;
- root->fs_info->last_data_alloc = 0;
cur_trans->num_writers = 1;
cur_trans->num_joined = 0;
cur_trans->transid = root->fs_info->generation;
@@ -65,6 +63,15 @@ static noinline int join_transaction(struct btrfs_root *root)
cur_trans->use_count = 1;
cur_trans->commit_done = 0;
cur_trans->start_time = get_seconds();
+
+ cur_trans->delayed_refs.root.rb_node = NULL;
+ cur_trans->delayed_refs.num_entries = 0;
+ cur_trans->delayed_refs.num_heads_ready = 0;
+ cur_trans->delayed_refs.num_heads = 0;
+ cur_trans->delayed_refs.flushing = 0;
+ cur_trans->delayed_refs.run_delayed_start = 0;
+ spin_lock_init(&cur_trans->delayed_refs.lock);
+
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +189,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
h->block_group = 0;
h->alloc_exclude_nr = 0;
h->alloc_exclude_start = 0;
+ h->delayed_ref_updates = 0;
+
root->fs_info->running_transaction->use_count++;
mutex_unlock(&root->fs_info->trans_mutex);
return h;
@@ -271,7 +280,6 @@ void btrfs_throttle(struct btrfs_root *root)
if (!root->fs_info->open_ioctl_trans)
wait_current_trans(root);
mutex_unlock(&root->fs_info->trans_mutex);
-
throttle_on_drops(root);
}
@@ -280,6 +288,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_transaction *cur_trans;
struct btrfs_fs_info *info = root->fs_info;
+ int count = 0;
+
+ while (count < 4) {
+ unsigned long cur = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (cur &&
+ trans->transaction->delayed_refs.num_heads_ready > 64) {
+ trans->delayed_ref_updates = 0;
+
+ /*
+ * do a full flush if the transaction is trying
+ * to close
+ */
+ if (trans->transaction->delayed_refs.flushing)
+ cur = 0;
+ btrfs_run_delayed_refs(trans, root, cur);
+ } else {
+ break;
+ }
+ count++;
+ }
mutex_lock(&info->trans_mutex);
cur_trans = info->running_transaction;
@@ -424,9 +453,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
u64 old_root_bytenr;
struct btrfs_root *tree_root = root->fs_info->tree_root;
- btrfs_extent_post_op(trans, root);
btrfs_write_dirty_block_groups(trans, root);
- btrfs_extent_post_op(trans, root);
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +468,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
btrfs_header_level(root->node));
btrfs_set_root_generation(&root->root_item, trans->transid);
- btrfs_extent_post_op(trans, root);
-
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
&root->root_item);
BUG_ON(ret);
btrfs_write_dirty_block_groups(trans, root);
- btrfs_extent_post_op(trans, root);
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
}
return 0;
}
@@ -459,15 +489,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head *next;
struct extent_buffer *eb;
+ int ret;
- btrfs_extent_post_op(trans, fs_info->tree_root);
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
eb = btrfs_lock_root_node(fs_info->tree_root);
- btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
+ btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
- btrfs_extent_post_op(trans, fs_info->tree_root);
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +508,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
root = list_entry(next, struct btrfs_root, dirty_list);
update_cowonly_root(trans, root);
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
}
return 0;
}
@@ -635,6 +671,37 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
}
/*
+ * when dropping snapshots, we generate a ton of delayed refs, and it makes
+ * sense not to join the transaction while it is trying to flush the current
+ * queue of delayed refs out.
+ *
+ * This is used by the drop snapshot code only
+ */
+static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
+{
+ DEFINE_WAIT(wait);
+
+ mutex_lock(&info->trans_mutex);
+ while (info->running_transaction &&
+ info->running_transaction->delayed_refs.flushing) {
+ prepare_to_wait(&info->transaction_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&info->trans_mutex);
+
+ atomic_dec(&info->throttles);
+ wake_up(&info->transaction_throttle);
+
+ schedule();
+
+ atomic_inc(&info->throttles);
+ mutex_lock(&info->trans_mutex);
+ finish_wait(&info->transaction_wait, &wait);
+ }
+ mutex_unlock(&info->trans_mutex);
+ return 0;
+}
+
+/*
* Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
* all of them
*/
@@ -661,7 +728,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
atomic_inc(&root->fs_info->throttles);
while (1) {
+ /*
+ * we don't want to jump in and create a bunch of
+ * delayed refs if the transaction is starting to close
+ */
+ wait_transaction_pre_flush(tree_root->fs_info);
trans = btrfs_start_transaction(tree_root, 1);
+
+ /*
+ * we've joined a transaction, make sure it isn't
+ * closing right now
+ */
+ if (trans->transaction->delayed_refs.flushing) {
+ btrfs_end_transaction(trans, tree_root);
+ continue;
+ }
+
mutex_lock(&root->fs_info->drop_mutex);
ret = btrfs_drop_snapshot(trans, dirty->root);
if (ret != -EAGAIN)
@@ -766,7 +848,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
old = btrfs_lock_root_node(root);
- btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
+ btrfs_cow_block(trans, root, old, NULL, 0, &old);
btrfs_copy_root(trans, root, old, &tmp, objectid);
btrfs_tree_unlock(old);
@@ -894,12 +976,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct extent_io_tree *pinned_copy;
DEFINE_WAIT(wait);
int ret;
+ int should_grow = 0;
+ unsigned long now = get_seconds();
+ int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
+
+ btrfs_run_ordered_operations(root, 0);
+
+ /* make a pass through all the delayed refs we have so far
+ * any runnings procs may add more while we are here
+ */
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ BUG_ON(ret);
+
+ cur_trans = trans->transaction;
+ /*
+ * set the flushing flag so procs in this transaction have to
+ * start sending their work down.
+ */
+ cur_trans->delayed_refs.flushing = 1;
+
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ BUG_ON(ret);
- INIT_LIST_HEAD(&dirty_fs_roots);
mutex_lock(&root->fs_info->trans_mutex);
- if (trans->transaction->in_commit) {
- cur_trans = trans->transaction;
- trans->transaction->use_count++;
+ INIT_LIST_HEAD(&dirty_fs_roots);
+ if (cur_trans->in_commit) {
+ cur_trans->use_count++;
mutex_unlock(&root->fs_info->trans_mutex);
btrfs_end_transaction(trans, root);
@@ -922,7 +1024,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
trans->transaction->in_commit = 1;
trans->transaction->blocked = 1;
- cur_trans = trans->transaction;
if (cur_trans->list.prev != &root->fs_info->trans_list) {
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
@@ -937,6 +1038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
}
+ if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
+ should_grow = 1;
+
do {
int snap_pending = 0;
joined = cur_trans->num_joined;
@@ -949,26 +1053,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (cur_trans->num_writers > 1)
timeout = MAX_SCHEDULE_TIMEOUT;
- else
+ else if (should_grow)
timeout = 1;
mutex_unlock(&root->fs_info->trans_mutex);
- if (snap_pending) {
+ if (flush_on_commit || snap_pending) {
+ if (flush_on_commit)
+ btrfs_start_delalloc_inodes(root);
ret = btrfs_wait_ordered_extents(root, 1);
BUG_ON(ret);
}
- schedule_timeout(timeout);
+ /*
+ * rename don't use btrfs_join_transaction, so, once we
+ * set the transaction to blocked above, we aren't going
+ * to get any new ordered operations. We can safely run
+ * it here and no for sure that nothing new will be added
+ * to the list
+ */
+ btrfs_run_ordered_operations(root, 1);
+
+ smp_mb();
+ if (cur_trans->num_writers > 1 || should_grow)
+ schedule_timeout(timeout);
mutex_lock(&root->fs_info->trans_mutex);
finish_wait(&cur_trans->writer_wait, &wait);
} while (cur_trans->num_writers > 1 ||
- (cur_trans->num_joined != joined));
+ (should_grow && cur_trans->num_joined != joined));
ret = create_pending_snapshots(trans, root->fs_info);
BUG_ON(ret);
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
+
WARN_ON(cur_trans != trans->transaction);
/* btrfs_commit_tree_roots is responsible for getting the
@@ -1032,6 +1152,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_copy_pinned(root, pinned_copy);
trans->transaction->blocked = 0;
+
wake_up(&root->fs_info->transaction_throttle);
wake_up(&root->fs_info->transaction_wait);
@@ -1058,6 +1179,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->trans_mutex);
cur_trans->commit_done = 1;
+
root->fs_info->last_trans_committed = cur_trans->transid;
wake_up(&cur_trans->commit_wait);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f882..94f5bde2b58d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
#ifndef __BTRFS_TRANSACTION__
#define __BTRFS_TRANSACTION__
#include "btrfs_inode.h"
+#include "delayed-ref.h"
struct btrfs_transaction {
u64 transid;
+ /*
+ * total writers in this transaction, it must be zero before the
+ * transaction can end
+ */
unsigned long num_writers;
+
unsigned long num_joined;
int in_commit;
int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
+ struct btrfs_delayed_ref_root delayed_refs;
};
struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
u64 block_group;
u64 alloc_exclude_start;
u64 alloc_exclude_nr;
+ unsigned long delayed_ref_updates;
};
struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570e..b10eacdb1620 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
}
btrfs_release_path(root, path);
- if (is_extent)
- btrfs_extent_post_op(trans, root);
out:
if (path)
btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60fa..db5e212e8445 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
#define LOG_INODE_EXISTS 1
/*
+ * directory trouble cases
+ *
+ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
+ * log, we must force a full commit before doing an fsync of the directory
+ * where the unlink was done.
+ * ---> record transid of last unlink/rename per directory
+ *
+ * mkdir foo/some_dir
+ * normal commit
+ * rename foo/some_dir foo2/some_dir
+ * mkdir foo/some_dir
+ * fsync foo/some_dir/some_file
+ *
+ * The fsync above will unlink the original some_dir without recording
+ * it in its new location (foo2). After a crash, some_dir will be gone
+ * unless the fsync of some_file forces a full commit
+ *
+ * 2) we must log any new names for any file or dir that is in the fsync
+ * log. ---> check inode while renaming/linking.
+ *
+ * 2a) we must log any new names for any file or dir during rename
+ * when the directory they are being removed from was logged.
+ * ---> check inode and old parent dir during rename
+ *
+ * 2a is actually the more important variant. With the extra logging
+ * a crash might unlink the old name without recreating the new one
+ *
+ * 3) after a crash, we must go through any directories with a link count
+ * of zero and redo the rm -rf
+ *
+ * mkdir f1/foo
+ * normal commit
+ * rm -rf f1/foo
+ * fsync(f1)
+ *
+ * The directory f1 was fully removed from the FS, but fsync was never
+ * called on f1, only its parent dir. After a crash the rm -rf must
+ * be replayed. This must be able to recurse down the entire
+ * directory tree. The inode link count fixup code takes care of the
+ * ugly details.
+ */
+
+/*
* stages for the tree walking. The first
* stage (0) is to only pin down the blocks we find
* the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_ALL 2
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
int inode_only);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dirid, int del_all);
/*
* tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
}
/*
+ * This either makes the current running log transaction wait
+ * until you call btrfs_end_log_trans() or it makes any future
+ * log transactions wait until you call btrfs_end_log_trans()
+ */
+int btrfs_pin_log_trans(struct btrfs_root *root)
+{
+ int ret = -ENOENT;
+
+ mutex_lock(&root->log_mutex);
+ atomic_inc(&root->log_writers);
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+/*
* indicate we're done making changes to the log tree
* and wake up anyone waiting to do a sync
*/
-static int end_log_trans(struct btrfs_root *root)
+int btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
smp_mb();
@@ -199,12 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
struct extent_buffer *eb,
struct walk_control *wc, u64 gen)
{
- if (wc->pin) {
- mutex_lock(&log->fs_info->pinned_mutex);
+ if (wc->pin)
btrfs_update_pinned_extents(log->fs_info->extent_root,
eb->start, eb->len, 1);
- mutex_unlock(&log->fs_info->pinned_mutex);
- }
if (btrfs_buffer_uptodate(eb, gen)) {
if (wc->write)
@@ -476,7 +536,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
saved_nbytes = inode_get_bytes(inode);
/* drop any overlapping extents */
ret = btrfs_drop_extents(trans, root, inode,
- start, extent_end, start, &alloc_hint);
+ start, extent_end, extent_end, start, &alloc_hint);
BUG_ON(ret);
if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -603,6 +663,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
ret = link_to_fixup_dir(trans, root, path, location.objectid);
BUG_ON(ret);
+
ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
BUG_ON(ret);
kfree(name);
@@ -804,6 +865,7 @@ conflict_again:
victim_name_len)) {
btrfs_inc_nlink(inode);
btrfs_release_path(root, path);
+
ret = btrfs_unlink_inode(trans, root, dir,
inode, victim_name,
victim_name_len);
@@ -922,13 +984,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
key.offset--;
btrfs_release_path(root, path);
}
- btrfs_free_path(path);
+ btrfs_release_path(root, path);
if (nlink != inode->i_nlink) {
inode->i_nlink = nlink;
btrfs_update_inode(trans, root, inode);
}
BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
+ ret = replay_dir_deletes(trans, root, NULL, path,
+ inode->i_ino, 1);
+ BUG_ON(ret);
+ }
+ btrfs_free_path(path);
+
return 0;
}
@@ -971,9 +1040,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
iput(inode);
- if (key.offset == 0)
- break;
- key.offset--;
+ /*
+ * fixup on a directory may create new entries,
+ * make sure we always look for the highset possible
+ * offset
+ */
+ key.offset = (u64)-1;
}
btrfs_release_path(root, path);
return 0;
@@ -1150,8 +1222,7 @@ insert:
ret = insert_one_name(trans, root, path, key->objectid, key->offset,
name, name_len, log_type, &log_key);
- if (ret && ret != -ENOENT)
- BUG();
+ BUG_ON(ret && ret != -ENOENT);
goto out;
}
@@ -1313,11 +1384,11 @@ again:
read_extent_buffer(eb, name, (unsigned long)(di + 1),
name_len);
log_di = NULL;
- if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+ if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
log_di = btrfs_lookup_dir_item(trans, log, log_path,
dir_key->objectid,
name, name_len, 0);
- } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+ } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
log_di = btrfs_lookup_dir_index_item(trans, log,
log_path,
dir_key->objectid,
@@ -1378,7 +1449,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 dirid)
+ u64 dirid, int del_all)
{
u64 range_start;
u64 range_end;
@@ -1408,10 +1479,14 @@ again:
range_start = 0;
range_end = 0;
while (1) {
- ret = find_dir_range(log, path, dirid, key_type,
- &range_start, &range_end);
- if (ret != 0)
- break;
+ if (del_all)
+ range_end = (u64)-1;
+ else {
+ ret = find_dir_range(log, path, dirid, key_type,
+ &range_start, &range_end);
+ if (ret != 0)
+ break;
+ }
dir_key.offset = range_start;
while (1) {
@@ -1437,7 +1512,8 @@ again:
break;
ret = check_item_in_log(trans, root, log, path,
- log_path, dir, &found_key);
+ log_path, dir,
+ &found_key);
BUG_ON(ret);
if (found_key.offset == (u64)-1)
break;
@@ -1514,7 +1590,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
ret = replay_dir_deletes(wc->trans,
- root, log, path, key.objectid);
+ root, log, path, key.objectid, 0);
BUG_ON(ret);
}
ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1609,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
root, inode, inode->i_size,
BTRFS_EXTENT_DATA_KEY);
BUG_ON(ret);
+
+ /* if the nlink count is zero here, the iput
+ * will free the inode. We bump it to make
+ * sure it doesn't get freed until the link
+ * count fixup is done
+ */
+ if (inode->i_nlink == 0) {
+ btrfs_inc_nlink(inode);
+ btrfs_update_inode(wc->trans,
+ root, inode);
+ }
iput(inode);
}
ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1927,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
+static int wait_log_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -1854,9 +1942,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
prepare_to_wait(&root->log_commit_wait[index],
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->log_transid < transid + 2 &&
+
+ if (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && root->log_transid < transid + 2 &&
atomic_read(&root->log_commit[index]))
schedule();
+
finish_wait(&root->log_commit_wait[index], &wait);
mutex_lock(&root->log_mutex);
} while (root->log_transid < transid + 2 &&
@@ -1864,14 +1955,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
return 0;
}
-static int wait_for_writer(struct btrfs_root *root)
+static int wait_for_writer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
DEFINE_WAIT(wait);
while (atomic_read(&root->log_writers)) {
prepare_to_wait(&root->log_writer_wait,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (atomic_read(&root->log_writers))
+ if (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && atomic_read(&root->log_writers))
schedule();
mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1975,14 @@ static int wait_for_writer(struct btrfs_root *root)
/*
* btrfs_sync_log does sends a given tree log down to the disk and
* updates the super blocks to record it. When this call is done,
- * you know that any inodes previously logged are safely on disk
+ * you know that any inodes previously logged are safely on disk only
+ * if it returns 0.
+ *
+ * Any other return value means you need to call btrfs_commit_transaction.
+ * Some of the edge cases for fsyncing directories that have had unlinks
+ * or renames done in the past mean that sometimes the only safe
+ * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
+ * that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
@@ -1896,7 +1996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
index1 = root->log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(root, root->log_transid);
+ wait_log_commit(trans, root, root->log_transid);
mutex_unlock(&root->log_mutex);
return 0;
}
@@ -1904,18 +2004,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(root, root->log_transid - 1);
+ wait_log_commit(trans, root, root->log_transid - 1);
while (1) {
unsigned long batch = root->log_batch;
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
- wait_for_writer(root);
+
+ wait_for_writer(trans, root);
if (batch == root->log_batch)
break;
}
+ /* bail out if we need to do a full commit */
+ if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ ret = -EAGAIN;
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
+
ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
BUG_ON(ret);
@@ -1951,16 +2059,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
- wait_log_commit(log_root_tree, log_root_tree->log_transid);
+ wait_log_commit(trans, log_root_tree,
+ log_root_tree->log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
atomic_set(&log_root_tree->log_commit[index2], 1);
- if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
- wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
+ if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
+ wait_log_commit(trans, log_root_tree,
+ log_root_tree->log_transid - 1);
+ }
+
+ wait_for_writer(trans, log_root_tree);
- wait_for_writer(log_root_tree);
+ /*
+ * now that we've moved on to the tree of log tree roots,
+ * check the full commit flag again
+ */
+ if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = -EAGAIN;
+ goto out_wake_log_root;
+ }
ret = btrfs_write_and_wait_marked_extents(log_root_tree,
&log_root_tree->dirty_log_pages);
@@ -1985,7 +2106,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* in and cause problems either.
*/
write_ctree_super(trans, root->fs_info->tree_root, 2);
+ ret = 0;
+out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
smp_mb();
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2121,8 @@ out:
return 0;
}
-/* * free all the extents used by the tree log. This should be called
+/*
+ * free all the extents used by the tree log. This should be called
* at commit time of the full transaction
*/
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2256,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
mutex_unlock(&BTRFS_I(dir)->log_mutex);
- end_log_trans(root);
+ btrfs_end_log_trans(root);
return 0;
}
@@ -2159,7 +2283,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
- end_log_trans(root);
+ btrfs_end_log_trans(root);
return ret;
}
@@ -2559,7 +2683,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
*
* This handles both files and directories.
*/
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
int inode_only)
{
@@ -2585,28 +2709,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
min_key.offset = 0;
max_key.objectid = inode->i_ino;
+
+ /* today the code can only do partial logging of directories */
+ if (!S_ISDIR(inode->i_mode))
+ inode_only = LOG_INODE_ALL;
+
if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
max_key.type = BTRFS_XATTR_ITEM_KEY;
else
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
- /*
- * if this inode has already been logged and we're in inode_only
- * mode, we don't want to delete the things that have already
- * been written to the log.
- *
- * But, if the inode has been through an inode_only log,
- * the logged_trans field is not set. This allows us to catch
- * any new names for this inode in the backrefs by logging it
- * again
- */
- if (inode_only == LOG_INODE_EXISTS &&
- BTRFS_I(inode)->logged_trans == trans->transid) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- goto out;
- }
mutex_lock(&BTRFS_I(inode)->log_mutex);
/*
@@ -2693,7 +2806,6 @@ next_slot:
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
btrfs_release_path(root, path);
btrfs_release_path(log, dst_path);
- BTRFS_I(inode)->log_dirty_trans = 0;
ret = log_directory_changes(trans, root, inode, path, dst_path);
BUG_ON(ret);
}
@@ -2702,19 +2814,69 @@ next_slot:
btrfs_free_path(path);
btrfs_free_path(dst_path);
-out:
return 0;
}
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only)
+/*
+ * follow the dentry parent pointers up the chain and see if any
+ * of the directories in it require a full commit before they can
+ * be logged. Returns zero if nothing special needs to be done or 1 if
+ * a full commit is required.
+ */
+static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct dentry *parent,
+ struct super_block *sb,
+ u64 last_committed)
{
- int ret;
+ int ret = 0;
+ struct btrfs_root *root;
- start_log_trans(trans, root);
- ret = __btrfs_log_inode(trans, root, inode, inode_only);
- end_log_trans(root);
+ /*
+ * for regular files, if its inode is already on disk, we don't
+ * have to worry about the parents at all. This is because
+ * we can use the last_unlink_trans field to record renames
+ * and other fun in this file.
+ */
+ if (S_ISREG(inode->i_mode) &&
+ BTRFS_I(inode)->generation <= last_committed &&
+ BTRFS_I(inode)->last_unlink_trans <= last_committed)
+ goto out;
+
+ if (!S_ISDIR(inode->i_mode)) {
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ goto out;
+ inode = parent->d_inode;
+ }
+
+ while (1) {
+ BTRFS_I(inode)->logged_trans = trans->transid;
+ smp_mb();
+
+ if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+ root = BTRFS_I(inode)->root;
+
+ /*
+ * make sure any commits to the log are forced
+ * to be full commits
+ */
+ root->fs_info->last_trans_log_full_commit =
+ trans->transid;
+ ret = 1;
+ break;
+ }
+
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ break;
+
+ if (parent == sb->s_root)
+ break;
+
+ parent = parent->d_parent;
+ inode = parent->d_inode;
+
+ }
+out:
return ret;
}
@@ -2724,31 +2886,70 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
* only logging is done of any parent directories that are older than
* the last committed transaction
*/
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry)
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct dentry *parent, int exists_only)
{
- int inode_only = LOG_INODE_ALL;
+ int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
- int ret;
+ int ret = 0;
+ u64 last_committed = root->fs_info->last_trans_committed;
+
+ sb = inode->i_sb;
+
+ if (btrfs_test_opt(root, NOTREELOG)) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
+ if (root->fs_info->last_trans_log_full_commit >
+ root->fs_info->last_trans_committed) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
+ ret = check_parent_dirs_for_sync(trans, inode, parent,
+ sb, last_committed);
+ if (ret)
+ goto end_no_trans;
start_log_trans(trans, root);
- sb = dentry->d_inode->i_sb;
- while (1) {
- ret = __btrfs_log_inode(trans, root, dentry->d_inode,
- inode_only);
- BUG_ON(ret);
- inode_only = LOG_INODE_EXISTS;
- dentry = dentry->d_parent;
- if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
+ ret = btrfs_log_inode(trans, root, inode, inode_only);
+ BUG_ON(ret);
+
+ /*
+ * for regular files, if its inode is already on disk, we don't
+ * have to worry about the parents at all. This is because
+ * we can use the last_unlink_trans field to record renames
+ * and other fun in this file.
+ */
+ if (S_ISREG(inode->i_mode) &&
+ BTRFS_I(inode)->generation <= last_committed &&
+ BTRFS_I(inode)->last_unlink_trans <= last_committed)
+ goto no_parent;
+
+ inode_only = LOG_INODE_EXISTS;
+ while (1) {
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
- if (BTRFS_I(dentry->d_inode)->generation <=
- root->fs_info->last_trans_committed)
+ inode = parent->d_inode;
+ if (BTRFS_I(inode)->generation >
+ root->fs_info->last_trans_committed) {
+ ret = btrfs_log_inode(trans, root, inode, inode_only);
+ BUG_ON(ret);
+ }
+ if (parent == sb->s_root)
break;
+
+ parent = parent->d_parent;
}
- end_log_trans(root);
- return 0;
+no_parent:
+ ret = 0;
+ btrfs_end_log_trans(root);
+end_no_trans:
+ return ret;
}
/*
@@ -2760,12 +2961,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry)
{
- u64 gen;
- gen = root->fs_info->last_trans_new_blockgroup;
- if (gen > root->fs_info->last_trans_committed)
- return 1;
- else
- return btrfs_log_dentry(trans, root, dentry);
+ return btrfs_log_inode_parent(trans, root, dentry->d_inode,
+ dentry->d_parent, 0);
}
/*
@@ -2884,3 +3081,94 @@ again:
kfree(log_root_tree);
return 0;
}
+
+/*
+ * there are some corner cases where we want to force a full
+ * commit instead of allowing a directory to be logged.
+ *
+ * They revolve around files there were unlinked from the directory, and
+ * this function updates the parent directory so that a full commit is
+ * properly done if it is fsync'd later after the unlinks are done.
+ */
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct inode *inode,
+ int for_rename)
+{
+ /*
+ * when we're logging a file, if it hasn't been renamed
+ * or unlinked, and its inode is fully committed on disk,
+ * we don't have to worry about walking up the directory chain
+ * to log its parents.
+ *
+ * So, we use the last_unlink_trans field to put this transid
+ * into the file. When the file is logged we check it and
+ * don't log the parents if the file is fully on disk.
+ */
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+ /*
+ * if this directory was already logged any new
+ * names for this file/dir will get recorded
+ */
+ smp_mb();
+ if (BTRFS_I(dir)->logged_trans == trans->transid)
+ return;
+
+ /*
+ * if the inode we're about to unlink was logged,
+ * the log will be properly updated for any new names
+ */
+ if (BTRFS_I(inode)->logged_trans == trans->transid)
+ return;
+
+ /*
+ * when renaming files across directories, if the directory
+ * there we're unlinking from gets fsync'd later on, there's
+ * no way to find the destination directory later and fsync it
+ * properly. So, we have to be conservative and force commits
+ * so the new name gets discovered.
+ */
+ if (for_rename)
+ goto record;
+
+ /* we can safely do the unlink without any special recording */
+ return;
+
+record:
+ BTRFS_I(dir)->last_unlink_trans = trans->transid;
+}
+
+/*
+ * Call this after adding a new name for a file and it will properly
+ * update the log to reflect the new name.
+ *
+ * It will return zero if all goes well, and it will return 1 if a
+ * full transaction commit is required.
+ */
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *old_dir,
+ struct dentry *parent)
+{
+ struct btrfs_root * root = BTRFS_I(inode)->root;
+
+ /*
+ * this will force the logging code to walk the dentry chain
+ * up for the file
+ */
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+ /*
+ * if this inode hasn't been logged and directory we're renaming it
+ * from hasn't been logged, we don't need to log it
+ */
+ if (BTRFS_I(inode)->logged_trans <=
+ root->fs_info->last_trans_committed &&
+ (!old_dir || BTRFS_I(old_dir)->logged_trans <=
+ root->fs_info->last_trans_committed))
+ return 0;
+
+ return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+}
+
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed02..d09c7609e16b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry);
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct inode *inode, u64 dirid);
+int btrfs_join_running_log_trans(struct btrfs_root *root);
+int btrfs_end_log_trans(struct btrfs_root *root);
+int btrfs_pin_log_trans(struct btrfs_root *root);
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct dentry *parent, int exists_only);
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct inode *inode,
+ int for_rename);
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *old_dir,
+ struct dentry *parent);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd06e18e5aac..5f01dad4b696 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,6 +20,7 @@
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
+#include <linux/iocontext.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
@@ -124,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
return NULL;
}
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+ struct bio *head, struct bio *tail)
+{
+
+ struct bio *old_head;
+
+ old_head = pending_bios->head;
+ pending_bios->head = head;
+ if (pending_bios->tail)
+ tail->bi_next = old_head;
+ else
+ pending_bios->tail = tail;
+}
+
/*
* we try to collect pending bios for a device so we don't get a large
* number of procs sending bios down to the same device. This greatly
@@ -140,31 +155,44 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
struct bio *pending;
struct backing_dev_info *bdi;
struct btrfs_fs_info *fs_info;
+ struct btrfs_pending_bios *pending_bios;
struct bio *tail;
struct bio *cur;
int again = 0;
- unsigned long num_run = 0;
+ unsigned long num_run;
+ unsigned long num_sync_run;
unsigned long limit;
+ unsigned long last_waited = 0;
- bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+ bdi = blk_get_backing_dev_info(device->bdev);
fs_info = device->dev_root->fs_info;
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
+ /* we want to make sure that every time we switch from the sync
+ * list to the normal list, we unplug
+ */
+ num_sync_run = 0;
+
loop:
spin_lock(&device->io_lock);
+ num_run = 0;
loop_lock:
+
/* take all the bios off the list at once and process them
* later on (without the lock held). But, remember the
* tail and other pointers so the bios can be properly reinserted
* into the list if we hit congestion
*/
- pending = device->pending_bios;
- tail = device->pending_bio_tail;
+ if (device->pending_sync_bios.head)
+ pending_bios = &device->pending_sync_bios;
+ else
+ pending_bios = &device->pending_bios;
+
+ pending = pending_bios->head;
+ tail = pending_bios->tail;
WARN_ON(pending && !tail);
- device->pending_bios = NULL;
- device->pending_bio_tail = NULL;
/*
* if pending was null this time around, no bios need processing
@@ -174,16 +202,41 @@ loop_lock:
* device->running_pending is used to synchronize with the
* schedule_bio code.
*/
- if (pending) {
- again = 1;
- device->running_pending = 1;
- } else {
+ if (device->pending_sync_bios.head == NULL &&
+ device->pending_bios.head == NULL) {
again = 0;
device->running_pending = 0;
+ } else {
+ again = 1;
+ device->running_pending = 1;
}
+
+ pending_bios->head = NULL;
+ pending_bios->tail = NULL;
+
spin_unlock(&device->io_lock);
+ /*
+ * if we're doing the regular priority list, make sure we unplug
+ * for any high prio bios we've sent down
+ */
+ if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+ num_sync_run = 0;
+ blk_run_backing_dev(bdi, NULL);
+ }
+
while (pending) {
+
+ rmb();
+ if (pending_bios != &device->pending_sync_bios &&
+ device->pending_sync_bios.head &&
+ num_run > 16) {
+ cond_resched();
+ spin_lock(&device->io_lock);
+ requeue_list(pending_bios, pending, tail);
+ goto loop_lock;
+ }
+
cur = pending;
pending = pending->bi_next;
cur->bi_next = NULL;
@@ -194,10 +247,18 @@ loop_lock:
wake_up(&fs_info->async_submit_wait);
BUG_ON(atomic_read(&cur->bi_cnt) == 0);
- bio_get(cur);
submit_bio(cur->bi_rw, cur);
- bio_put(cur);
num_run++;
+ if (bio_sync(cur))
+ num_sync_run++;
+
+ if (need_resched()) {
+ if (num_sync_run) {
+ blk_run_backing_dev(bdi, NULL);
+ num_sync_run = 0;
+ }
+ cond_resched();
+ }
/*
* we made progress, there is more work to do and the bdi
@@ -206,17 +267,41 @@ loop_lock:
*/
if (pending && bdi_write_congested(bdi) && num_run > 16 &&
fs_info->fs_devices->open_devices > 1) {
- struct bio *old_head;
+ struct io_context *ioc;
- spin_lock(&device->io_lock);
-
- old_head = device->pending_bios;
- device->pending_bios = pending;
- if (device->pending_bio_tail)
- tail->bi_next = old_head;
- else
- device->pending_bio_tail = tail;
+ ioc = current->io_context;
+ /*
+ * the main goal here is that we don't want to
+ * block if we're going to be able to submit
+ * more requests without blocking.
+ *
+ * This code does two great things, it pokes into
+ * the elevator code from a filesystem _and_
+ * it makes assumptions about how batching works.
+ */
+ if (ioc && ioc->nr_batch_requests > 0 &&
+ time_before(jiffies, ioc->last_waited + HZ/50UL) &&
+ (last_waited == 0 ||
+ ioc->last_waited == last_waited)) {
+ /*
+ * we want to go through our batch of
+ * requests and stop. So, we copy out
+ * the ioc->last_waited time and test
+ * against it before looping
+ */
+ last_waited = ioc->last_waited;
+ if (need_resched()) {
+ if (num_sync_run) {
+ blk_run_backing_dev(bdi, NULL);
+ num_sync_run = 0;
+ }
+ cond_resched();
+ }
+ continue;
+ }
+ spin_lock(&device->io_lock);
+ requeue_list(pending_bios, pending, tail);
device->running_pending = 1;
spin_unlock(&device->io_lock);
@@ -224,13 +309,32 @@ loop_lock:
goto done;
}
}
+
+ if (num_sync_run) {
+ num_sync_run = 0;
+ blk_run_backing_dev(bdi, NULL);
+ }
+
+ cond_resched();
if (again)
goto loop;
spin_lock(&device->io_lock);
- if (device->pending_bios)
+ if (device->pending_bios.head || device->pending_sync_bios.head)
goto loop_lock;
spin_unlock(&device->io_lock);
+
+ /*
+ * IO has already been through a long path to get here. Checksumming,
+ * async helper threads, perhaps compression. We've done a pretty
+ * good job of collecting a batch of IO and should just unplug
+ * the device right away.
+ *
+ * This will help anyone who is waiting on the IO, they might have
+ * already unplugged, but managed to do so before the bio they
+ * cared about found its way down here.
+ */
+ blk_run_backing_dev(bdi, NULL);
done:
return 0;
}
@@ -1439,7 +1543,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
- btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+ btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
btrfs_mark_buffer_dirty(leaf);
@@ -1836,14 +1940,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
device->total_bytes = new_size;
if (device->writeable)
device->fs_devices->total_rw_bytes -= diff;
- ret = btrfs_update_device(trans, device);
- if (ret) {
- unlock_chunks(root);
- btrfs_end_transaction(trans, root);
- goto done;
- }
- WARN_ON(diff > old_total);
- btrfs_set_super_total_bytes(super_copy, old_total - diff);
unlock_chunks(root);
btrfs_end_transaction(trans, root);
@@ -1875,7 +1971,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size)
- goto done;
+ break;
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -1888,6 +1984,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
goto done;
}
+ /* Shrinking succeeded, else we would be at "done". */
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ lock_chunks(root);
+
+ device->disk_total_bytes = new_size;
+ /* Now btrfs_update_device() will change the on-disk size. */
+ ret = btrfs_update_device(trans, device);
+ if (ret) {
+ unlock_chunks(root);
+ btrfs_end_transaction(trans, root);
+ goto done;
+ }
+ WARN_ON(diff > old_total);
+ btrfs_set_super_total_bytes(super_copy, old_total - diff);
+ unlock_chunks(root);
+ btrfs_end_transaction(trans, root);
done:
btrfs_free_path(path);
return ret;
@@ -2458,7 +2574,7 @@ again:
max_errors = 1;
}
}
- if (multi_ret && rw == WRITE &&
+ if (multi_ret && (rw & (1 << BIO_RW)) &&
stripes_allocated < stripes_required) {
stripes_allocated = map->num_stripes;
free_extent_map(em);
@@ -2723,6 +2839,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
int rw, struct bio *bio)
{
int should_queue = 1;
+ struct btrfs_pending_bios *pending_bios;
/* don't bother with additional async steps for reads, right now */
if (!(rw & (1 << BIO_RW))) {
@@ -2744,13 +2861,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
bio->bi_rw |= rw;
spin_lock(&device->io_lock);
+ if (bio_sync(bio))
+ pending_bios = &device->pending_sync_bios;
+ else
+ pending_bios = &device->pending_bios;
- if (device->pending_bio_tail)
- device->pending_bio_tail->bi_next = bio;
+ if (pending_bios->tail)
+ pending_bios->tail->bi_next = bio;
- device->pending_bio_tail = bio;
- if (!device->pending_bios)
- device->pending_bios = bio;
+ pending_bios->tail = bio;
+ if (!pending_bios->head)
+ pending_bios->head = bio;
if (device->running_pending)
should_queue = 0;
@@ -2967,7 +3088,8 @@ static int fill_device_from_item(struct extent_buffer *leaf,
unsigned long ptr;
device->devid = btrfs_device_id(leaf, dev_item);
- device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+ device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+ device->total_bytes = device->disk_total_bytes;
device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
device->type = btrfs_device_type(leaf, dev_item);
device->io_align = btrfs_device_io_align(leaf, dev_item);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86c44e9ae110..5c3ff6d02fd7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,13 +23,22 @@
#include "async-thread.h"
struct buffer_head;
+struct btrfs_pending_bios {
+ struct bio *head;
+ struct bio *tail;
+};
+
struct btrfs_device {
struct list_head dev_list;
struct list_head dev_alloc_list;
struct btrfs_fs_devices *fs_devices;
struct btrfs_root *dev_root;
- struct bio *pending_bios;
- struct bio *pending_bio_tail;
+
+ /* regular prio bios */
+ struct btrfs_pending_bios pending_bios;
+ /* WRITE_SYNC bios */
+ struct btrfs_pending_bios pending_sync_bios;
+
int running_pending;
u64 generation;
@@ -52,6 +61,9 @@ struct btrfs_device {
/* size of the device */
u64 total_bytes;
+ /* size of the disk */
+ u64 disk_total_bytes;
+
/* bytes used */
u64 bytes_used;
@@ -76,7 +88,7 @@ struct btrfs_device {
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
- /* the device with this id has the most recent coyp of the super */
+ /* the device with this id has the most recent copy of the super */
u64 latest_devid;
u64 latest_trans;
u64 num_devices;