aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/backref.c1
-rw-r--r--fs/btrfs/ctree.h6
-rw-r--r--fs/btrfs/delayed-ref.c34
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/disk-io.c56
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c223
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/file.c36
-rw-r--r--fs/btrfs/inode-map.c3
-rw-r--r--fs/btrfs/inode.c83
-rw-r--r--fs/btrfs/ioctl.c14
-rw-r--r--fs/btrfs/qgroup.c62
-rw-r--r--fs/btrfs/qgroup.h36
-rw-r--r--fs/btrfs/relocation.c128
-rw-r--r--fs/btrfs/root-tree.c27
-rw-r--r--fs/btrfs/send.c181
-rw-r--r--fs/btrfs/super.c16
-rw-r--r--fs/btrfs/transaction.c7
-rw-r--r--fs/btrfs/tree-log.c107
-rw-r--r--fs/btrfs/tree-log.h5
-rw-r--r--fs/btrfs/volumes.c27
22 files changed, 798 insertions, 260 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2b88439c2ee8..455a6b2fd539 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -589,6 +589,7 @@ static void __merge_refs(struct list_head *head, int mode)
list_del(&ref2->list);
kmem_cache_free(btrfs_prelim_ref_cache, ref2);
+ cond_resched();
}
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2fe8f89091a3..33fe03551105 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -427,6 +427,7 @@ struct btrfs_space_info {
struct list_head ro_bgs;
struct list_head priority_tickets;
struct list_head tickets;
+ u64 tickets_id;
struct rw_semaphore groups_sem;
/* for block groups in our same type */
@@ -1028,6 +1029,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *qgroup_rescan_workers;
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
+ bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */
/* filesystem state */
unsigned long fs_state;
@@ -1079,6 +1081,8 @@ struct btrfs_fs_info {
struct list_head pinned_chunks;
int creating_free_space_tree;
+ /* Used to record internally whether fs has been frozen */
+ int fs_frozen;
};
struct btrfs_subvolume_writers {
@@ -2578,7 +2582,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner, u64 offset,
struct btrfs_key *ins);
-int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index b6d210e7a993..ac02e041464b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -541,7 +541,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_qgroup_extent_record *qexisting;
int count_mod = 1;
int must_insert_reserved = 0;
@@ -606,10 +605,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
- qexisting = btrfs_qgroup_insert_dirty_extent(fs_info,
- delayed_refs,
- qrecord);
- if (qexisting)
+ if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info,
+ delayed_refs, qrecord))
kfree(qrecord);
}
@@ -862,33 +859,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- u64 ref_root, u64 bytenr, u64 num_bytes)
-{
- struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_head *ref_head;
- int ret = 0;
-
- if (!fs_info->quota_enabled || !is_fstree(ref_root))
- return 0;
-
- delayed_refs = &trans->transaction->delayed_refs;
-
- spin_lock(&delayed_refs->lock);
- ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
- if (!ref_head) {
- ret = -ENOENT;
- goto out;
- }
- WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
- ref_head->qgroup_ref_root = ref_root;
- ref_head->qgroup_reserved = num_bytes;
-out:
- spin_unlock(&delayed_refs->lock);
- return ret;
-}
-
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 5fca9534a271..43f3629760e9 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -250,9 +250,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
u64 parent, u64 ref_root,
u64 owner, u64 offset, u64 reserved, int action,
struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- u64 ref_root, u64 bytenr, u64 num_bytes);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 59febfb8d04a..54bc8c7c6bcd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -559,8 +559,29 @@ static noinline int check_leaf(struct btrfs_root *root,
u32 nritems = btrfs_header_nritems(leaf);
int slot;
- if (nritems == 0)
+ if (nritems == 0) {
+ struct btrfs_root *check_root;
+
+ key.objectid = btrfs_header_owner(leaf);
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ check_root = btrfs_get_fs_root(root->fs_info, &key, false);
+ /*
+ * The only reason we also check NULL here is that during
+ * open_ctree() some roots has not yet been set up.
+ */
+ if (!IS_ERR_OR_NULL(check_root)) {
+ /* if leaf is the root, then it's fine */
+ if (leaf->start !=
+ btrfs_root_bytenr(&check_root->root_item)) {
+ CORRUPT("non-root leaf's nritems is 0",
+ leaf, root, 0);
+ return -EIO;
+ }
+ }
return 0;
+ }
/* Check the 0 item */
if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
@@ -612,6 +633,19 @@ static noinline int check_leaf(struct btrfs_root *root,
return 0;
}
+static int check_node(struct btrfs_root *root, struct extent_buffer *node)
+{
+ unsigned long nr = btrfs_header_nritems(node);
+
+ if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) {
+ btrfs_crit(root->fs_info,
+ "corrupt node: block %llu root %llu nritems %lu",
+ node->start, root->objectid, nr);
+ return -EIO;
+ }
+ return 0;
+}
+
static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
u64 phy_offset, struct page *page,
u64 start, u64 end, int mirror)
@@ -682,6 +716,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
ret = -EIO;
}
+ if (found_level > 0 && check_node(root, eb))
+ ret = -EIO;
+
if (!ret)
set_extent_buffer_uptodate(eb);
err:
@@ -1618,8 +1655,8 @@ fail:
return ret;
}
-static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_id)
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
{
struct btrfs_root *root;
@@ -2298,6 +2335,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
fs_info->qgroup_ulist = NULL;
+ fs_info->qgroup_rescan_running = false;
mutex_init(&fs_info->qgroup_rescan_lock);
}
@@ -2624,6 +2662,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
+ fs_info->fs_frozen = 0;
fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
@@ -3739,8 +3778,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
if (btrfs_root_refs(&root->root_item) == 0)
synchronize_srcu(&fs_info->subvol_srcu);
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
btrfs_free_log(NULL, root);
+ if (root->reloc_root) {
+ free_extent_buffer(root->reloc_root->node);
+ free_extent_buffer(root->reloc_root->commit_root);
+ btrfs_put_fs_root(root->reloc_root);
+ root->reloc_root = NULL;
+ }
+ }
if (root->free_ino_pinned)
__btrfs_remove_free_space_cache(root->free_ino_pinned);
@@ -3851,7 +3897,7 @@ void close_ctree(struct btrfs_root *root)
smp_mb();
/* wait for the qgroup rescan worker to stop */
- btrfs_qgroup_wait_for_completion(fs_info);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b3207a0e09f7..f19a982f5a4f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
struct btrfs_key *location);
int btrfs_init_fs_root(struct btrfs_root *root);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 61b494e8e604..665da8f66ff1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -60,21 +60,6 @@ enum {
CHUNK_ALLOC_FORCE = 2,
};
-/*
- * Control how reservations are dealt with.
- *
- * RESERVE_FREE - freeing a reservation.
- * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
- * ENOSPC accounting
- * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
- * bytes_may_use as the ENOSPC accounting is done elsewhere
- */
-enum {
- RESERVE_FREE = 0,
- RESERVE_ALLOC = 1,
- RESERVE_ALLOC_NO_ACCOUNT = 2,
-};
-
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
u64 num_bytes, int alloc);
@@ -104,9 +89,10 @@ static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
-static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve,
- int delalloc);
+static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 ram_bytes, u64 num_bytes, int delalloc);
+static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
@@ -3501,7 +3487,6 @@ again:
dcs = BTRFS_DC_SETUP;
else if (ret == -ENOSPC)
set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
- btrfs_free_reserved_data_space(inode, 0, num_pages);
out_put:
iput(inode);
@@ -4286,13 +4271,10 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
if (ret < 0)
return ret;
- /*
- * Use new btrfs_qgroup_reserve_data to reserve precious data space
- *
- * TODO: Find a good method to avoid reserve data space for NOCOW
- * range, but don't impact performance on quota disable case.
- */
+ /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, start, len);
+ if (ret)
+ btrfs_free_reserved_data_space_noquota(inode, start, len);
return ret;
}
@@ -4472,6 +4454,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
}
}
+/*
+ * If force is CHUNK_ALLOC_FORCE:
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+ * If force is NOT CHUNK_ALLOC_FORCE:
+ * - return 0 if it doesn't need to allocate a new chunk,
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+ */
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 flags, int force)
{
@@ -4882,7 +4873,7 @@ static int flush_space(struct btrfs_root *root,
btrfs_get_alloc_profile(root, 0),
CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans, root);
- if (ret == -ENOSPC)
+ if (ret > 0 || ret == -ENOSPC)
ret = 0;
break;
case COMMIT_TRANS:
@@ -4907,11 +4898,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
u64 expected;
u64 to_reclaim = 0;
- to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- if (can_overcommit(root, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL))
- return 0;
-
list_for_each_entry(ticket, &space_info->tickets, list)
to_reclaim += ticket->bytes;
list_for_each_entry(ticket, &space_info->priority_tickets, list)
@@ -4919,6 +4905,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
if (to_reclaim)
return to_reclaim;
+ to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
+ if (can_overcommit(root, space_info, to_reclaim,
+ BTRFS_RESERVE_FLUSH_ALL))
+ return 0;
+
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
@@ -4972,12 +4963,12 @@ static void wake_all_tickets(struct list_head *head)
*/
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
- struct reserve_ticket *last_ticket = NULL;
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 to_reclaim;
int flush_state;
int commit_cycles = 0;
+ u64 last_tickets_id;
fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
@@ -4990,8 +4981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
spin_unlock(&space_info->lock);
return;
}
- last_ticket = list_first_entry(&space_info->tickets,
- struct reserve_ticket, list);
+ last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
flush_state = FLUSH_DELAYED_ITEMS_NR;
@@ -5011,10 +5001,10 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
space_info);
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- if (last_ticket == ticket) {
+ if (last_tickets_id == space_info->tickets_id) {
flush_state++;
} else {
- last_ticket = ticket;
+ last_tickets_id = space_info->tickets_id;
flush_state = FLUSH_DELAYED_ITEMS_NR;
if (commit_cycles)
commit_cycles--;
@@ -5390,6 +5380,7 @@ again:
list_del_init(&ticket->list);
num_bytes -= ticket->bytes;
ticket->bytes = 0;
+ space_info->tickets_id++;
wake_up(&ticket->wait);
} else {
ticket->bytes -= num_bytes;
@@ -5432,6 +5423,7 @@ again:
num_bytes -= ticket->bytes;
space_info->bytes_may_use += ticket->bytes;
ticket->bytes = 0;
+ space_info->tickets_id++;
wake_up(&ticket->wait);
} else {
trace_btrfs_space_reservation(fs_info, "space_info",
@@ -6497,19 +6489,15 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
}
/**
- * btrfs_update_reserved_bytes - update the block_group and space info counters
+ * btrfs_add_reserved_bytes - update the block_group and space info counters
* @cache: The cache we are manipulating
+ * @ram_bytes: The number of bytes of file content, and will be same to
+ * @num_bytes except for the compress path.
* @num_bytes: The number of bytes in question
- * @reserve: One of the reservation enums
* @delalloc: The blocks are allocated for the delalloc write
*
- * This is called by the allocator when it reserves space, or by somebody who is
- * freeing space that was never actually used on disk. For example if you
- * reserve some space for a new leaf in transaction A and before transaction A
- * commits you free that leaf, you call this with reserve set to 0 in order to
- * clear the reservation.
- *
- * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
+ * This is called by the allocator when it reserves space. Metadata
+ * reservations should be called with RESERVE_ALLOC so we do the proper
* ENOSPC accounting. For data we handle the reservation through clearing the
* delalloc bits in the io_tree. We have to do this since we could end up
* allocating less disk space for the amount of data we have reserved in the
@@ -6519,44 +6507,63 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
* make the reservation and return -EAGAIN, otherwise this function always
* succeeds.
*/
-static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve, int delalloc)
+static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 ram_bytes, u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
- if (reserve != RESERVE_FREE) {
- if (cache->ro) {
- ret = -EAGAIN;
- } else {
- cache->reserved += num_bytes;
- space_info->bytes_reserved += num_bytes;
- if (reserve == RESERVE_ALLOC) {
- trace_btrfs_space_reservation(cache->fs_info,
- "space_info", space_info->flags,
- num_bytes, 0);
- space_info->bytes_may_use -= num_bytes;
- }
-
- if (delalloc)
- cache->delalloc_bytes += num_bytes;
- }
+ if (cache->ro) {
+ ret = -EAGAIN;
} else {
- if (cache->ro)
- space_info->bytes_readonly += num_bytes;
- cache->reserved -= num_bytes;
- space_info->bytes_reserved -= num_bytes;
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ trace_btrfs_space_reservation(cache->fs_info,
+ "space_info", space_info->flags,
+ ram_bytes, 0);
+ space_info->bytes_may_use -= ram_bytes;
if (delalloc)
- cache->delalloc_bytes -= num_bytes;
+ cache->delalloc_bytes += num_bytes;
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
return ret;
}
+/**
+ * btrfs_free_reserved_bytes - update the block_group and space info counters
+ * @cache: The cache we are manipulating
+ * @num_bytes: The number of bytes in question
+ * @delalloc: The blocks are allocated for the delalloc write
+ *
+ * This is called by somebody who is freeing space that was never actually used
+ * on disk. For example if you reserve some space for a new leaf in transaction
+ * A and before transaction A commits you free that leaf, you call this with
+ * reserve set to 0 in order to clear the reservation.
+ */
+
+static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int delalloc)
+{
+ struct btrfs_space_info *space_info = cache->space_info;
+ int ret = 0;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (cache->ro)
+ space_info->bytes_readonly += num_bytes;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+
+ if (delalloc)
+ cache->delalloc_bytes -= num_bytes;
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+ return ret;
+}
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -7191,7 +7198,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+ btrfs_free_reserved_bytes(cache, buf->len, 0);
btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
pin = 0;
@@ -7416,9 +7423,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
* the free space extent currently.
*/
static noinline int find_free_extent(struct btrfs_root *orig_root,
- u64 num_bytes, u64 empty_size,
- u64 hint_byte, struct btrfs_key *ins,
- u64 flags, int delalloc)
+ u64 ram_bytes, u64 num_bytes, u64 empty_size,
+ u64 hint_byte, struct btrfs_key *ins,
+ u64 flags, int delalloc)
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -7430,8 +7437,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
struct btrfs_space_info *space_info;
int loop = 0;
int index = __get_raid_index(flags);
- int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
- RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
@@ -7763,8 +7768,8 @@ checks:
search_start - offset);
BUG_ON(offset > search_start);
- ret = btrfs_update_reserved_bytes(block_group, num_bytes,
- alloc_type, delalloc);
+ ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
+ num_bytes, delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
@@ -7936,7 +7941,7 @@ again:
up_read(&info->groups_sem);
}
-int btrfs_reserve_extent(struct btrfs_root *root,
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc)
@@ -7948,8 +7953,8 @@ int btrfs_reserve_extent(struct btrfs_root *root,
flags = btrfs_get_alloc_profile(root, is_data);
again:
WARN_ON(num_bytes < root->sectorsize);
- ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
- flags, delalloc);
+ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
+ hint_byte, ins, flags, delalloc);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(root->fs_info,
ins->objectid);
@@ -7958,6 +7963,7 @@ again:
num_bytes = min(num_bytes >> 1, ins->offset);
num_bytes = round_down(num_bytes, root->sectorsize);
num_bytes = max(num_bytes, min_alloc_size);
+ ram_bytes = num_bytes;
if (num_bytes == min_alloc_size)
final_tried = true;
goto again;
@@ -7995,7 +8001,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
if (btrfs_test_opt(root->fs_info, DISCARD))
ret = btrfs_discard_extent(root, start, len, NULL);
btrfs_add_free_space(cache, start, len);
- btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
+ btrfs_free_reserved_bytes(cache, len, delalloc);
trace_btrfs_reserved_extent_free(root, start, len);
}
@@ -8208,6 +8214,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
/*
* Mixed block groups will exclude before processing the log so we only
@@ -8223,9 +8230,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
if (!block_group)
return -EINVAL;
- ret = btrfs_update_reserved_bytes(block_group, ins->offset,
- RESERVE_ALLOC_NO_ACCOUNT, 0);
- BUG_ON(ret); /* logic error */
+ space_info = block_group->space_info;
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ space_info->bytes_reserved += ins->offset;
+ block_group->reserved += ins->offset;
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
btrfs_put_block_group(block_group);
@@ -8368,7 +8380,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
- ret = btrfs_reserve_extent(root, blocksize, blocksize,
+ ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
empty_size, hint, &ins, 0, 0);
if (ret)
goto out_unuse;
@@ -8521,35 +8533,6 @@ reada:
wc->reada_slot = slot;
}
-/*
- * These may not be seen by the usual inc/dec ref code so we have to
- * add them here.
- */
-static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes)
-{
- struct btrfs_qgroup_extent_record *qrecord;
- struct btrfs_delayed_ref_root *delayed_refs;
-
- qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
- if (!qrecord)
- return -ENOMEM;
-
- qrecord->bytenr = bytenr;
- qrecord->num_bytes = num_bytes;
- qrecord->old_roots = NULL;
-
- delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
- if (btrfs_qgroup_insert_dirty_extent(trans->fs_info,
- delayed_refs, qrecord))
- kfree(qrecord);
- spin_unlock(&delayed_refs->lock);
-
- return 0;
-}
-
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
@@ -8583,7 +8566,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
- ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+ ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+ bytenr, num_bytes, GFP_NOFS);
if (ret)
return ret;
}
@@ -8732,8 +8716,9 @@ walk_down:
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
- ret = record_one_subtree_extent(trans, root, child_bytenr,
- root->nodesize);
+ ret = btrfs_qgroup_insert_dirty_extent(trans,
+ root->fs_info, child_bytenr,
+ root->nodesize, GFP_NOFS);
if (ret)
goto out;
}
@@ -9906,6 +9891,7 @@ static int find_first_block_group(struct btrfs_root *root,
} else {
ret = 0;
}
+ free_extent_map(em);
goto out;
}
path->slots[0]++;
@@ -9942,6 +9928,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
block_group->iref = 0;
block_group->inode = NULL;
spin_unlock(&block_group->lock);
+ ASSERT(block_group->io_ctl.inode == NULL);
iput(inode);
last = block_group->key.objectid + block_group->key.offset;
btrfs_put_block_group(block_group);
@@ -9999,6 +9986,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
free_excluded_extents(info->extent_root, block_group);
btrfs_remove_free_space_cache(block_group);
+ ASSERT(list_empty(&block_group->dirty_list));
+ ASSERT(list_empty(&block_group->io_list));
+ ASSERT(list_empty(&block_group->bg_list));
+ ASSERT(atomic_read(&block_group->count) == 1);
btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bc2729a7612d..28cd88fccc7e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,6 +20,7 @@
#define EXTENT_DAMAGED (1U << 14)
#define EXTENT_NORESERVE (1U << 15)
#define EXTENT_QGROUP_RESERVED (1U << 16)
+#define EXTENT_CLEAR_DATA_RESV (1U << 17)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9404121fd5f7..fea31a4a6e36 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2033,6 +2033,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * An ordered extent might have started before and completed
+ * already with io errors, in which case the inode was not
+ * updated and we end up here. So check the inode's mapping
+ * flags for any errors that might have happened while doing
+ * writeback of file data.
+ */
+ ret = btrfs_inode_check_errors(inode);
inode_unlock(inode);
goto out;
}
@@ -2062,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
trans->sync = true;
- btrfs_init_log_ctx(&ctx);
+ btrfs_init_log_ctx(&ctx, inode);
ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
if (ret < 0) {
@@ -2667,6 +2675,7 @@ static long btrfs_fallocate(struct file *file, int mode,
alloc_start = round_down(offset, blocksize);
alloc_end = round_up(offset + len, blocksize);
+ cur_offset = alloc_start;
/* Make sure we aren't being give some crap mode */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2759,7 +2768,6 @@ static long btrfs_fallocate(struct file *file, int mode,
/* First, check if we exceed the qgroup limit */
INIT_LIST_HEAD(&reserve_list);
- cur_offset = alloc_start;
while (1) {
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
@@ -2786,6 +2794,14 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte - cur_offset);
if (ret < 0)
break;
+ } else {
+ /*
+ * Do not need to reserve unwritten extent for this
+ * range, free reserved data space first, otherwise
+ * it'll result in false ENOSPC error.
+ */
+ btrfs_free_reserved_data_space(inode, cur_offset,
+ last_byte - cur_offset);
}
free_extent_map(em);
cur_offset = last_byte;
@@ -2803,6 +2819,9 @@ static long btrfs_fallocate(struct file *file, int mode,
range->start,
range->len, 1 << inode->i_blkbits,
offset + len, &alloc_hint);
+ else
+ btrfs_free_reserved_data_space(inode, range->start,
+ range->len);
list_del(&range->list);
kfree(range);
}
@@ -2837,18 +2856,11 @@ out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_KERNEL);
out:
- /*
- * As we waited the extent range, the data_rsv_map must be empty
- * in the range, as written data range will be released from it.
- * And for prealloacted extent, it will also be released when
- * its metadata is written.
- * So this is completely used as cleanup.
- */
- btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
inode_unlock(inode);
/* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, alloc_start,
- alloc_end - alloc_start);
+ if (ret != 0)
+ btrfs_free_reserved_data_space(inode, alloc_start,
+ alloc_end - cur_offset);
return ret;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index aa6fabaee72e..359ee861b5a4 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -495,10 +495,9 @@ again:
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_space(inode, 0, prealloc);
+ btrfs_delalloc_release_metadata(inode, prealloc);
goto out_put;
}
- btrfs_free_reserved_data_space(inode, 0, prealloc);
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2f5975954ccf..e6811c42e41e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -566,6 +566,8 @@ cont:
PAGE_SET_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
+ btrfs_free_reserved_data_space_noquota(inode, start,
+ end - start + 1);
goto free_pages_out;
}
}
@@ -742,7 +744,7 @@ retry:
lock_extent(io_tree, async_extent->start,
async_extent->start + async_extent->ram_size - 1);
- ret = btrfs_reserve_extent(root,
+ ret = btrfs_reserve_extent(root, async_extent->ram_size,
async_extent->compressed_size,
async_extent->compressed_size,
0, alloc_hint, &ins, 1, 1);
@@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode,
EXTENT_DEFRAG, PAGE_UNLOCK |
PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
PAGE_END_WRITEBACK);
-
+ btrfs_free_reserved_data_space_noquota(inode, start,
+ end - start + 1);
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
@@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode,
unsigned long op;
cur_alloc_size = disk_num_bytes;
- ret = btrfs_reserve_extent(root, cur_alloc_size,
+ ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
@@ -1489,8 +1492,10 @@ out_check:
extent_clear_unlock_delalloc(inode, cur_offset,
cur_offset + num_bytes - 1,
locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC, PAGE_UNLOCK |
- PAGE_SET_PRIVATE2);
+ EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_SET_PRIVATE2);
+
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
cur_offset = extent_end;
@@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode,
return;
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
- && do_list && !(state->state & EXTENT_NORESERVE))
+ && do_list && !(state->state & EXTENT_NORESERVE)
+ && (*bits & (EXTENT_DO_ACCOUNTING |
+ EXTENT_CLEAR_DATA_RESV)))
btrfs_free_reserved_data_space_noquota(inode,
state->start, len);
@@ -3435,10 +3442,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
ret = PTR_ERR_OR_ZERO(inode);
- if (ret && ret != -ESTALE)
+ if (ret && ret != -ENOENT)
goto out;
- if (ret == -ESTALE && root == root->fs_info->tree_root) {
+ if (ret == -ENOENT && root == root->fs_info->tree_root) {
struct btrfs_root *dead_root;
struct btrfs_fs_info *fs_info = root->fs_info;
int is_dead_root = 0;
@@ -3474,7 +3481,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* Inode is already gone but the orphan item is still there,
* kill the orphan item.
*/
- if (ret == -ESTALE) {
+ if (ret == -ENOENT) {
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -3633,7 +3640,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
/*
* read an inode from the btree into the in-memory inode
*/
-static void btrfs_read_locked_inode(struct inode *inode)
+static int btrfs_read_locked_inode(struct inode *inode)
{
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -3652,14 +3659,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
filled = true;
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ ret = -ENOMEM;
goto make_bad;
+ }
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
- if (ret)
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
goto make_bad;
+ }
leaf = path->nodes[0];
@@ -3812,11 +3824,12 @@ cache_acl:
}
btrfs_update_iflags(inode);
- return;
+ return 0;
make_bad:
btrfs_free_path(path);
make_bad_inode(inode);
+ return ret;
}
/*
@@ -4204,6 +4217,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
int err = 0;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
+ u64 last_unlink_trans;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
@@ -4226,11 +4240,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
+ last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
+
/* now the directory is empty */
err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
dentry->d_name.name, dentry->d_name.len);
- if (!err)
+ if (!err) {
btrfs_i_size_write(inode, 0);
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to
+ * its parent directory. This is to prevent an unrecoverable
+ * log tree in the case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ */
+ if (last_unlink_trans >= trans->transid)
+ BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
+ }
out:
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
@@ -5606,7 +5636,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
- btrfs_read_locked_inode(inode);
+ int ret;
+
+ ret = btrfs_read_locked_inode(inode);
if (!is_bad_inode(inode)) {
inode_tree_add(inode);
unlock_new_inode(inode);
@@ -5615,7 +5647,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
} else {
unlock_new_inode(inode);
iput(inode);
- inode = ERR_PTR(-ESTALE);
+ ASSERT(ret < 0);
+ inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
}
}
@@ -7225,7 +7258,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
int ret;
alloc_hint = get_extent_allocation_hint(inode, start, len);
- ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
+ ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
alloc_hint, &ins, 1, 1);
if (ret)
return ERR_PTR(ret);
@@ -7725,6 +7758,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
ret = PTR_ERR(em2);
goto unlock_err;
}
+ /*
+ * For inode marked NODATACOW or extent marked PREALLOC,
+ * use the existing or preallocated extent, so does not
+ * need to adjust btrfs_space_info's bytes_may_use.
+ */
+ btrfs_free_reserved_data_space_noquota(inode,
+ start, len);
goto unlock;
}
}
@@ -7759,7 +7799,6 @@ unlock:
i_size_write(inode, start + len);
adjust_dio_outstanding_extents(inode, dio_data, len);
- btrfs_free_reserved_data_space(inode, start, len);
WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
dio_data->unsubmitted_oe_range_end = start + len;
@@ -10280,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 last_alloc = (u64)-1;
int ret = 0;
bool own_trans = true;
+ u64 end = start + num_bytes - 1;
if (trans)
own_trans = false;
@@ -10301,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
* sized chunks.
*/
cur_bytes = min(cur_bytes, last_alloc);
- ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
- *alloc_hint, &ins, 1, 0);
+ ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
+ min_size, 0, *alloc_hint, &ins, 1, 0);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -10388,6 +10428,9 @@ next:
if (own_trans)
btrfs_end_transaction(trans, root);
}
+ if (cur_offset < end)
+ btrfs_free_reserved_data_space(inode, cur_offset,
+ end - cur_offset + 1);
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 14ed1e9e6bc8..7fd939bfbd99 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1634,6 +1634,9 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
int namelen;
int ret = 0;
+ if (!S_ISDIR(file_inode(file)->i_mode))
+ return -ENOTDIR;
+
ret = mnt_want_write_file(file);
if (ret)
goto out;
@@ -1691,6 +1694,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
struct btrfs_ioctl_vol_args *vol_args;
int ret;
+ if (!S_ISDIR(file_inode(file)->i_mode))
+ return -ENOTDIR;
+
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
@@ -1714,6 +1720,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
bool readonly = false;
struct btrfs_qgroup_inherit *inherit = NULL;
+ if (!S_ISDIR(file_inode(file)->i_mode))
+ return -ENOTDIR;
+
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
@@ -2357,6 +2366,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
int ret;
int err = 0;
+ if (!S_ISDIR(dir->i_mode))
+ return -ENOTDIR;
+
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
@@ -5084,7 +5096,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- return btrfs_qgroup_wait_for_completion(root->fs_info);
+ return btrfs_qgroup_wait_for_completion(root->fs_info, true);
}
static long _btrfs_ioctl_set_received_subvol(struct file *file,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 93ee1c18ef9d..8db2e29fdcf4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
goto out;
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
- btrfs_qgroup_wait_for_completion(fs_info);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
@@ -1453,10 +1453,9 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
return ret;
}
-struct btrfs_qgroup_extent_record *
-btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record)
+int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record)
{
struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
struct rb_node *parent_node = NULL;
@@ -1475,12 +1474,42 @@ btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
else if (bytenr > entry->bytenr)
p = &(*p)->rb_right;
else
- return entry;
+ return 1;
}
rb_link_node(&record->node, parent_node, p);
rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
- return NULL;
+ return 0;
+}
+
+int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
+ gfp_t gfp_flag)
+{
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ if (!fs_info->quota_enabled || bytenr == 0 || num_bytes == 0)
+ return 0;
+ if (WARN_ON(trans == NULL))
+ return -EINVAL;
+ record = kmalloc(sizeof(*record), gfp_flag);
+ if (!record)
+ return -ENOMEM;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ record->bytenr = bytenr;
+ record->num_bytes = num_bytes;
+ record->old_roots = NULL;
+
+ spin_lock(&delayed_refs->lock);
+ ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs,
+ record);
+ spin_unlock(&delayed_refs->lock);
+ if (ret > 0)
+ kfree(record);
+ return 0;
}
#define UPDATE_NEW 0
@@ -2303,6 +2332,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
int err = -ENOMEM;
int ret = 0;
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = true;
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
path = btrfs_alloc_path();
if (!path)
goto out;
@@ -2369,6 +2402,9 @@ out:
}
done:
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = false;
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
complete_all(&fs_info->qgroup_rescan_completion);
}
@@ -2487,20 +2523,26 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
+ bool interruptible)
{
int running;
int ret = 0;
mutex_lock(&fs_info->qgroup_rescan_lock);
spin_lock(&fs_info->qgroup_lock);
- running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ running = fs_info->qgroup_rescan_running;
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- if (running)
+ if (!running)
+ return 0;
+
+ if (interruptible)
ret = wait_for_completion_interruptible(
&fs_info->qgroup_rescan_completion);
+ else
+ wait_for_completion(&fs_info->qgroup_rescan_completion);
return ret;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 710887c06aaf..1bc64c864b62 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
+ bool interruptible);
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
@@ -63,10 +64,35 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-struct btrfs_qgroup_extent_record *
-btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record);
+/*
+ * Insert one dirty extent record into @delayed_refs, informing qgroup to
+ * account that extent at commit trans time.
+ *
+ * No lock version, caller must acquire delayed ref lock and allocate memory.
+ *
+ * Return 0 for success insert
+ * Return >0 for existing record, caller can free @record safely.
+ * Error is not possible
+ */
+int btrfs_qgroup_insert_dirty_extent_nolock(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record);
+
+/*
+ * Insert one dirty extent record into @delayed_refs, informing qgroup to
+ * account that extent at commit trans time.
+ *
+ * Better encapsulated version.
+ *
+ * Return 0 if the operation is done.
+ * Return <0 for error, like memory allocation failure or invalid parameter
+ * (NULL trans)
+ */
+int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
+ gfp_t gfp_flag);
+
int
btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b26a5aea41b4..c0c13dc6fe12 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -31,6 +31,7 @@
#include "async-thread.h"
#include "free-space-cache.h"
#include "inode-map.h"
+#include "qgroup.h"
/*
* backref_node, mapping_node and tree_block start with this
@@ -3037,15 +3038,19 @@ int prealloc_file_extent_cluster(struct inode *inode,
u64 num_bytes;
int nr = 0;
int ret = 0;
+ u64 prealloc_start = cluster->start - offset;
+ u64 prealloc_end = cluster->end - offset;
+ u64 cur_offset;
BUG_ON(cluster->start != cluster->boundary[0]);
inode_lock(inode);
- ret = btrfs_check_data_free_space(inode, cluster->start,
- cluster->end + 1 - cluster->start);
+ ret = btrfs_check_data_free_space(inode, prealloc_start,
+ prealloc_end + 1 - prealloc_start);
if (ret)
goto out;
+ cur_offset = prealloc_start;
while (nr < cluster->nr) {
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
@@ -3055,16 +3060,21 @@ int prealloc_file_extent_cluster(struct inode *inode,
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
num_bytes = end + 1 - start;
+ if (cur_offset < start)
+ btrfs_free_reserved_data_space(inode, cur_offset,
+ start - cur_offset);
ret = btrfs_prealloc_file_range(inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
+ cur_offset = end + 1;
unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
if (ret)
break;
nr++;
}
- btrfs_free_reserved_data_space(inode, cluster->start,
- cluster->end + 1 - cluster->start);
+ if (cur_offset < prealloc_end)
+ btrfs_free_reserved_data_space(inode, cur_offset,
+ prealloc_end + 1 - cur_offset);
out:
inode_unlock(inode);
return ret;
@@ -3916,6 +3926,90 @@ int prepare_to_relocate(struct reloc_control *rc)
return 0;
}
+/*
+ * Qgroup fixer for data chunk relocation.
+ * The data relocation is done in the following steps
+ * 1) Copy data extents into data reloc tree
+ * 2) Create tree reloc tree(special snapshot) for related subvolumes
+ * 3) Modify file extents in tree reloc tree
+ * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
+ *
+ * The problem is, data and tree reloc tree are not accounted to qgroup,
+ * and 4) will only info qgroup to track tree blocks change, not file extents
+ * in the tree blocks.
+ *
+ * The good news is, related data extents are all in data reloc tree, so we
+ * only need to info qgroup to track all file extents in data reloc tree
+ * before commit trans.
+ */
+static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct inode *inode = rc->data_inode;
+ struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ /*
+ * Only for stage where we update data pointers the qgroup fix is
+ * valid.
+ * For MOVING_DATA stage, we will miss the timing of swapping tree
+ * blocks, and won't fix it.
+ */
+ if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
+ while (1) {
+ struct btrfs_file_extent_item *fi;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid > btrfs_ino(inode))
+ break;
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ goto next;
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(path->nodes[0], fi) !=
+ BTRFS_FILE_EXTENT_REG)
+ goto next;
+ ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
+ btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
+ btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
+ GFP_NOFS);
+ if (ret < 0)
+ break;
+next:
+ ret = btrfs_next_item(data_reloc_root, path);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
struct rb_root blocks = RB_ROOT;
@@ -4102,10 +4196,18 @@ restart:
/* get rid of pinned extents */
trans = btrfs_join_transaction(rc->extent_root);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- else
- btrfs_commit_transaction(trans, rc->extent_root);
+ goto out_free;
+ }
+ ret = qgroup_fix_relocated_data_extents(trans, rc);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ if (!err)
+ err = ret;
+ goto out_free;
+ }
+ btrfs_commit_transaction(trans, rc->extent_root);
out_free:
btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
btrfs_free_path(path);
@@ -4468,10 +4570,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
unset_reloc_control(rc);
trans = btrfs_join_transaction(rc->extent_root);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- else
- err = btrfs_commit_transaction(trans, rc->extent_root);
+ goto out_free;
+ }
+ err = qgroup_fix_relocated_data_extents(trans, rc);
+ if (err < 0) {
+ btrfs_abort_transaction(trans, err);
+ goto out_free;
+ }
+ err = btrfs_commit_transaction(trans, rc->extent_root);
out_free:
kfree(rc);
out:
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7fd7e1830cfe..091296062456 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid = key.offset;
key.offset++;
+ /*
+ * The root might have been inserted already, as before we look
+ * for orphan roots, log replay might have happened, which
+ * triggers a transaction commit and qgroup accounting, which
+ * in turn reads and inserts fs roots while doing backref
+ * walking.
+ */
+ root = btrfs_lookup_fs_root(tree_root->fs_info,
+ root_key.objectid);
+ if (root) {
+ WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+ &root->state));
+ if (btrfs_root_refs(&root->root_item) == 0)
+ btrfs_add_dead_root(root);
+ continue;
+ }
+
root = btrfs_read_fs_root(tree_root, &root_key);
err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
@@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(root->fs_info, root);
- /*
- * The root might have been inserted already, as before we look
- * for orphan roots, log replay might have happened, which
- * triggers a transaction commit and qgroup accounting, which
- * in turn reads and inserts fs roots while doing backref
- * walking.
- */
- if (err == -EEXIST)
- err = 0;
if (err) {
+ BUG_ON(err == -EEXIST);
btrfs_free_fs_root(root);
break;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b71dd298385c..a87675ffd02b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -231,7 +231,6 @@ struct pending_dir_move {
u64 parent_ino;
u64 ino;
u64 gen;
- bool is_orphan;
struct list_head update_refs;
};
@@ -274,6 +273,39 @@ struct name_cache_entry {
char name[];
};
+static void inconsistent_snapshot_error(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result,
+ const char *what)
+{
+ const char *result_string;
+
+ switch (result) {
+ case BTRFS_COMPARE_TREE_NEW:
+ result_string = "new";
+ break;
+ case BTRFS_COMPARE_TREE_DELETED:
+ result_string = "deleted";
+ break;
+ case BTRFS_COMPARE_TREE_CHANGED:
+ result_string = "updated";
+ break;
+ case BTRFS_COMPARE_TREE_SAME:
+ ASSERT(0);
+ result_string = "unchanged";
+ break;
+ default:
+ ASSERT(0);
+ result_string = "unexpected";
+ }
+
+ btrfs_err(sctx->send_root->fs_info,
+ "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
+ result_string, what, sctx->cmp_key->objectid,
+ sctx->send_root->root_key.objectid,
+ (sctx->parent_root ?
+ sctx->parent_root->root_key.objectid : 0));
+}
+
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
static struct waiting_dir_move *
@@ -1861,7 +1893,8 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
* was already unlinked/moved, so we can safely assume that we will not
* overwrite anything at this point in time.
*/
- if (other_inode > sctx->send_progress) {
+ if (other_inode > sctx->send_progress ||
+ is_waiting_for_move(sctx, other_inode)) {
ret = get_inode_info(sctx->parent_root, other_inode, NULL,
who_gen, NULL, NULL, NULL, NULL);
if (ret < 0)
@@ -2502,6 +2535,8 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+ if (ret > 0)
+ ret = -ENOENT;
if (ret < 0)
goto out;
@@ -2947,6 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
}
if (loc.objectid > send_progress) {
+ struct orphan_dir_info *odi;
+
+ odi = get_orphan_dir_info(sctx, dir);
+ free_orphan_dir_info(sctx, odi);
ret = 0;
goto out;
}
@@ -3047,7 +3086,6 @@ static int add_pending_dir_move(struct send_ctx *sctx,
pm->parent_ino = parent_ino;
pm->ino = ino;
pm->gen = ino_gen;
- pm->is_orphan = is_orphan;
INIT_LIST_HEAD(&pm->list);
INIT_LIST_HEAD(&pm->update_refs);
RB_CLEAR_NODE(&pm->node);
@@ -3113,6 +3151,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
return NULL;
}
+static int path_loop(struct send_ctx *sctx, struct fs_path *name,
+ u64 ino, u64 gen, u64 *ancestor_ino)
+{
+ int ret = 0;
+ u64 parent_inode = 0;
+ u64 parent_gen = 0;
+ u64 start_ino = ino;
+
+ *ancestor_ino = 0;
+ while (ino != BTRFS_FIRST_FREE_OBJECTID) {
+ fs_path_reset(name);
+
+ if (is_waiting_for_rm(sctx, ino))
+ break;
+ if (is_waiting_for_move(sctx, ino)) {
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret < 0)
+ break;
+ if (parent_inode == start_ino) {
+ ret = 1;
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ break;
+ }
+ ino = parent_inode;
+ gen = parent_gen;
+ }
+ return ret;
+}
+
static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
{
struct fs_path *from_path = NULL;
@@ -3123,6 +3203,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
u64 parent_ino, parent_gen;
struct waiting_dir_move *dm = NULL;
u64 rmdir_ino = 0;
+ u64 ancestor;
+ bool is_orphan;
int ret;
name = fs_path_alloc();
@@ -3135,9 +3217,10 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
dm = get_waiting_dir_move(sctx, pm->ino);
ASSERT(dm);
rmdir_ino = dm->rmdir_ino;
+ is_orphan = dm->orphanized;
free_waiting_dir_move(sctx, dm);
- if (pm->is_orphan) {
+ if (is_orphan) {
ret = gen_unique_name(sctx, pm->ino,
pm->gen, from_path);
} else {
@@ -3155,6 +3238,24 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
goto out;
sctx->send_progress = sctx->cur_ino + 1;
+ ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ LIST_HEAD(deleted_refs);
+ ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
+ ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
+ &pm->update_refs, &deleted_refs,
+ is_orphan);
+ if (ret < 0)
+ goto out;
+ if (rmdir_ino) {
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ dm->rmdir_ino = rmdir_ino;
+ }
+ goto out;
+ }
fs_path_reset(name);
to_path = name;
name = NULL;
@@ -3174,7 +3275,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
/* already deleted */
goto finish;
}
- ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+ ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino);
if (ret < 0)
goto out;
if (!ret)
@@ -3204,8 +3305,18 @@ finish:
* and old parent(s).
*/
list_for_each_entry(cur, &pm->update_refs, list) {
- if (cur->dir == rmdir_ino)
+ /*
+ * The parent inode might have been deleted in the send snapshot
+ */
+ ret = get_inode_info(sctx->send_root, cur->dir, NULL,
+ NULL, NULL, NULL, NULL, NULL);
+ if (ret == -ENOENT) {
+ ret = 0;
continue;
+ }
+ if (ret < 0)
+ goto out;
+
ret = send_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
@@ -3325,6 +3436,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
u64 left_gen;
u64 right_gen;
int ret = 0;
+ struct waiting_dir_move *wdm;
if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
return 0;
@@ -3383,7 +3495,8 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
goto out;
}
- if (is_waiting_for_move(sctx, di_key.objectid)) {
+ wdm = get_waiting_dir_move(sctx, di_key.objectid);
+ if (wdm && !wdm->orphanized) {
ret = add_pending_dir_move(sctx,
sctx->cur_ino,
sctx->cur_inode_gen,
@@ -3470,7 +3583,8 @@ static int wait_for_parent_move(struct send_ctx *sctx,
ret = is_ancestor(sctx->parent_root,
sctx->cur_ino, sctx->cur_inode_gen,
ino, path_before);
- break;
+ if (ret)
+ break;
}
fs_path_reset(path_before);
@@ -3643,11 +3757,26 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
goto out;
if (ret) {
struct name_cache_entry *nce;
+ struct waiting_dir_move *wdm;
ret = orphanize_inode(sctx, ow_inode, ow_gen,
cur->full_path);
if (ret < 0)
goto out;
+
+ /*
+ * If ow_inode has its rename operation delayed
+ * make sure that its orphanized name is used in
+ * the source path when performing its rename
+ * operation.
+ */
+ if (is_waiting_for_move(sctx, ow_inode)) {
+ wdm = get_waiting_dir_move(sctx,
+ ow_inode);
+ ASSERT(wdm);
+ wdm->orphanized = true;
+ }
+
/*
* Make sure we clear our orphanized inode's
* name from the name cache. This is because the
@@ -3663,6 +3792,19 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
name_cache_delete(sctx, nce);
kfree(nce);
}
+
+ /*
+ * ow_inode might currently be an ancestor of
+ * cur_ino, therefore compute valid_path (the
+ * current path of cur_ino) again because it
+ * might contain the pre-orphanization name of
+ * ow_inode, which is no longer valid.
+ */
+ fs_path_reset(valid_path);
+ ret = get_cur_path(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen, valid_path);
+ if (ret < 0)
+ goto out;
} else {
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
@@ -4126,10 +4268,12 @@ static int process_all_refs(struct send_ctx *sctx,
}
btrfs_release_path(path);
+ /*
+ * We don't actually care about pending_move as we are simply
+ * re-creating this inode and will be rename'ing it into place once we
+ * rename the parent directory.
+ */
ret = process_recorded_refs(sctx, &pending_move);
- /* Only applicable to an incremental send. */
- ASSERT(pending_move == 0);
-
out:
btrfs_free_path(path);
return ret;
@@ -5602,7 +5746,10 @@ static int changed_ref(struct send_ctx *sctx,
{
int ret = 0;
- BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+ if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ inconsistent_snapshot_error(sctx, result, "reference");
+ return -EIO;
+ }
if (!sctx->cur_inode_new_gen &&
sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
@@ -5627,7 +5774,10 @@ static int changed_xattr(struct send_ctx *sctx,
{
int ret = 0;
- BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+ if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ inconsistent_snapshot_error(sctx, result, "xattr");
+ return -EIO;
+ }
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
if (result == BTRFS_COMPARE_TREE_NEW)
@@ -5651,7 +5801,10 @@ static int changed_extent(struct send_ctx *sctx,
{
int ret = 0;
- BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+ if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ inconsistent_snapshot_error(sctx, result, "extent");
+ return -EIO;
+ }
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
if (result != BTRFS_COMPARE_TREE_DELETED)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 864ce334f696..4071fe2bd098 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2241,6 +2241,13 @@ static int btrfs_freeze(struct super_block *sb)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+ root->fs_info->fs_frozen = 1;
+ /*
+ * We don't need a barrier here, we'll wait for any transaction that
+ * could be in progress on other threads (and do delayed iputs that
+ * we want to avoid on a frozen filesystem), or do the commit
+ * ourselves.
+ */
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
@@ -2251,6 +2258,14 @@ static int btrfs_freeze(struct super_block *sb)
return btrfs_commit_transaction(trans, root);
}
+static int btrfs_unfreeze(struct super_block *sb)
+{
+ struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+
+ root->fs_info->fs_frozen = 0;
+ return 0;
+}
+
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2299,6 +2314,7 @@ static const struct super_operations btrfs_super_ops = {
.statfs = btrfs_statfs,
.remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
+ .unfreeze_fs = btrfs_unfreeze,
};
static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9cca0a721961..95d41919d034 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2278,8 +2278,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ /*
+ * If fs has been frozen, we can not handle delayed iputs, otherwise
+ * it'll result in deadlock about SB_FREEZE_FS.
+ */
if (current != root->fs_info->transaction_kthread &&
- current != root->fs_info->cleaner_kthread)
+ current != root->fs_info->cleaner_kthread &&
+ !root->fs_info->fs_frozen)
btrfs_run_delayed_iputs(root);
return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d31a0c4f56be..ef9c55bc7907 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
#include "backref.h"
#include "hash.h"
#include "compression.h"
+#include "qgroup.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
offset = key->offset - btrfs_file_extent_offset(eb, item);
+ /*
+ * Manually record dirty extent, as here we did a shallow
+ * file extent item copy and skip normal backref update,
+ * but modifying extent tree all by ourselves.
+ * So need to manually record dirty extent for qgroup,
+ * as the owner of the file extent changed from log tree
+ * (doesn't affect qgroup) to fs/file tree(affects qgroup)
+ */
+ ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+ btrfs_file_extent_disk_bytenr(eb, item),
+ btrfs_file_extent_disk_num_bytes(eb, item),
+ GFP_NOFS);
+ if (ret < 0)
+ goto out;
+
if (ins.objectid > 0) {
u64 csum_start;
u64 csum_end;
@@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
- btrfs_init_log_ctx(&root_log_ctx);
+ btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
atomic_inc(&log_root_tree->log_batch);
@@ -2851,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
blk_finish_plug(&plug);
+ list_del_init(&root_log_ctx.list);
mutex_unlock(&log_root_tree->log_mutex);
ret = root_log_ctx.log_ret;
goto out;
@@ -4469,7 +4486,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
static int btrfs_check_ref_name_override(struct extent_buffer *eb,
const int slot,
const struct btrfs_key *key,
- struct inode *inode)
+ struct inode *inode,
+ u64 *other_ino)
{
int ret;
struct btrfs_path *search_path;
@@ -4528,7 +4546,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
search_path, parent,
name, this_name_len, 0);
if (di && !IS_ERR(di)) {
- ret = 1;
+ struct btrfs_key di_key;
+
+ btrfs_dir_item_key_to_cpu(search_path->nodes[0],
+ di, &di_key);
+ if (di_key.type == BTRFS_INODE_ITEM_KEY) {
+ ret = 1;
+ *other_ino = di_key.objectid;
+ } else {
+ ret = -EAGAIN;
+ }
goto out;
} else if (IS_ERR(di)) {
ret = PTR_ERR(di);
@@ -4722,16 +4749,72 @@ again:
if ((min_key.type == BTRFS_INODE_REF_KEY ||
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
BTRFS_I(inode)->generation == trans->transid) {
+ u64 other_ino = 0;
+
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0],
- &min_key, inode);
+ &min_key, inode,
+ &other_ino);
if (ret < 0) {
err = ret;
goto out_unlock;
- } else if (ret > 0) {
- err = 1;
- btrfs_set_log_full_commit(root->fs_info, trans);
- goto out_unlock;
+ } else if (ret > 0 && ctx &&
+ other_ino != btrfs_ino(ctx->inode)) {
+ struct btrfs_key inode_key;
+ struct inode *other_inode;
+
+ if (ins_nr > 0) {
+ ins_nr++;
+ } else {
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+ }
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, ins_start_slot,
+ ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+ ins_nr = 0;
+ btrfs_release_path(path);
+ inode_key.objectid = other_ino;
+ inode_key.type = BTRFS_INODE_ITEM_KEY;
+ inode_key.offset = 0;
+ other_inode = btrfs_iget(root->fs_info->sb,
+ &inode_key, root,
+ NULL);
+ /*
+ * If the other inode that had a conflicting dir
+ * entry was deleted in the current transaction,
+ * we don't need to do more work nor fallback to
+ * a transaction commit.
+ */
+ if (IS_ERR(other_inode) &&
+ PTR_ERR(other_inode) == -ENOENT) {
+ goto next_key;
+ } else if (IS_ERR(other_inode)) {
+ err = PTR_ERR(other_inode);
+ goto out_unlock;
+ }
+ /*
+ * We are safe logging the other inode without
+ * acquiring its i_mutex as long as we log with
+ * the LOG_INODE_EXISTS mode. We're safe against
+ * concurrent renames of the other inode as well
+ * because during a rename we pin the log and
+ * update the log with the new name before we
+ * unpin it.
+ */
+ err = btrfs_log_inode(trans, root, other_inode,
+ LOG_INODE_EXISTS,
+ 0, LLONG_MAX, ctx);
+ iput(other_inode);
+ if (err)
+ goto out_unlock;
+ else
+ goto next_key;
}
}
@@ -4799,7 +4882,7 @@ next_slot:
ins_nr = 0;
}
btrfs_release_path(path);
-
+next_key:
if (min_key.offset < (u64)-1) {
min_key.offset++;
} else if (min_key.type < max_key.type) {
@@ -4993,8 +5076,12 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
break;
- if (IS_ROOT(parent))
+ if (IS_ROOT(parent)) {
+ inode = d_inode(parent);
+ if (btrfs_must_commit_transaction(trans, inode))
+ ret = 1;
break;
+ }
parent = dget_parent(parent);
dput(old_parent);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index a9f1b75d080d..ab858e31ccbc 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -30,15 +30,18 @@ struct btrfs_log_ctx {
int log_transid;
int io_err;
bool log_new_dentries;
+ struct inode *inode;
struct list_head list;
};
-static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
+ struct inode *inode)
{
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->io_err = 0;
ctx->log_new_dentries = false;
+ ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 51f125508771..035efce603a9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -834,10 +834,6 @@ static void __free_device(struct work_struct *work)
struct btrfs_device *device;
device = container_of(work, struct btrfs_device, rcu_work);
-
- if (device->bdev)
- blkdev_put(device->bdev, device->mode);
-
rcu_string_free(device->name);
kfree(device);
}
@@ -852,6 +848,17 @@ static void free_device(struct rcu_head *head)
schedule_work(&device->rcu_work);
}
+static void btrfs_close_bdev(struct btrfs_device *device)
+{
+ if (device->bdev && device->writeable) {
+ sync_blockdev(device->bdev);
+ invalidate_bdev(device->bdev);
+ }
+
+ if (device->bdev)
+ blkdev_put(device->bdev, device->mode);
+}
+
static void btrfs_close_one_device(struct btrfs_device *device)
{
struct btrfs_fs_devices *fs_devices = device->fs_devices;
@@ -870,10 +877,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->missing)
fs_devices->missing_devices--;
- if (device->bdev && device->writeable) {
- sync_blockdev(device->bdev);
- invalidate_bdev(device->bdev);
- }
+ btrfs_close_bdev(device);
new_device = btrfs_alloc_device(NULL, &device->devid,
device->uuid);
@@ -1932,6 +1936,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
}
+ btrfs_close_bdev(device);
+
call_rcu(&device->rcu, free_device);
num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
@@ -2025,6 +2031,9 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
/* zero out the old super if it is writable */
btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
}
+
+ btrfs_close_bdev(srcdev);
+
call_rcu(&srcdev->rcu, free_device);
/*
@@ -2080,6 +2089,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
* the device_list_mutex lock.
*/
btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+
+ btrfs_close_bdev(tgtdev);
call_rcu(&tgtdev->rcu, free_device);
}