From 4fbcdf6694544fd9d2aedbc1e73e52b90a4fcc20 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 20 May 2015 14:01:54 +0100 Subject: Btrfs: fix -ENOSPC when finishing block group creation While creating a block group, we often end up getting ENOSPC while updating the chunk tree, which leads to a transaction abortion that produces a trace like the following: [30670.116368] WARNING: CPU: 4 PID: 20735 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]() [30670.117777] BTRFS: Transaction aborted (error -28) (...) [30670.163567] Call Trace: [30670.163906] [] dump_stack+0x4f/0x7b [30670.164522] [] ? console_unlock+0x361/0x3ad [30670.165171] [] warn_slowpath_common+0xa1/0xbb [30670.166323] [] ? __btrfs_abort_transaction+0x52/0x106 [btrfs] [30670.167213] [] warn_slowpath_fmt+0x46/0x48 [30670.167862] [] __btrfs_abort_transaction+0x52/0x106 [btrfs] [30670.169116] [] btrfs_create_pending_block_groups+0x101/0x130 [btrfs] [30670.170593] [] __btrfs_end_transaction+0x84/0x366 [btrfs] [30670.171960] [] btrfs_end_transaction+0x10/0x12 [btrfs] [30670.174649] [] btrfs_check_data_free_space+0x11f/0x27c [btrfs] [30670.176092] [] btrfs_fallocate+0x7c8/0xb96 [btrfs] [30670.177218] [] ? __this_cpu_preempt_check+0x13/0x15 [30670.178622] [] vfs_fallocate+0x14c/0x1de [30670.179642] [] ? __fget_light+0x2d/0x4f [30670.180692] [] SyS_fallocate+0x47/0x62 [30670.186737] [] system_call_fastpath+0x12/0x17 [30670.187792] ---[ end trace 0373e6b491c4a8cc ]--- This is because we don't do proper space reservation for the chunk block reserve when we have multiple tasks allocating chunks in parallel. So block group creation has 2 phases, and the first phase essentially checks if there is enough space in the system space_info, allocating a new system chunk if there isn't, while the second phase updates the device, extent and chunk trees. 
However, because the updates to the chunk tree happen in the second phase, if we have N tasks, each with its own transaction handle, allocating new chunks in parallel and if there is only enough space in the system space_info to allocate M chunks, where M < N, none of the tasks ends up allocating a new system chunk in the first phase and N - M tasks will get -ENOSPC when attempting to update the chunk tree in phase 2 if they need to COW any nodes/leafs from the chunk tree. Fix this by doing proper reservation in the chunk block reserve. The issue could be reproduced by running fstests generic/038 in a loop, which eventually triggered the problem. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/transaction.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/transaction.h') diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0b24755596ba..036fa83d6ccb 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -102,6 +102,7 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 chunk_bytes_reserved; u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; -- cgit v1.2.3-59-g8ed1b From 9086db86e0b09c39abead4d747119695553e3978 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 20 Apr 2015 09:53:50 +0800 Subject: btrfs: qgroup: Add the ability to skip given qgroup for old/new_roots. This is used by later qgroup fix patches for snapshots. Snapshot accounting is currently done by btrfs_qgroup_inherit(), but the new extent-oriented quota mechanism will account extents from btrfs_copy_root() and other snapshot operations, causing wrong results. So add this ability to handle snapshot accounting. 
Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/delayed-ref.h | 8 ++++++++ fs/btrfs/qgroup.c | 8 ++++++++ fs/btrfs/transaction.c | 1 + fs/btrfs/transaction.h | 23 +++++++++++++++++++++++ 4 files changed, 40 insertions(+) (limited to 'fs/btrfs/transaction.h') diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 4016f963599e..13fb5e6090fe 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -175,6 +175,14 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; + + /* + * To make qgroup to skip given root. + * This is for snapshot, as btrfs_qgroup_inherit() will manually + * modify counters for snapshot and its source, so we should skip + * the snapshot in new_root/old_roots or it will get calculated twice + */ + u64 qgroup_to_skip; }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c5aa0d34940e..d5f1f033b7a0 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1394,9 +1394,11 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; struct rb_node *node; + u64 qgroup_to_skip; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; /* * No need to do lock, since this function will only be called in @@ -1410,6 +1412,8 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, &record->old_roots); if (ret < 0) break; + if (qgroup_to_skip) + ulist_del(record->old_roots, qgroup_to_skip, 0); node = rb_next(node); } return ret; @@ -1702,9 +1706,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; struct rb_node *node; + u64 qgroup_to_skip; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; while ((node = 
rb_first(&delayed_refs->dirty_extent_root))) { record = rb_entry(node, struct btrfs_qgroup_extent_record, node); @@ -1719,6 +1725,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record->bytenr, (u64)-1, &new_roots); if (ret < 0) goto cleanup; + if (qgroup_to_skip) + ulist_del(new_roots, qgroup_to_skip, 0); ret = btrfs_qgroup_account_extent(trans, fs_info, record->bytenr, record->num_bytes, record->old_roots, new_roots); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6f49715cc127..3e3793dcb4c2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -232,6 +232,7 @@ loop: cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; + cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 036fa83d6ccb..eb09c2067fa8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -154,6 +154,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, spin_unlock(&BTRFS_I(inode)->lock); } +/* + * Make qgroup codes to skip given qgroupid, means the old/new_roots for + * qgroup won't contain the qgroupid in it. 
+ */ +static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, + u64 qgroupid) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = qgroupid; +} + +static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(!delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = 0; +} + int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, -- cgit v1.2.3-59-g8ed1b