aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4
diff options
context:
space:
mode:
authorChunguang Xu <brookxu@tencent.com>2021-11-23 09:17:57 +0800
committerTheodore Ts'o <tytso@mit.edu>2022-01-10 13:25:55 -0500
commit8c80fb312d7abf8bcd66cca1d843a80318a2c522 (patch)
tree9409145f02c987e113c7d9b53447eb9b0d34e1a5 /fs/ext4
parentext4: replace snprintf in show functions with sysfs_emit (diff)
downloadlinux-dev-8c80fb312d7abf8bcd66cca1d843a80318a2c522.tar.xz
linux-dev-8c80fb312d7abf8bcd66cca1d843a80318a2c522.zip
ext4: fix a possible ABBA deadlock due to busy PA
We found on older kernel (3.10) that in the scenario of insufficient disk space, system may trigger an ABBA deadlock problem, it seems that this problem still exists in latest kernel, try to fix it here. The main process triggered by this problem is that task A occupies the PA and waits for the jbd2 transaction finish, the jbd2 transaction waits for the completion of task B's IO (plug_list), but task B waits for the release of PA by task A to finish discard, which indirectly forms an ABBA deadlock. The related calltrace is as follows: Task A vfs_write ext4_mb_new_blocks() ext4_mb_mark_diskspace_used() JBD2 jbd2_journal_get_write_access() -> jbd2_journal_commit_transaction() ->schedule() filemap_fdatawait() | | | Task B | | do_unlinkat() | | ext4_evict_inode() | | jbd2_journal_begin_ordered_truncate() | | filemap_fdatawrite_range() | | ext4_mb_new_blocks() | -ext4_mb_discard_group_preallocations() <----- Here, try to cancel ext4_mb_discard_group_preallocations() internal retry due to PA busy, and do a limited number of retries inside ext4_mb_discard_preallocations(), which can circumvent the above problems, but also has some advantages: 1. Since the PA is in a busy state, if other groups have free PAs, keeping the current PA may help to reduce fragmentation. 2. Continue to traverse forward instead of waiting for the current group PA to be released. In most scenarios, the PA discard time can be reduced. However, in the case of smaller free space, if only a few groups have space, then due to multiple traversals of the group, it may increase CPU overhead. But in contrast, I feel that the overall benefit is better than the cost. Signed-off-by: Chunguang Xu <brookxu@tencent.com> Reported-by: kernel test robot <lkp@intel.com> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/1637630277-23496-1-git-send-email-brookxu.cn@gmail.com Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@kernel.org
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/mballoc.c40
1 files changed, 18 insertions, 22 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 215b7068f548..3dd9b9e2f967 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4814,7 +4814,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
*/
static noinline_for_stack int
ext4_mb_discard_group_preallocations(struct super_block *sb,
- ext4_group_t group, int needed)
+ ext4_group_t group, int *busy)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct buffer_head *bitmap_bh = NULL;
@@ -4822,8 +4822,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
struct list_head list;
struct ext4_buddy e4b;
int err;
- int busy = 0;
- int free, free_total = 0;
+ int free = 0;
mb_debug(sb, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
@@ -4846,19 +4845,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
goto out_dbg;
}
- if (needed == 0)
- needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
-
INIT_LIST_HEAD(&list);
-repeat:
- free = 0;
ext4_lock_group(sb, group);
list_for_each_entry_safe(pa, tmp,
&grp->bb_prealloc_list, pa_group_list) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
spin_unlock(&pa->pa_lock);
- busy = 1;
+ *busy = 1;
continue;
}
if (pa->pa_deleted) {
@@ -4898,22 +4892,13 @@ repeat:
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
- free_total += free;
-
- /* if we still need more blocks and some PAs were used, try again */
- if (free_total < needed && busy) {
- ext4_unlock_group(sb, group);
- cond_resched();
- busy = 0;
- goto repeat;
- }
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
out_dbg:
mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
- free_total, group, grp->bb_free);
- return free_total;
+ free, group, grp->bb_free);
+ return free;
}
/*
@@ -5455,13 +5440,24 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
{
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int ret;
- int freed = 0;
+ int freed = 0, busy = 0;
+ int retry = 0;
trace_ext4_mb_discard_preallocations(sb, needed);
+
+ if (needed == 0)
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
+ repeat:
for (i = 0; i < ngroups && needed > 0; i++) {
- ret = ext4_mb_discard_group_preallocations(sb, i, needed);
+ ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
freed += ret;
needed -= ret;
+ cond_resched();
+ }
+
+ if (needed > 0 && busy && ++retry < 3) {
+ busy = 0;
+ goto repeat;
}
return freed;