From ac0c18f2c693f0e7a44dbbb36b14d5141e5d20e5 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 28 Feb 2024 17:19:53 +0800 Subject: fs/writeback: avoid to writeback non-expired inode in kupdate writeback In kupdate writeback, only expired inodes (those that have been dirty for longer than dirty_expire_interval) are supposed to be written back. However, kupdate writeback will also write back non-expired inodes left in b_io or b_more_io from the last wb_writeback. As a result, writeback keeps being triggered unexpectedly when we keep dirtying pages, even though dirty memory is under the threshold and the inode is not expired. To be more specific: Assume the dirty background threshold is > 1G and dirty_expire_centisecs is > 60s. When we run fio -size=1G -invalidate=0 -ioengine=libaio --time_based -runtime=60... (keep dirtying), writeback will keep being triggered as follows: wb_workfn wb_do_writeback wb_check_background_flush /* * Wb dirty background threshold starts at 0 if device was idle and * grows up when bandwidth of wb is updated. So a background * writeback is triggered. */ wb_over_bg_thresh /* * Dirtied inode will be written back and added to b_more_io list * after slice used up (because we keep dirtying the inode). */ wb_writeback Writeback is then triggered every dirty_writeback_centisecs as follows: wb_workfn wb_do_writeback wb_check_old_data_flush /* * Write back inode left in b_io and b_more_io from last wb_writeback * even the inode is non-expired and it will be added to b_more_io * again as slice will be used up (because we keep dirtying the * inode) */ wb_writeback Fix this by moving non-expired inodes to the dirty list instead of the more io list for kupdate writeback in requeue_inode.
Test as follows: /* make it easier to observe the issue */ echo 300000 > /proc/sys/vm/dirty_expire_centisecs echo 100 > /proc/sys/vm/dirty_writeback_centisecs /* create an idle device */ mkfs.ext4 -F /dev/vdb mount /dev/vdb /bdi1/ /* run buffer write with fio */ fio -name test -filename=/bdi1/file -size=800M -ioengine=libaio -bs=4K \ -iodepth=1 -rw=write -direct=0 --time_based -runtime=60 -invalidate=0 Fio result before fix (run three tests): 1360MB/s 1329MB/s 1455MB/s Fio result after fix (run three tests): 1737MB/s 1729MB/s 1789MB/s Writeback for non-expired inodes is gone as expected. Observe this with the writeback_start and writeback_written trace events as follows: echo 1 > /sys/kernel/debug/tracing/events/writeback/writeback_start/enable echo 1 > /sys/kernel/debug/tracing/events/writeback/writeback_written/enable cat /sys/kernel/tracing/trace_pipe Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240228091958.288260-2-shikemeng@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e4f17c53ddfc..fe634f00f4d9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1561,7 +1561,8 @@ static void inode_sleep_on_writeback(struct inode *inode) * thread's back can have unexpected consequences. */ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, - struct writeback_control *wbc) + struct writeback_control *wbc, + unsigned long dirtied_before) { if (inode->i_state & I_FREEING) return; @@ -1594,7 +1595,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * We didn't write back all the pages. nfs_writepages() * sometimes bales out without doing anything. */ - if (wbc->nr_to_write <= 0) { + if (wbc->nr_to_write <= 0 && + !inode_dirtied_after(inode, dirtied_before)) { /* Slice used up. Queue for next turn.
*/ requeue_io(inode, wb); } else { @@ -1862,6 +1864,11 @@ static long writeback_sb_inodes(struct super_block *sb, unsigned long start_time = jiffies; long write_chunk; long total_wrote = 0; /* count both pages and inodes */ + unsigned long dirtied_before = jiffies; + + if (work->for_kupdate) + dirtied_before = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -1967,7 +1974,7 @@ static long writeback_sb_inodes(struct super_block *sb, spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) total_wrote++; - requeue_inode(inode, tmp_wb, &wbc); + requeue_inode(inode, tmp_wb, &wbc, dirtied_before); inode_sync_complete(inode); spin_unlock(&inode->i_lock); -- cgit v1.2.3-59-g8ed1b From d92109891f21cf367caa2cc6dff11a4411d917f4 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 28 Feb 2024 17:19:54 +0800 Subject: fs/writeback: bail out if there is no more inodes for IO and queued once In the case where there are no more inodes for IO in the io list from the last wb_writeback, we may bail out early even though there are inodes in the dirty list that should be written back. Only bail out once we have queued at least once, to avoid missing dirtied inodes. This is from code reading...
Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240228091958.288260-3-shikemeng@huaweicloud.com Reviewed-by: Jan Kara [brauner@kernel.org: fold in memory corruption fix from Jan in [1]] Link: https://lore.kernel.org/r/20240405132346.bid7gibby3lxxhez@quack3 [1] Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index fe634f00f4d9..f864c7d6ef92 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2076,6 +2076,7 @@ static long wb_writeback(struct bdi_writeback *wb, struct inode *inode; long progress; struct blk_plug plug; + bool queued = false; blk_start_plug(&plug); for (;;) { @@ -2118,8 +2119,10 @@ static long wb_writeback(struct bdi_writeback *wb, dirtied_before = jiffies; trace_writeback_start(wb, work); - if (list_empty(&wb->b_io)) + if (list_empty(&wb->b_io)) { queue_io(wb, work, dirtied_before); + queued = true; + } if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else @@ -2134,7 +2137,7 @@ static long wb_writeback(struct bdi_writeback *wb, * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ - if (progress) { + if (progress || !queued) { spin_unlock(&wb->list_lock); continue; } -- cgit v1.2.3-59-g8ed1b From 2ddc93461214507b8e50ba7218d6260be8d623d1 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 28 Feb 2024 17:19:55 +0800 Subject: fs/writeback: remove unused parameter wb of finish_writeback_work Remove unused parameter wb of finish_writeback_work. 
Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240228091958.288260-4-shikemeng@huaweicloud.com Reviewed-by: Tim Chen Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f864c7d6ef92..a754eb254535 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -166,8 +166,7 @@ static void wb_wakeup_delayed(struct bdi_writeback *wb) spin_unlock_irq(&wb->work_lock); } -static void finish_writeback_work(struct bdi_writeback *wb, - struct wb_writeback_work *work) +static void finish_writeback_work(struct wb_writeback_work *work) { struct wb_completion *done = work->done; @@ -196,7 +195,7 @@ static void wb_queue_work(struct bdi_writeback *wb, list_add_tail(&work->list, &wb->work_list); mod_delayed_work(bdi_wq, &wb->dwork, 0); } else - finish_writeback_work(wb, work); + finish_writeback_work(work); spin_unlock_irq(&wb->work_lock); } @@ -2272,7 +2271,7 @@ static long wb_do_writeback(struct bdi_writeback *wb) while ((work = get_next_work_item(wb)) != NULL) { trace_writeback_exec(wb, work); wrote += wb_writeback(wb, work); - finish_writeback_work(wb, work); + finish_writeback_work(work); } /* -- cgit v1.2.3-59-g8ed1b From 639924abc1ae28eb05893a402081e8e6cff23b8a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 28 Feb 2024 17:19:56 +0800 Subject: fs/writeback: only calculate dirtied_before when b_io is empty dirtied_before is only used when b_io is empty (it is passed to queue_io() in that case), so only calculate it when b_io is empty.
Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240228091958.288260-5-shikemeng@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a754eb254535..cbfb3e05e120 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2105,20 +2105,21 @@ static long wb_writeback(struct bdi_writeback *wb, spin_lock(&wb->list_lock); - /* - * Kupdate and background works are special and we want to - * include all inodes that need writing. Livelock avoidance is - * handled by these works yielding to any other work so we are - * safe. - */ - if (work->for_kupdate) { - dirtied_before = jiffies - - msecs_to_jiffies(dirty_expire_interval * 10); - } else if (work->for_background) - dirtied_before = jiffies; - trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) { + /* + * Kupdate and background works are special and we want + * to include all inodes that need writing. Livelock + * avoidance is handled by these works yielding to any + * other work so we are safe. + */ + if (work->for_kupdate) { + dirtied_before = jiffies - + msecs_to_jiffies(dirty_expire_interval * + 10); + } else if (work->for_background) + dirtied_before = jiffies; + queue_io(wb, work, dirtied_before); queued = true; } -- cgit v1.2.3-59-g8ed1b From ba679de9c3fc511f457ea0ad8f5a22e9152fa07b Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 28 Feb 2024 17:19:57 +0800 Subject: fs/writeback: correct comment of __wakeup_flusher_threads_bdi Commit e8e8a0c6c9bfc ("writeback: move nr_pages == 0 logic to one location") removed parameter nr_pages of __wakeup_flusher_threads_bdi and we try to writeback all dirty pages in __wakeup_flusher_threads_bdi now. Just correct stale comment. 
Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240228091958.288260-6-shikemeng@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cbfb3e05e120..e853da9a7fbb 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2332,8 +2332,7 @@ void wb_workfn(struct work_struct *work) } /* - * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero, - * write back the whole world. + * Start writeback of all dirty pages on this bdi. */ static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) -- cgit v1.2.3-59-g8ed1b From 6a1ee87176ffed1d9e749bc66a2ad85be2d7dbb7 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 28 Feb 2024 17:19:58 +0800 Subject: fs/writeback: remove unnecessary return in writeback_inodes_sb writeback_inodes_sb doesn't have return value, just remove unnecessary return in it. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240228091958.288260-7-shikemeng@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e853da9a7fbb..92a5b8283528 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2735,7 +2735,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr); */ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { - return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); + writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(writeback_inodes_sb); -- cgit v1.2.3-59-g8ed1b