From 07c6b5933ebf58b6132aea9f3e72a62486882bfb Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 9 Jul 2021 22:53:57 -0700 Subject: f2fs: add sysfs nodes to get GC info for each GC mode Added gc_reclaimed_segments and gc_segment_mode sysfs nodes. 1) "gc_reclaimed_segments" shows how many segments have been reclaimed by GC during a specific GC mode. 2) "gc_segment_mode" is used to control for which gc mode the "gc_reclaimed_segments" node shows. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 14 ++++++++++++++ fs/f2fs/debug.c | 9 +++++++++ fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/gc.c | 1 + fs/f2fs/sysfs.c | 28 ++++++++++++++++++++++++++++ 5 files changed, 57 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index ef4b9218ae1e..845c4be535b0 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -493,3 +493,17 @@ Contact: "Chao Yu" Description: When ATGC is on, it controls age threshold to bypass GCing young candidates whose age is not beyond the threshold, by default it was initialized as 604800 seconds (equals to 7 days). + +What: /sys/fs/f2fs//gc_reclaimed_segments +Date: July 2021 +Contact: "Daeho Jeong" +Description: Show how many segments have been reclaimed by GC during a specific + GC mode (0: GC normal, 1: GC idle CB, 2: GC idle greedy, + 3: GC idle AT, 4: GC urgent high, 5: GC urgent low) + You can re-initialize this value to "0". + +What: /sys/fs/f2fs//gc_segment_mode +Date: July 2021 +Contact: "Daeho Jeong" +Description: You can control for which gc mode the "gc_reclaimed_segments" node shows. + Refer to the description of the modes in "gc_reclaimed_segments". diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 833325038ef3..53ed1e9191f0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -450,6 +450,15 @@ static int stat_show(struct seq_file *s, void *v) si->data_segs, si->bg_data_segs); seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); + seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " + "Idle Greedy (%d), Idle AT (%d), " + "Urgent High (%d), Urgent Low (%d)\n", + si->sbi->gc_reclaimed_segs[GC_NORMAL], + si->sbi->gc_reclaimed_segs[GC_IDLE_CB], + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], + si->sbi->gc_reclaimed_segs[GC_IDLE_AT], + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], + si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ee8eb33e2c25..49d35e27db9f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1253,6 +1253,7 @@ enum { GC_IDLE_AT, GC_URGENT_HIGH, GC_URGENT_LOW, + MAX_GC_MODE, }; enum { @@ -1733,6 +1734,10 @@ struct f2fs_sb_info { struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ + /* For reclaimed segs statistics per each GC mode */ + unsigned int gc_segment_mode; /* GC state for reclaimed segments */ + unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0e42ee5f7770..d9511827dc83 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1646,6 +1646,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, force_migrate); stat_inc_seg_count(sbi, type, gc_type); + sbi->gc_reclaimed_segs[sbi->gc_mode]++; migrated++; freed: diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6642246206bd..15fe30d3aeb5 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -307,6 +307,14 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); #endif + if (!strcmp(a->attr.name, "gc_segment_mode")) + return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode); + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + return sysfs_emit(buf, "%u\n", + sbi->gc_reclaimed_segs[sbi->gc_segment_mode]); + } + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -515,6 +523,21 @@ out: return count; } + if (!strcmp(a->attr.name, "gc_segment_mode")) { + if (t < MAX_GC_MODE) + sbi->gc_segment_mode = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + if (t != 0) + return -EINVAL; + sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0; + return count; + } + *ui = (unsigned int)t; return count; @@ -740,6 +763,9 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_cou F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight); F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); + #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -812,6 +838,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(atgc_candidate_count), ATTR_LIST(atgc_age_weight), ATTR_LIST(atgc_age_threshold), + ATTR_LIST(gc_segment_mode), + ATTR_LIST(gc_reclaimed_segments), NULL, }; ATTRIBUTE_GROUPS(f2fs); -- cgit v1.2.3-59-g8ed1b From 01f6afd0f3ccaa2d5f7fb87e7bd910dc17eef06b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 10 Jul 2021 08:21:41 +0800 Subject: f2fs: compress: fix to set zstd compress level correctly As 5kft reported in [1]: set_compress_context() should set compress level into .i_compress_flag for zstd as well as lz4hc, otherwise, zstd compressor will still use default zstd compress level during compression, fix it. [1] https://lore.kernel.org/linux-f2fs-devel/8e29f52b-6b0d-45ec-9520-e63eb254287a@www.fastmail.com/T/#u Fixes: 3fde13f817e2 ("f2fs: compress: support compress level") Reported-by: 5kft <5kft@5kft.org> Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 49d35e27db9f..867f2c5d9559 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4142,7 +4142,8 @@ static inline void set_compress_context(struct inode *inode) 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; - if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 && + if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || + F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && F2FS_OPTION(sbi).compress_level) F2FS_I(inode)->i_compress_flag |= F2FS_OPTION(sbi).compress_level << -- cgit v1.2.3-59-g8ed1b From 5417c98c12f6eeb1252130bcea3b943f5e273be7 Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Fri, 9 Jul 2021 16:34:53 +0800 Subject: f2fs: avoid to create an empty string as the extension_list When creating a file, we need to set the temperature based on extension_list. If the empty string is a valid extension_list, the is_extension_exist will always returns true, which affects the separation of hot and cold. Signed-off-by: Wang Xiaojun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 15fe30d3aeb5..b1725620c07d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -351,7 +351,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, set = false; } - if (strlen(name) >= F2FS_EXTENSION_LEN) + if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN) return -EINVAL; down_write(&sbi->sb_lock); -- cgit v1.2.3-59-g8ed1b From 10d0786b39b3b91c4fbf8c2926e97ab456a4eea1 Mon Sep 17 00:00:00 2001 From: Jia Yang Date: Wed, 14 Jul 2021 15:46:06 +0800 Subject: f2fs: Revert "f2fs: Fix indefinite loop in f2fs_gc() v1" This reverts commit 957fa47823dfe449c5a15a944e4e7a299a6601db. The patch "f2fs: Fix indefinite loop in f2fs_gc()" v1 and v4 are all merged. Patch v4 is test info for patch v1. Patch v1 doesn't work and may cause that sbi->cur_victim_sec can't be resetted to NULL_SEGNO, which makes SSR unable to get segment of sbi->cur_victim_sec. So it should be reverted. The mails record: [1] https://lore.kernel.org/linux-f2fs-devel/7288dcd4-b168-7656-d1af-7e2cafa4f720@huawei.com/T/ [2] https://lore.kernel.org/linux-f2fs-devel/20190809153653.GD93481@jaegeuk-macbookpro.roam.corp.google.com/T/ Signed-off-by: Jia Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d9511827dc83..9dce44619069 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1748,7 +1748,7 @@ gc_more: round++; } - if (gc_type == FG_GC && seg_freed) + if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (sync) -- cgit v1.2.3-59-g8ed1b From 1ffc8f5f7751f91fe6af527d426a723231b741a6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 14 Jul 2021 16:14:02 -0700 Subject: f2fs: let's keep writing IOs on SBI_NEED_FSCK SBI_NEED_FSCK is an indicator that fsck.f2fs needs to be triggered, so it is not fully critical to stop any IO writes. So, let's allow to write data instead of reporting EIO forever given SBI_NEED_FSCK, but do keep OPU. Fixes: 955772787667 ("f2fs: drop inplace IO if fs status is abnormal") Cc: # v5.13+ Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ fs/f2fs/segment.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d2cf48c5a2e4..ba120d55e9b1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2498,6 +2498,8 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 15cc89eef28d..f9b7fb785e1d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3563,7 +3563,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) goto drop_bio; } - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) { + if (f2fs_cp_error(sbi)) { err = -EIO; goto drop_bio; } -- cgit v1.2.3-59-g8ed1b From 9de71ede81e6d1a111fdd868b2d78d459fa77f80 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 19 Jul 2021 16:46:47 +0800 Subject: f2fs: quota: fix potential deadlock xfstest generic/587 reports a deadlock issue as below: ====================================================== WARNING: possible circular locking dependency detected 5.14.0-rc1 #69 Not tainted ------------------------------------------------------ repquota/8606 is trying to acquire lock: ffff888022ac9320 (&sb->s_type->i_mutex_key#18){+.+.}-{3:3}, at: f2fs_quota_sync+0x207/0x300 [f2fs] but task is already holding lock: ffff8880084bcde8 (&sbi->quota_sem){.+.+}-{3:3}, at: f2fs_quota_sync+0x59/0x300 [f2fs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&sbi->quota_sem){.+.+}-{3:3}: __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_read+0x3b/0x2a0 f2fs_quota_sync+0x59/0x300 [f2fs] f2fs_quota_on+0x48/0x100 [f2fs] do_quotactl+0x5e3/0xb30 __x64_sys_quotactl+0x23a/0x4e0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #1 (&sbi->cp_rwsem){++++}-{3:3}: __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_read+0x3b/0x2a0 f2fs_unlink+0x353/0x670 [f2fs] vfs_unlink+0x1c7/0x380 do_unlinkat+0x413/0x4b0 __x64_sys_unlinkat+0x50/0xb0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #0 (&sb->s_type->i_mutex_key#18){+.+.}-{3:3}: check_prev_add+0xdc/0xb30 validate_chain+0xa67/0xb20 __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_write+0x39/0xc0 f2fs_quota_sync+0x207/0x300 [f2fs] do_quotactl+0xaff/0xb30 __x64_sys_quotactl+0x23a/0x4e0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae other info that might help us debug this: Chain exists of: &sb->s_type->i_mutex_key#18 --> &sbi->cp_rwsem --> &sbi->quota_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sbi->quota_sem); lock(&sbi->cp_rwsem); lock(&sbi->quota_sem); lock(&sb->s_type->i_mutex_key#18); *** DEADLOCK *** 3 locks held by repquota/8606: #0: ffff88801efac0e0 (&type->s_umount_key#53){++++}-{3:3}, at: user_get_super+0xd9/0x190 #1: ffff8880084bc380 (&sbi->cp_rwsem){++++}-{3:3}, at: f2fs_quota_sync+0x3e/0x300 [f2fs] #2: ffff8880084bcde8 (&sbi->quota_sem){.+.+}-{3:3}, at: f2fs_quota_sync+0x59/0x300 [f2fs] stack backtrace: CPU: 6 PID: 8606 Comm: repquota Not tainted 5.14.0-rc1 #69 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 Call Trace: dump_stack_lvl+0xce/0x134 dump_stack+0x17/0x20 print_circular_bug.isra.0.cold+0x239/0x253 check_noncircular+0x1be/0x1f0 check_prev_add+0xdc/0xb30 validate_chain+0xa67/0xb20 __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_write+0x39/0xc0 f2fs_quota_sync+0x207/0x300 [f2fs] do_quotactl+0xaff/0xb30 __x64_sys_quotactl+0x23a/0x4e0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f883b0b4efe The root cause is ABBA deadlock of inode lock and cp_rwsem, reorder locks in f2fs_quota_sync() as below to fix this issue: - lock inode - lock cp_rwsem - lock quota_sem Fixes: db6ec53b7e03 ("f2fs: add a rw_sem to cover quota flag changes") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 84 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8fecd3050ccd..72eb9d70969f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2518,6 +2518,33 @@ static int f2fs_enable_quotas(struct super_block *sb) return 0; } +static int f2fs_quota_sync_file(struct f2fs_sb_info *sbi, int type) +{ + struct quota_info *dqopt = sb_dqopt(sbi->sb); + struct address_space *mapping = dqopt->files[type]->i_mapping; + int ret = 0; + + ret = dquot_writeback_dquots(sbi->sb, type); + if (ret) + goto out; + + ret = filemap_fdatawrite(mapping); + if (ret) + goto out; + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + goto out; + + ret = filemap_fdatawait(mapping); + + truncate_inode_pages(&dqopt->files[type]->i_data, 0); +out: + if (ret) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return ret; +} + int f2fs_quota_sync(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -2525,57 +2552,42 @@ int f2fs_quota_sync(struct super_block *sb, int type) int cnt; int ret; - /* - * do_quotactl - * f2fs_quota_sync - * down_read(quota_sem) - * dquot_writeback_dquots() - * f2fs_dquot_commit - * block_operation - * down_read(quota_sem) - */ - f2fs_lock_op(sbi); - - down_read(&sbi->quota_sem); - ret = dquot_writeback_dquots(sb, type); - if (ret) - goto out; - /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - struct address_space *mapping; if (type != -1 && cnt != type) continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - mapping = dqopt->files[cnt]->i_mapping; + if (!sb_has_quota_active(sb, type)) + return 0; - ret = filemap_fdatawrite(mapping); - if (ret) - goto out; + inode_lock(dqopt->files[cnt]); - /* if we are using journalled quota */ - if (is_journalled_quota(sbi)) - continue; + /* + * do_quotactl + * f2fs_quota_sync + * down_read(quota_sem) + * dquot_writeback_dquots() + * f2fs_dquot_commit + * block_operation + * down_read(quota_sem) + */ + f2fs_lock_op(sbi); + down_read(&sbi->quota_sem); - ret = filemap_fdatawait(mapping); - if (ret) - set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + ret = f2fs_quota_sync_file(sbi, cnt); + + up_read(&sbi->quota_sem); + f2fs_unlock_op(sbi); - inode_lock(dqopt->files[cnt]); - truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); + + if (ret) + break; } -out: - if (ret) - set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); - f2fs_unlock_op(sbi); return ret; } -- cgit v1.2.3-59-g8ed1b From 3e679dc78c17825a55e6f2cdb9078e375c945b15 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Jul 2021 09:39:11 -0500 Subject: f2fs: make f2fs_write_failed() take struct inode Make f2fs_write_failed() take a 'struct inode' directly rather than a 'struct address_space', as this simplifies it slightly. Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ba120d55e9b1..1803c68fa269 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3178,9 +3178,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, FS_CP_DATA_IO : FS_DATA_IO); } -static void f2fs_write_failed(struct address_space *mapping, loff_t to) +static void f2fs_write_failed(struct inode *inode, loff_t to) { - struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); if (IS_NOQUOTA(inode)) @@ -3412,7 +3411,7 @@ repeat: fail: f2fs_put_page(page, 1); - f2fs_write_failed(mapping, pos + len); + f2fs_write_failed(inode, pos + len); if (drop_atomic) f2fs_drop_inmem_pages_all(sbi, false); return err; @@ -3602,7 +3601,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, count - iov_iter_count(iter)); } else if (err < 0) { - f2fs_write_failed(mapping, offset + count); + f2fs_write_failed(inode, offset + count); } } else { if (err > 0) -- cgit v1.2.3-59-g8ed1b From 6de8687ccdefed40d617492f4e1b3962eb577b6b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Jul 2021 09:39:12 -0500 Subject: f2fs: remove allow_outplace_dio() We can just check f2fs_lfs_mode() directly. The block_unaligned_IO() check is redundant because in LFS mode, f2fs doesn't do direct I/O writes that aren't block-aligned (due to f2fs_force_buffered_io() returning true in this case, triggering the fallback to buffered I/O). Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 10 ---------- fs/f2fs/file.c | 2 +- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1803c68fa269..28ad1f533c2a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3553,7 +3553,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (f2fs_force_buffered_io(inode, iocb, iter)) return 0; - do_opu = allow_outplace_dio(inode, iocb, iter); + do_opu = rw == WRITE && f2fs_lfs_mode(sbi); trace_f2fs_direct_IO_enter(inode, offset, count, rw); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 867f2c5d9559..8459b6d5a2f8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4311,16 +4311,6 @@ static inline int block_unaligned_IO(struct inode *inode, return align & blocksize_mask; } -static inline int allow_outplace_dio(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int rw = iov_iter_rw(iter); - - return (f2fs_lfs_mode(sbi) && (rw == WRITE) && - !block_unaligned_IO(inode, iocb, iter)); -} - static inline bool f2fs_force_buffered_io(struct inode *inode, struct kiocb *iocb, struct iov_iter *iter) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6afd4562335f..b1cb5b50faac 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4292,7 +4292,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * back to buffered IO. */ if (!f2fs_force_buffered_io(inode, iocb, from) && - allow_outplace_dio(inode, iocb, from)) + f2fs_lfs_mode(F2FS_I_SB(inode))) goto write; } preallocated = true; -- cgit v1.2.3-59-g8ed1b From 2eeb0dce728a7eac3e4dfe355d98af40d61f7a26 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 22 Jul 2021 10:30:58 -0700 Subject: f2fs: don't sleep while grabing nat_tree_lock This tries to fix priority inversion in the below condition resulting in long checkpoint delay. f2fs_get_node_info() - nat_tree_lock -> sleep to grab journal_rwsem by contention checkpoint - waiting for nat_tree_lock In order to let checkpoint go, let's release nat_tree_lock, if there's a journal_rwsem contention. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0be9e2d7120e..c945a9730f3c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -552,7 +552,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, int i; ni->nid = nid; - +retry: /* Check nat cache */ down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); @@ -564,10 +564,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, return 0; } - memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + /* + * Check current segment summary by trying to grab journal_rwsem first. + * This sem is on the critical path on the checkpoint requiring the above + * nat_tree_lock. Therefore, we should retry, if we failed to grab here + * while not bothering checkpoint. + */ + if (!rwsem_is_locked(&sbi->cp_global_sem)) { + down_read(&curseg->journal_rwsem); + } else if (!down_read_trylock(&curseg->journal_rwsem)) { + up_read(&nm_i->nat_tree_lock); + goto retry; + } - /* Check current segment summary */ - down_read(&curseg->journal_rwsem); i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { ne = nat_in_journal(journal, i); -- cgit v1.2.3-59-g8ed1b From 7eab7a6968278c735b1ca6387056a408f7960265 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Tue, 22 Jun 2021 19:50:59 +0800 Subject: f2fs: compress: remove unneeded read when rewrite whole cluster when we overwrite the whole page in cluster, we don't need read original data before write, because after write_end(), writepages() can help to load left data in that cluster. Signed-off-by: Fengnan Chang Signed-off-by: Chao Yu Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 28ad1f533c2a..d7f9d66577f5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3329,6 +3329,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, *fsdata = NULL; + if (len == PAGE_SIZE) + goto repeat; + ret = f2fs_prepare_compress_overwrite(inode, pagep, index, fsdata); if (ret < 0) { -- cgit v1.2.3-59-g8ed1b From b7ec2061737f12c33e45beeb967d17f31abc1ada Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 26 Jul 2021 09:12:15 -0700 Subject: f2fs: do not submit NEW_ADDR to read node block After the below patch, give cp is errored, we drop dirty node pages. This can give NEW_ADDR to read node pages. Don't do WARN_ON() which gives generic/475 failure. Fixes: 28607bf3aa6f ("f2fs: drop dirty node pages when cp is in error status") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c945a9730f3c..5840b82ce311 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1330,7 +1330,8 @@ static int read_node_page(struct page *page, int op_flags) if (err) return err; - if (unlikely(ni.blk_addr == NULL_ADDR) || + /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) || is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { ClearPageUptodate(page); return -ENOENT; -- cgit v1.2.3-59-g8ed1b From 093f0bac32b617960899c7e00f4550373c383dd0 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Sun, 25 Jul 2021 21:18:19 -0700 Subject: f2fs: change fiemap way in printing compression chunk When we print out a discontinuous compression chunk, it shows like a continuous chunk now. To show it more correctly, I've changed the way of printing fiemap info like below. Plus, eliminated NEW_ADDR(-1) in fiemap info, since it is not in fiemap user api manual. Let's assume 16KB compression cluster. Logical Physical Length Flags 0: 0000000000000000 00000002c091f000 0000000000004000 1008 1: 0000000000004000 00000002c0920000 0000000000004000 1008 ... 9: 0000000000034000 0000000f8c623000 0000000000004000 1008 10: 0000000000038000 000000101a6eb000 0000000000004000 1008 0: 0000000000000000 00000002c091f000 0000000000004000 1008 1: 0000000000004000 00000002c0920000 0000000000004000 1008 ... 9: 0000000000034000 0000000f8c623000 0000000000001000 1008 10: 0000000000035000 000000101a6ea000 0000000000003000 1008 11: 0000000000038000 000000101a6eb000 0000000000002000 1008 12: 000000000003a000 00000002c3544000 0000000000002000 1008 Flags 0x1000 => FIEMAP_EXTENT_MERGED 0x0008 => FIEMAP_EXTENT_ENCODED Signed-off-by: Daeho Jeong Tested-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 75 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d7f9d66577f5..cb71d7317ad2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1843,8 +1843,9 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; - bool compr_cluster = false; + bool compr_cluster = false, compr_appended; unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + unsigned int count_in_cluster = 0; loff_t maxbytes; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { @@ -1892,15 +1893,17 @@ next: map.m_next_pgofs = &next_pgofs; map.m_seg_type = NO_CHECK_TYPE; - if (compr_cluster) - map.m_len = cluster_size - 1; + if (compr_cluster) { + map.m_lblk += 1; + map.m_len = cluster_size - count_in_cluster; + } ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* HOLE */ - if (!(map.m_flags & F2FS_MAP_FLAGS)) { + if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, @@ -1910,6 +1913,14 @@ next: flags |= FIEMAP_EXTENT_LAST; } + compr_appended = false; + /* In a case of compressed cluster, append this to the last extent */ + if (compr_cluster && ((map.m_flags & F2FS_MAP_UNWRITTEN) || + !(map.m_flags & F2FS_MAP_FLAGS))) { + compr_appended = true; + goto skip_fill; + } + if (size) { flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) @@ -1926,38 +1937,36 @@ next: if (start_blk > last_blk) goto out; - if (compr_cluster) { - compr_cluster = false; - - - logical = blks_to_bytes(inode, start_blk - 1); - phys = blks_to_bytes(inode, map.m_pblk); - size = blks_to_bytes(inode, cluster_size); - - flags |= FIEMAP_EXTENT_ENCODED; - - start_blk += cluster_size - 1; - - if (start_blk > last_blk) - goto out; - - goto prep_next; - } - +skip_fill: if (map.m_pblk == COMPRESS_ADDR) { compr_cluster = true; - start_blk++; - goto prep_next; - } - - logical = blks_to_bytes(inode, start_blk); - phys = blks_to_bytes(inode, map.m_pblk); - size = blks_to_bytes(inode, map.m_len); - flags = 0; - if (map.m_flags & F2FS_MAP_UNWRITTEN) - flags = FIEMAP_EXTENT_UNWRITTEN; + count_in_cluster = 1; + } else if (compr_appended) { + unsigned int appended_blks = cluster_size - + count_in_cluster + 1; + size += blks_to_bytes(inode, appended_blks); + start_blk += appended_blks; + compr_cluster = false; + } else { + logical = blks_to_bytes(inode, start_blk); + phys = __is_valid_data_blkaddr(map.m_pblk) ? + blks_to_bytes(inode, map.m_pblk) : 0; + size = blks_to_bytes(inode, map.m_len); + flags = 0; + + if (compr_cluster) { + flags = FIEMAP_EXTENT_ENCODED; + count_in_cluster += map.m_len; + if (count_in_cluster == cluster_size) { + compr_cluster = false; + size += blks_to_bytes(inode, 1); + } + } else if (map.m_flags & F2FS_MAP_UNWRITTEN) { + flags = FIEMAP_EXTENT_UNWRITTEN; + } - start_blk += bytes_to_blks(inode, size); + start_blk += bytes_to_blks(inode, size); + } prep_next: cond_resched(); -- cgit v1.2.3-59-g8ed1b From 4931e0c93e124357308893a3e5e224cbeeabc721 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 28 Jul 2021 12:38:11 -0700 Subject: f2fs: turn back remapped address in compressed page endio Turned back the remmaped sector address to the address in the partition, when ending io, for compress cache to work properly. Fixes: 6ce19aff0b8c ("f2fs: compress: add compress_inode to cache compressed blocks") Signed-off-by: Daeho Jeong Signed-off-by: Youngjin Gil Signed-off-by: Hyeong Jun Kim Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cb71d7317ad2..948083c88d17 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -116,6 +116,7 @@ struct bio_post_read_ctx { struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; + block_t fs_blkaddr; }; static void f2fs_finish_read_bio(struct bio *bio) @@ -228,7 +229,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) struct bio_vec *bv; struct bvec_iter_all iter_all; bool all_compressed = true; - block_t blkaddr = SECTOR_TO_BLOCK(ctx->bio->bi_iter.bi_sector); + block_t blkaddr = ctx->fs_blkaddr; bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; @@ -1003,6 +1004,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->bio = bio; ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; + ctx->fs_blkaddr = blkaddr; bio->bi_private = ctx; } -- cgit v1.2.3-59-g8ed1b From 2e650912c037a501ea6fc367c7075ead63a114f7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 30 Jul 2021 08:10:48 -0700 Subject: f2fs: show sbi status in debugfs/f2fs/status We need to get sbi->s_flag to understand the current f2fs status as well. One example is SBI_NEED_FSCK. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 53ed1e9191f0..473ad04d1891 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -333,11 +333,12 @@ static int stat_show(struct seq_file *s, void *v) list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s (sbi: 0x%lx)]=====\n", si->sbi->sb->s_bdev, i++, f2fs_readonly(si->sbi->sb) ? "RO": "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? - "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good")); + "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good"), + si->sbi->s_flag); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", -- cgit v1.2.3-59-g8ed1b From 277afbde6ca2b38729683fc17c031b4bc942068d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 29 Jul 2021 09:22:17 +0800 Subject: f2fs: fix wrong checkpoint_changed value in f2fs_remount() In f2fs_remount(), return value of test_opt() is an unsigned int type variable, however when we compare it to a bool type variable, it cause wrong result, fix it. Fixes: 4354994f097d ("f2fs: checkpoint disabling") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 72eb9d70969f..3617aa5f0477 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2062,11 +2062,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); - bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); + bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); - bool checkpoint_changed; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2111,8 +2110,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = parse_options(sb, data, true); if (err) goto restore_opts; - checkpoint_changed = - disable_checkpoint != test_opt(sbi, DISABLE_CHECKPOINT); /* * Previous and new state of filesystem is RO, @@ -2234,7 +2231,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_flush = true; } - if (checkpoint_changed) { + if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) -- cgit v1.2.3-59-g8ed1b From 2787991516468bfafafb9bf2b45a848e6b202e7c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 20 Jul 2021 09:03:29 +0800 Subject: f2fs: fix to force keeping write barrier for strict fsync mode [1] https://www.mail-archive.com/linux-f2fs-devel@lists.sourceforge.net/msg15126.html As [1] reported, if lower device doesn't support write barrier, in below case: - write page #0; persist - overwrite page #0 - fsync - write data page #0 OPU into device's cache - write inode page into device's cache - issue flush If SPO is triggered during flush command, inode page can be persisted before data page #0, so that after recovery, inode page can be recovered with new physical block address of data page #0, however there may contains dummy data in new physical block address. Then what user will see is: after overwrite & fsync + SPO, old data in file was corrupted, if any user do care about such case, we can suggest user to use STRICT fsync mode, in this mode, we will force to use atomic write sematics to keep write order in between data/node and last node, so that it avoids potential data corruption during fsync(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b1cb5b50faac..e931782abdab 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -301,6 +301,18 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, f2fs_exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; + } else { + /* + * for OPU case, during fsync(), node can be persisted before + * data when lower device doesn't support write barrier, result + * in data corruption after SPO. + * So for strict fsync mode, force to use atomic write sematics + * to keep write order in between data/node and last node to + * avoid potential data corruption. + */ + if (F2FS_OPTION(sbi).fsync_mode == + FSYNC_MODE_STRICT && !atomic) + atomic = true; } go_write: /* -- cgit v1.2.3-59-g8ed1b From dc675a97129c4d9d5af55a3d7f23d7e092b8e032 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Sat, 31 Jul 2021 11:26:46 +0800 Subject: f2fs: fix min_seq_blocks can not make sense in some scenes. F2FS have dirty page count control for batched sequential write in writepages, and get the value of min_seq_blocks by blocks_per_seg * segs_per_sec(segs_per_sec defaults to 1). But in some scenes we set a lager section size, Min_seq_blocks will become too large to achieve the expected effect(eg. 4thread sequential write, the number of merge requests will be reduced). Signed-off-by: Laibin Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f9b7fb785e1d..0f976cefe425 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -5159,7 +5159,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - sm_info->min_seq_blocks = sbi->blocks_per_seg * sbi->segs_per_sec; + sm_info->min_seq_blocks = sbi->blocks_per_seg; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); -- cgit v1.2.3-59-g8ed1b From 4f993264fe2965c344f223d854ccbb549b16ed71 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 3 Aug 2021 08:15:43 +0800 Subject: f2fs: introduce discard_unit mount option As James Z reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=213877 [1.] One-line summary of the problem: Mount multiple SMR block devices exceed certain number cause system non-response [2.] Full description of the problem/report: Created some F2FS on SMR devices (mkfs.f2fs -m), then mounted in sequence. Each device is the same Model: HGST HSH721414AL (Size 14TB). Empirically, found that when the amount of SMR device * 1.5Gb > System RAM, the system ran out of memory and hung. No dmesg output. For example, 24 SMR Disk need 24*1.5GB = 36GB. A system with 32G RAM can only mount 21 devices, the 22nd device will be a reproducible cause of system hang. The number of SMR devices with other FS mounted on this system does not interfere with the result above. [3.] Keywords (i.e., modules, networking, kernel): F2FS, SMR, Memory [4.] Kernel information [4.1.] Kernel version (uname -a): Linux 5.13.4-200.fc34.x86_64 #1 SMP Tue Jul 20 20:27:29 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux [4.2.] Kernel .config file: Default Fedora 34 with f2fs-tools-1.14.0-2.fc34.x86_64 [5.] Most recent kernel version which did not have the bug: None [6.] Output of Oops.. message (if applicable) with symbolic information resolved (see Documentation/admin-guide/oops-tracing.rst) None [7.] A small shell script or example program which triggers the problem (if possible) mount /dev/sdX /mnt/0X [8.] Memory consumption With 24 * 14T SMR Block device with F2FS free -g total used free shared buff/cache available Mem: 46 36 0 0 10 10 Swap: 0 0 0 With 3 * 14T SMR Block device with F2FS free -g total used free shared buff/cache available Mem: 7 5 0 0 1 1 Swap: 7 0 7 The root cause is, there are three bitmaps: - cur_valid_map - ckpt_valid_map - discard_map and each of them will cost ~500MB memory, {cur, ckpt}_valid_map are necessary, but discard_map is optional, since this bitmap will only be useful in mountpoint that small discard is enabled. For a blkzoned device such as SMR or ZNS devices, f2fs will only issue discard for a section(zone) when all blocks of that section are invalid, so, for such device, we don't need small discard functionality at all. This patch introduces a new mountoption "discard_unit=block|segment| section" to support issuing discard with different basic unit which is aligned to block, segment or section, so that user can specify "discard_unit=segment" or "discard_unit=section" to disable small discard functionality. Note that this mount option can not be changed by remount() due to related metadata need to be initialized during mount(). In order to save memory, let's use "discard_unit=section" for blkzoned device by default. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 8 ++++ fs/f2fs/f2fs.h | 16 ++++++++ fs/f2fs/segment.c | 82 ++++++++++++++++++++++++-------------- fs/f2fs/super.c | 54 +++++++++++++++++++++++-- fs/f2fs/sysfs.c | 2 + 5 files changed, 130 insertions(+), 32 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index ff9e7cc97c65..8f251a662542 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -312,6 +312,14 @@ inlinecrypt When possible, encrypt/decrypt the contents of encrypted Documentation/block/inline-encryption.rst. atgc Enable age-threshold garbage collection, it provides high effectiveness and efficiency on background GC. +discard_unit=%s Control discard unit, the argument can be "block", "segment" + and "section", issued discard command's offset/size will be + aligned to the unit, by default, "discard_unit=block" is set, + so that small discard functionality is enabled. + For blkzoned device, "discard_unit=section" will be set by + default, it is helpful for large sized SMR or ZNS devices to + reduce memory cost by getting rid of fs metadata supports small + discard. ======================== ============================================================ Debugfs Entries diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8459b6d5a2f8..8d4665a7a0fb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -139,6 +139,11 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ + int discard_unit; /* + * discard command's offset/size should + * be aligned to this unit: block, + * segment or section + */ struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */ block_t unusable_cap_perc; /* percentage for cap */ block_t unusable_cap; /* Amount of space allowed to be @@ -1299,6 +1304,12 @@ enum { */ }; +enum { + DISCARD_UNIT_BLOCK, /* basic discard unit is block */ + DISCARD_UNIT_SEGMENT, /* basic discard unit is segment */ + DISCARD_UNIT_SECTION, /* basic discard unit is section */ +}; + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -4365,6 +4376,11 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) return false; } +static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0f976cefe425..80f26158e304 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1893,7 +1893,8 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } @@ -1918,7 +1919,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; - if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi)) + if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) || + !f2fs_block_unit_discard(sbi)) return false; if (!force) { @@ -2003,14 +2005,18 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); - bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); + bool section_alignment = F2FS_OPTION(sbi).discard_unit == + DISCARD_UNIT_SECTION; + + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) + section_alignment = true; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; - if (need_align && end != -1) + if (section_alignment && end != -1) end--; start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); if (start >= MAIN_SEGS(sbi)) @@ -2018,7 +2024,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), start + 1); - if (need_align) { + if (section_alignment) { start = rounddown(start, sbi->segs_per_sec); end = roundup(end, sbi->segs_per_sec); } @@ -2056,6 +2062,9 @@ next: } mutex_unlock(&dirty_i->seglist_lock); + if (!f2fs_block_unit_discard(sbi)) + goto wakeup; + /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { unsigned int cur_pos = 0, next_pos, len, total_len = 0; @@ -2089,6 +2098,7 @@ skip: dcc->nr_discards -= total_len; } +wakeup: wake_up_discard_thread(sbi, false); } @@ -2108,6 +2118,11 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + dcc->discard_granularity = sbi->blocks_per_seg; + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + dcc->discard_granularity = BLKS_PER_SEC(sbi); + INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); @@ -2255,7 +2270,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) del = 0; } - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* @@ -2297,7 +2313,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) } } - if (f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -4282,6 +4299,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) unsigned int sit_segs, start; char *src_bitmap, *bitmap; unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size; + unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0; /* allocate memory for SIT information */ sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); @@ -4304,9 +4322,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS - bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 4; + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map); #else - bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 3; + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map); #endif sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!sit_i->bitmap) @@ -4326,8 +4344,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap += SIT_VBLOCK_MAP_SIZE; #endif - sit_i->sentries[start].discard_map = bitmap; - bitmap += SIT_VBLOCK_MAP_SIZE; + if (discard_map) { + sit_i->sentries[start].discard_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; + } } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -4489,17 +4509,19 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (IS_NODESEG(se->type)) total_node_blocks += se->valid_blocks; - /* build discard map only one time */ - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, - se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += - sbi->blocks_per_seg - - se->valid_blocks; + if (f2fs_block_unit_discard(sbi)) { + /* build discard map only one time */ + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (__is_large_section(sbi)) @@ -4535,13 +4557,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (IS_NODESEG(se->type)) total_node_blocks += se->valid_blocks; - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks; - sbi->discard_blks -= se->valid_blocks; + if (f2fs_block_unit_discard(sbi)) { + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; + } } if (__is_large_section(sbi)) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3617aa5f0477..a4fed184b811 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -155,6 +155,7 @@ enum { Opt_atgc, Opt_gc_merge, Opt_nogc_merge, + Opt_discard_unit, Opt_err, }; @@ -231,6 +232,7 @@ static match_table_t f2fs_tokens = { {Opt_atgc, "atgc"}, {Opt_gc_merge, "gc_merge"}, {Opt_nogc_merge, "nogc_merge"}, + {Opt_discard_unit, "discard_unit=%s"}, {Opt_err, NULL}, }; @@ -1173,6 +1175,25 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_nogc_merge: clear_opt(sbi, GC_MERGE); break; + case Opt_discard_unit: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "block")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_BLOCK; + } else if (!strcmp(name, "segment")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SEGMENT; + } else if (!strcmp(name, "section")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SECTION; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1211,6 +1232,14 @@ default_check: return -EINVAL; } #endif + if (f2fs_sb_has_blkzoned(sbi)) { + if (F2FS_OPTION(sbi).discard_unit != + DISCARD_UNIT_SECTION) { + f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SECTION; + } + } #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_test_compress_extension(sbi)) { @@ -1925,6 +1954,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, ATGC)) seq_puts(seq, ",atgc"); + + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK) + seq_printf(seq, ",discard_unit=%s", "block"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + seq_printf(seq, ",discard_unit=%s", "segment"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + seq_printf(seq, ",discard_unit=%s", "section"); + return 0; } @@ -1961,10 +1998,13 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); set_opt(sbi, DISCARD); - if (f2fs_sb_has_blkzoned(sbi)) + if (f2fs_sb_has_blkzoned(sbi)) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; - else + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; + } else { F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; + } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -2066,6 +2106,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); + bool block_unit_discard = f2fs_block_unit_discard(sbi); #ifdef CONFIG_QUOTA int i, j; #endif @@ -2166,6 +2207,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (block_unit_discard != f2fs_block_unit_discard(sbi)) { + err = -EINVAL; + f2fs_warn(sbi, "switch discard_unit option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); @@ -3778,7 +3825,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; - sm_i->dcc_info->discard_granularity = 1; + if (f2fs_block_unit_discard(sbi)) + sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b1725620c07d..f98afd9f9380 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -428,6 +428,8 @@ out: if (!strcmp(a->attr.name, "discard_granularity")) { if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; if (t == *ui) return count; *ui = t; -- cgit v1.2.3-59-g8ed1b From 0f6b56ec958d49e2b3dc955cdac6b62702c04b72 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 2 Aug 2021 21:22:45 -0700 Subject: f2fs: add sysfs node to control ra_pages for fadvise seq file fadvise() allows the user to expand the readahead window to double with POSIX_FADV_SEQUENTIAL, now. But, in some use cases, it is not that sufficient and we need to meet the need in a restricted way. We can control the multiplier value of bdi device readahead between 2 (default) and 256 for POSIX_FADV_SEQUENTIAL advise option. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/file.c | 30 ++++++++++++++++++++++++++++++ fs/f2fs/super.c | 1 + fs/f2fs/sysfs.c | 10 ++++++++++ 5 files changed, 52 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 845c4be535b0..73211f77d11e 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -507,3 +507,9 @@ Date: July 2021 Contact: "Daeho Jeong" Description: You can control for which gc mode the "gc_reclaimed_segments" node shows. Refer to the description of the modes in "gc_reclaimed_segments". + +What: /sys/fs/f2fs//seq_file_ra_mul +Date: July 2021 +Contact: "Daeho Jeong" +Description: You can control the multiplier value of bdi device readahead window size + between 2 (default) and 256 for POSIX_FADV_SEQUENTIAL advise option. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8d4665a7a0fb..1b4c482d08e2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1749,6 +1749,8 @@ struct f2fs_sb_info { unsigned int gc_segment_mode; /* GC state for reclaimed segments */ unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ + unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */ + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ @@ -4003,6 +4005,9 @@ void f2fs_destroy_extent_cache(void); /* * sysfs.c */ +#define MIN_RA_MUL 2 +#define MAX_RA_MUL 256 + int __init f2fs_init_sysfs(void); void f2fs_exit_sysfs(void); int f2fs_register_sysfs(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e931782abdab..7d8ee60f6c1f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -4344,6 +4345,34 @@ out: return ret; } +static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, + int advice) +{ + struct inode *inode; + struct address_space *mapping; + struct backing_dev_info *bdi; + + if (advice == POSIX_FADV_SEQUENTIAL) { + inode = file_inode(filp); + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; + + mapping = filp->f_mapping; + if (!mapping || len < 0) + return -EINVAL; + + bdi = inode_to_bdi(mapping->host); + filp->f_ra.ra_pages = bdi->ra_pages * + F2FS_I_SB(inode)->seq_file_ra_mul; + spin_lock(&filp->f_lock); + filp->f_mode &= ~FMODE_RANDOM; + spin_unlock(&filp->f_lock); + return 0; + } + + return generic_fadvise(filp, offset, len, advice); +} + #ifdef CONFIG_COMPAT struct compat_f2fs_gc_range { u32 sync; @@ -4472,4 +4501,5 @@ const struct file_operations f2fs_file_operations = { #endif .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .fadvise = f2fs_file_fadvise, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a4fed184b811..84cd085020cd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3466,6 +3466,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->next_victim_seg[FG_GC] = NULL_SEGNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; sbi->migration_granularity = sbi->segs_per_sec; + sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f98afd9f9380..0954761341d7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -540,6 +540,14 @@ out: return count; } + if (!strcmp(a->attr.name, "seq_file_ra_mul")) { + if (t >= MIN_RA_MUL && t <= MAX_RA_MUL) + sbi->seq_file_ra_mul = t; + else + return -EINVAL; + return count; + } + *ui = (unsigned int)t; return count; @@ -765,6 +773,7 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_cou F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight); F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); @@ -840,6 +849,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(atgc_candidate_count), ATTR_LIST(atgc_age_weight), ATTR_LIST(atgc_age_threshold), + ATTR_LIST(seq_file_ra_mul), ATTR_LIST(gc_segment_mode), ATTR_LIST(gc_reclaimed_segments), NULL, -- cgit v1.2.3-59-g8ed1b From 91803392c732c43b5cf440e885ea89be7f5fecef Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Aug 2021 08:38:38 +0800 Subject: f2fs: fix to stop filesystem update once CP failed During f2fs_write_checkpoint(), once we failed in f2fs_flush_nat_entries() or do_checkpoint(), metadata of filesystem such as prefree bitmap, nat/sit version bitmap won't be recovered, it may cause f2fs image to be inconsistent, let's just set CP error flag to avoid further updates until we figure out a scheme to rollback all metadatas in such condition. Reported-by: Yangtao Li Signed-off-by: Yangtao Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 12 +++++++++--- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 15 +++++++++++++-- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6c208108d69c..7f6745f4630e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1639,8 +1639,11 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write cached NAT/SIT entries to NAT/SIT area */ err = f2fs_flush_nat_entries(sbi, cpc); - if (err) + if (err) { + f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); goto stop; + } f2fs_flush_sit_entries(sbi, cpc); @@ -1648,10 +1651,13 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_save_inmem_curseg(sbi); err = do_checkpoint(sbi, cpc); - if (err) + if (err) { + f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); f2fs_release_discard_addrs(sbi); - else + } else { f2fs_clear_prefree_segments(sbi, cpc); + } f2fs_restore_inmem_curseg(sbi); stop: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1b4c482d08e2..d24fd5045712 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -547,7 +547,7 @@ enum { */ }; -#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ +#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ /* congestion wait timeout value, default: 20ms */ #define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 80f26158e304..ca9876a6d396 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -776,11 +776,22 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) return 0; for (i = 1; i < sbi->s_ndevs; i++) { + int count = DEFAULT_RETRY_IO_COUNT; + if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) continue; - ret = __submit_flush_wait(sbi, FDEV(i).bdev); - if (ret) + + do { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); + } while (ret && --count); + + if (ret) { + f2fs_stop_checkpoint(sbi, false); break; + } spin_lock(&sbi->dev_lock); f2fs_clear_bit(i, (char *)&sbi->dirty_device); -- cgit v1.2.3-59-g8ed1b From d4bf15a7ce172d186d400d606adf4f34a59130d6 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Wed, 4 Aug 2021 11:29:46 +0800 Subject: f2fs: reduce the scope of setting fsck tag when de->name_len is zero I recently found a case where de->name_len is 0 in f2fs_fill_dentries() easily reproduced, and finally set the fsck flag. Thread A Thread B - f2fs_readdir - f2fs_read_inline_dir - ctx->pos = d.max - f2fs_add_dentry - f2fs_add_inline_entry - do_convert_inline_dir - f2fs_add_regular_entry - f2fs_readdir - f2fs_fill_dentries - set_sbi_flag(sbi, SBI_NEED_FSCK) Process A opens the folder, and has been reading without closing it. During this period, Process B created a file under the folder (occupying multiple f2fs_dir_entry, exceeding the d.max of the inline dir). After creation, process A uses the d.max of inline dir to read it again, and it will read that de->name_len is 0. And Chao pointed out that w/o inline conversion, the race condition still can happen as below: dir_entry1: A dir_entry2: B dir_entry3: C free slot: _ ctx->pos: ^ Thread A is traversing directory, ctx-pos moves to below position after readdir() by thread A: AAAABBBB___ ^ Then thread B delete dir_entry2, and create dir_entry3. Thread A calls readdir() to lookup dirents starting from middle of new dirent slots as below: AAAACCCCCC_ ^ In these scenarios, the file system is not damaged, and it's hard to avoid it. But we can bypass tagging FSCK flag if: a) bit_pos (:= ctx->pos % d->max) is non-zero and b) before bit_pos moves to first valid dir_entry. Fixes: ddf06b753a85 ("f2fs: fix to trigger fsck if dirent.name_len is zero") Signed-off-by: Yangtao Li [Chao: clean up description] Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 456651682daf..c250bf46ef5e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1000,6 +1000,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; bool readdir_ra = sbi->readdir_ra == 1; + bool found_valid_dirent = false; int err = 0; bit_pos = ((unsigned long)ctx->pos % d->max); @@ -1014,13 +1015,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, de = &d->dentry[bit_pos]; if (de->name_len == 0) { + if (found_valid_dirent || !bit_pos) { + printk_ratelimited( + "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, sbi->sb->s_id, + le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } bit_pos++; ctx->pos = start_pos + bit_pos; - printk_ratelimited( - "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", - KERN_WARNING, sbi->sb->s_id, - le32_to_cpu(de->ino)); - set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } @@ -1063,6 +1066,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); ctx->pos = start_pos + bit_pos; + found_valid_dirent = true; } out: if (readdir_ra) -- cgit v1.2.3-59-g8ed1b From 6b3ba1e77d8913ee6ffb3972e889bc35550ed95c Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 5 Aug 2021 21:30:24 +0800 Subject: f2fs: Kconfig: clean up config options about compression In fs/f2fs/Kconfig, F2FS_FS_LZ4HC depends on F2FS_FS_LZ4 and F2FS_FS_LZ4 depends on F2FS_FS_COMPRESSION, so no need to make F2FS_FS_LZ4HC depends on F2FS_FS_COMPRESSION explicitly, remove the redudant "depends on", do the similar thing for F2FS_FS_LZORLE. At the same time, it is better to move F2FS_FS_LZORLE next to F2FS_FS_LZO, it looks like a little more clear when make menuconfig, the location of "LZO-RLE compression support" is under "LZO compression support" instead of "F2FS compression feature". Without this patch: F2FS compression feature LZO compression support LZ4 compression support LZ4HC compression support ZSTD compression support LZO-RLE compression support With this patch: F2FS compression feature LZO compression support LZO-RLE compression support LZ4 compression support LZ4HC compression support ZSTD compression support Signed-off-by: Tiezhu Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 7669de7b49ce..2ac026fc3564 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -105,6 +105,13 @@ config F2FS_FS_LZO help Support LZO compress algorithm, if unsure, say Y. +config F2FS_FS_LZORLE + bool "LZO-RLE compression support" + depends on F2FS_FS_LZO + default y + help + Support LZO-RLE compress algorithm, if unsure, say Y. + config F2FS_FS_LZ4 bool "LZ4 compression support" depends on F2FS_FS_COMPRESSION @@ -114,7 +121,6 @@ config F2FS_FS_LZ4 config F2FS_FS_LZ4HC bool "LZ4HC compression support" - depends on F2FS_FS_COMPRESSION depends on F2FS_FS_LZ4 default y help @@ -127,11 +133,3 @@ config F2FS_FS_ZSTD default y help Support ZSTD compress algorithm, if unsure, say Y. - -config F2FS_FS_LZORLE - bool "LZO-RLE compression support" - depends on F2FS_FS_COMPRESSION - depends on F2FS_FS_LZO - default y - help - Support LZO-RLE compress algorithm, if unsure, say Y. -- cgit v1.2.3-59-g8ed1b From 94afd6d6e5253179c9b891d02081cc8355a11768 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Aug 2021 10:23:48 +0800 Subject: f2fs: extent cache: support unaligned extent Compressed inode may suffer read performance issue due to it can not use extent cache, so I propose to add this unaligned extent support to improve it. Currently, it only works in readonly format f2fs image. Unaligned extent: in one compressed cluster, physical block number will be less than logical block number, so we add an extra physical block length in extent info in order to indicate such extent status. The idea is if one whole cluster blocks are contiguous physically, once its mapping info was readed at first time, we will cache an unaligned (or aligned) extent info entry in extent cache, it expects that the mapping info will be hitted when rereading cluster. Merge policy: - Aligned extents can be merged. - Aligned extent and unaligned extent can not be merged. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 24 ++++++++++++++++++++++++ fs/f2fs/data.c | 38 +++++++++++++++++++++++++++----------- fs/f2fs/extent_cache.c | 41 +++++++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 20 ++++++++++++++++++++ fs/f2fs/node.c | 20 ++++++++++++++++++++ 5 files changed, 132 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 455561826c7d..7dbfd6965b97 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1666,6 +1666,30 @@ void f2fs_put_page_dic(struct page *page) f2fs_put_dic(dic); } +/* + * check whether cluster blocks are contiguous, and add extent cache entry + * only if cluster blocks are logically and physically contiguous. + */ +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) +{ + bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR; + int i = compressed ? 1 : 0; + block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + break; + if (first_blkaddr + i - (compressed ? 1 : 0) != blkaddr) + return 0; + } + + return compressed ? i - 1 : i; +} + const struct address_space_operations f2fs_compress_aops = { .releasepage = f2fs_release_page, .invalidatepage = f2fs_invalidate_page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 948083c88d17..df5e8d8c654e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1135,7 +1135,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -1152,7 +1152,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; int err; page = f2fs_grab_cache_page(mapping, index, for_write); @@ -1450,7 +1450,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; block_t blkaddr; unsigned int start_pgofs; @@ -2126,6 +2126,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; + struct extent_info ei = {0, }; + bool from_dnode = true; int i; int ret = 0; @@ -2156,6 +2158,12 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; + if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + from_dnode = false; + + if (!from_dnode) + goto skip_reading_dnode; + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) @@ -2163,11 +2171,13 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); +skip_reading_dnode: for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; - blkaddr = data_blkaddr(dn.inode, dn.node_page, - dn.ofs_in_node + i); + blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i) : + ei.blk + i - 1; if (!__is_valid_data_blkaddr(blkaddr)) break; @@ -2177,6 +2187,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } cc->nr_cpages++; + + if (!from_dnode && i >= ei.c_len) + break; } /* nothing to decompress */ @@ -2196,8 +2209,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, block_t blkaddr; struct bio_post_read_ctx *ctx; - blkaddr = data_blkaddr(dn.inode, dn.node_page, - dn.ofs_in_node + i + 1); + blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1) : + ei.blk + i; f2fs_wait_on_block_writeback(inode, blkaddr); @@ -2242,13 +2256,15 @@ submit_and_realloc: *last_block_in_bio = blkaddr; } - f2fs_put_dnode(&dn); + if (from_dnode) + f2fs_put_dnode(&dn); *bio_ret = bio; return 0; out_put_dnode: - f2fs_put_dnode(&dn); + if (from_dnode) + f2fs_put_dnode(&dn); out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { @@ -2543,7 +2559,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; struct node_info ni; bool ipu_force = false; int err = 0; @@ -3218,7 +3234,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; int err = 0; int flag; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 3ebf976a682d..b120589d8517 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -661,6 +661,47 @@ static void f2fs_update_extent_tree_range(struct inode *inode, f2fs_mark_inode_dirty_sync(inode, true); } +#ifdef CONFIG_F2FS_FS_COMPRESSION +void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; + struct extent_info ei; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + bool leftmost = false; + + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen); + + /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return; + + write_lock(&et->lock); + + en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false, + &leftmost); + if (en) + goto unlock_out; + + set_extent_info(&ei, fofs, blkaddr, llen); + ei.c_len = c_len; + + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +unlock_out: + write_unlock(&et->lock); +} +#endif + unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { struct extent_tree *et, *next; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d24fd5045712..e97b4d8c5efc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -580,6 +580,9 @@ struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ u32 blk; /* start block address of the extent */ +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned int c_len; /* physical extent length of compressed blocks */ +#endif }; struct extent_node { @@ -799,6 +802,9 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->fofs = fofs; ei->blk = blk; ei->len = len; +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif } static inline bool __is_discard_mergeable(struct discard_info *back, @@ -823,6 +829,12 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur, static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif return (back->fofs + back->len == front->fofs && back->blk + back->len == front->blk); } @@ -4068,12 +4080,16 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); +void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); void f2fs_put_page_dic(struct page *page); +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); @@ -4128,6 +4144,7 @@ static inline void f2fs_put_page_dic(struct page *page) { WARN_ON_ONCE(1); } +static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; } static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } @@ -4143,6 +4160,9 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) +static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len) { } #endif static inline void set_compress_context(struct inode *inode) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5840b82ce311..9d838a7929fb 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -841,6 +841,26 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; dn->data_blkaddr = f2fs_data_blkaddr(dn); + + if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) && + f2fs_sb_has_readonly(sbi)) { + unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn); + block_t blkaddr; + + if (!c_len) + goto out; + + blkaddr = f2fs_data_blkaddr(dn); + if (blkaddr == COMPRESS_ADDR) + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + 1); + + f2fs_update_extent_tree_range_compressed(dn->inode, + index, blkaddr, + F2FS_I(dn->inode)->i_cluster_size, + c_len); + } +out: return 0; release_pages: -- cgit v1.2.3-59-g8ed1b From 4b10651864429386ae931167d3518f37a8c762e0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Aug 2021 08:05:58 +0800 Subject: f2fs: avoid unneeded memory allocation in __add_ino_entry() __add_ino_entry() will allocate slab cache even if we have already cached ino entry in radix tree, e.g. for case of multiple devices. Let's check radix tree first under protection of rcu lock to see whether we need to do slab allocation, it will mitigate memory pressure from "f2fs_ino_entry" slab cache. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7f6745f4630e..5b6ddeae1107 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -465,16 +465,28 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; - struct ino_entry *e, *tmp; + struct ino_entry *e = NULL, *new = NULL; + + if (type == FLUSH_INO) { + rcu_read_lock(); + e = radix_tree_lookup(&im->ino_root, ino); + rcu_read_unlock(); + } - tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); +retry: + if (!e) + new = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (!e) { - e = tmp; + if (!new) { + spin_unlock(&im->ino_lock); + goto retry; + } + e = new; if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) f2fs_bug_on(sbi, 1); @@ -492,8 +504,8 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, spin_unlock(&im->ino_lock); radix_tree_preload_end(); - if (e != tmp) - kmem_cache_free(ino_entry_slab, tmp); + if (new && e != new) + kmem_cache_free(ino_entry_slab, new); } static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) -- cgit v1.2.3-59-g8ed1b From 65ddf6564843890a58ee3b18bb46ce67d96333fb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Aug 2021 08:04:37 +0800 Subject: f2fs: fix to do sanity check for sb/cp fields correctly This patch fixes below problems of sb/cp sanity check: - in sanity_check_raw_superi(), it missed to consider log header blocks while cp_payload check. - in f2fs_sanity_check_ckpt(), it missed to check nat_bits_blocks. Cc: Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 84cd085020cd..9e0e3c998142 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3264,11 +3264,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } - if (le32_to_cpu(raw_super->cp_payload) > - (blocks_per_seg - F2FS_CP_PACKS)) { - f2fs_info(sbi, "Insane cp_payload (%u > %u)", + if (le32_to_cpu(raw_super->cp_payload) >= + (blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE)) { + f2fs_info(sbi, "Insane cp_payload (%u >= %u)", le32_to_cpu(raw_super->cp_payload), - blocks_per_seg - F2FS_CP_PACKS); + blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE); return -EFSCORRUPTED; } @@ -3304,6 +3306,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int cp_pack_start_sum, cp_payload; block_t user_block_count, valid_user_blocks; block_t avail_node_count, valid_node_count; + unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks; int i, j; total = le32_to_cpu(raw_super->segment_count); @@ -3434,6 +3437,17 @@ skip_cross: return 1; } + nat_blocks = nat_segs << log_blocks_per_seg; + nat_bits_bytes = nat_blocks / BITS_PER_BYTE; + nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); + if (__is_set_ckpt_flags(ckpt, CP_NAT_BITS_FLAG) && + (cp_payload + F2FS_CP_PACKS + + NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) { + f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)", + cp_payload, nat_bits_blocks); + return -EFSCORRUPTED; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_err(sbi, "A bug case: need to run fsck"); return 1; -- cgit v1.2.3-59-g8ed1b From a2649315bcb88a136ce978a06af8aa23ea8b4154 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Thu, 12 Aug 2021 19:36:41 +0800 Subject: f2fs: compress: avoid duplicate counting of valid blocks when read compressed file Since cluster is basic unit of compression, one cluster is compressed or not, so we can calculate valid blocks only for first page in cluster, the other pages just skip. Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index df5e8d8c654e..cec084806725 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2299,6 +2299,7 @@ static int f2fs_mpage_readpages(struct inode *inode, .nr_rpages = 0, .nr_cpages = 0, }; + pgoff_t nc_cluster_idx = NULL_CLUSTER; #endif unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; @@ -2331,12 +2332,23 @@ static int f2fs_mpage_readpages(struct inode *inode, if (ret) goto set_error_page; } - ret = f2fs_is_compressed_cluster(inode, page->index); - if (ret < 0) - goto set_error_page; - else if (!ret) - goto read_single_page; + if (cc.cluster_idx == NULL_CLUSTER) { + if (nc_cluster_idx == + page->index >> cc.log_cluster_size) { + goto read_single_page; + } + + ret = f2fs_is_compressed_cluster(inode, page->index); + if (ret < 0) + goto set_error_page; + else if (!ret) { + nc_cluster_idx = + page->index >> cc.log_cluster_size; + goto read_single_page; + } + nc_cluster_idx = NULL_CLUSTER; + } ret = f2fs_init_compress_ctx(&cc); if (ret) goto set_error_page; -- cgit v1.2.3-59-g8ed1b From b6d9246d0315c7379d06513cb8713534b7a4734f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Wed, 11 Aug 2021 12:45:57 +0800 Subject: f2fs: improve sbi status info in debugfs/f2fs/status Do not use numbers but strings to improve readability when flag is set. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 473ad04d1891..d8c09346545d 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -323,22 +323,43 @@ get_cache: #endif } +static char *s_flag[] = { + [SBI_IS_DIRTY] = " fs_dirty", + [SBI_IS_CLOSE] = " closing", + [SBI_NEED_FSCK] = " need_fsck", + [SBI_POR_DOING] = " recovering", + [SBI_NEED_SB_WRITE] = " sb_dirty", + [SBI_NEED_CP] = " need_cp", + [SBI_IS_SHUTDOWN] = " shutdown", + [SBI_IS_RECOVERED] = " recovered", + [SBI_CP_DISABLED] = " cp_disabled", + [SBI_CP_DISABLED_QUICK] = " cp_disabled_quick", + [SBI_QUOTA_NEED_FLUSH] = " quota_need_flush", + [SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush", + [SBI_QUOTA_NEED_REPAIR] = " quota_need_repair", + [SBI_IS_RESIZEFS] = " resizefs", +}; + static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; - int i = 0; - int j; + int i = 0, j = 0; mutex_lock(&f2fs_stat_mutex); list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s (sbi: 0x%lx)]=====\n", + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, f2fs_readonly(si->sbi->sb) ? "RO": "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? - "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good"), - si->sbi->s_flag); + "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good")); + if (si->sbi->s_flag) { + seq_puts(s, "[SBI:"); + for_each_set_bit(j, &si->sbi->s_flag, 32) + seq_puts(s, s_flag[j]); + seq_puts(s, "]\n"); + } seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", -- cgit v1.2.3-59-g8ed1b From 1927ccdb79906e04c760ed7429c30a5c8054d1a7 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 10 Aug 2021 21:27:06 +0800 Subject: f2fs: correct comment in segment.h s/two/three Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 050230c70a53..89fff258727d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -142,7 +142,7 @@ enum { }; /* - * In the victim_sel_policy->alloc_mode, there are two block allocation modes. + * In the victim_sel_policy->alloc_mode, there are three block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into @@ -155,7 +155,7 @@ enum { }; /* - * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. + * In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes. * GC_CB is based on cost-benefit algorithm. * GC_GREEDY is based on greedy algorithm. * GC_AT is based on age-threshold algorithm. -- cgit v1.2.3-59-g8ed1b From 4a4fc043f594d39edc976a3a3dce7c40ebb86f3f Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Mon, 9 Aug 2021 10:21:04 +0800 Subject: f2fs: compress: allow write compress released file after truncate to zero For compressed file, after release compress blocks, don't allow write direct, but we should allow write direct after truncate to zero. Reviewed-by: Chao Yu Signed-off-by: Fengnan Chang Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 7 +++++-- fs/f2fs/file.c | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 8f251a662542..9b0517d90063 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -865,8 +865,11 @@ Compression implementation directly in order to guarantee potential data updates later to the space. Instead, the main goal is to reduce data writes to flash disk as much as possible, resulting in extending disk life time as well as relaxing IO - congestion. Alternatively, we've added ioctl interface to reclaim compressed - space and show it to user after putting the immutable bit. + congestion. Alternatively, we've added ioctl(F2FS_IOC_RELEASE_COMPRESS_BLOCKS) + interface to reclaim compressed space and show it to user after putting the + immutable bit. Immutable bit, after release, it doesn't allow writing/mmaping + on the file, until reserving compressed space via + ioctl(F2FS_IOC_RESERVE_COMPRESS_BLOCKS) or truncating filesize to zero. Compress metadata layout:: diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7d8ee60f6c1f..d4fc5e0d2ffe 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -753,6 +753,14 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) return err; #ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * For compressed file, after release compress blocks, don't allow write + * direct, but we should allow write direct after truncate to zero. + */ + if (f2fs_compressed_file(inode) && !free_from + && is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + clear_inode_flag(inode, FI_COMPRESS_RELEASED); + if (from != free_from) { err = f2fs_truncate_partial_cluster(inode, from, lock); if (err) -- cgit v1.2.3-59-g8ed1b From 324105775c1982aacbd2972b7024d8cc06474abe Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 Aug 2021 08:24:48 +0800 Subject: f2fs: support fault injection for f2fs_kmem_cache_alloc() This patch supports to inject fault into f2fs_kmem_cache_alloc(). Usage: a) echo 32768 > /sys/fs/f2fs//inject_type or b) mount -o fault_type=32768 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/checkpoint.c | 3 ++- fs/f2fs/compress.c | 8 +++++--- fs/f2fs/data.c | 2 +- fs/f2fs/dir.c | 4 ++-- fs/f2fs/extent_cache.c | 5 +++-- fs/f2fs/f2fs.h | 17 ++++++++++++++++- fs/f2fs/gc.c | 6 ++++-- fs/f2fs/node.c | 23 ++++++++++++----------- fs/f2fs/recovery.c | 3 ++- fs/f2fs/segment.c | 10 ++++++---- fs/f2fs/super.c | 4 +++- fs/f2fs/xattr.c | 3 ++- 13 files changed, 59 insertions(+), 30 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 9b0517d90063..21d40e3cfd7a 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -195,6 +195,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_CHECKPOINT 0x000001000 FAULT_DISCARD 0x000002000 FAULT_WRITE_IO 0x000004000 + FAULT_SLAB_ALLOC 0x000008000 =================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5b6ddeae1107..41960c55c343 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -475,7 +475,8 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, retry: if (!e) - new = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); + new = f2fs_kmem_cache_alloc(ino_entry_slab, + GFP_NOFS, true, NULL); radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 7dbfd6965b97..afb79480c6a3 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -28,7 +28,8 @@ static void *page_array_alloc(struct inode *inode, int nr) unsigned int size = sizeof(struct page *) * nr; if (likely(size <= sbi->page_array_slab_size)) - return kmem_cache_zalloc(sbi->page_array_slab, GFP_NOFS); + return f2fs_kmem_cache_alloc(sbi->page_array_slab, + GFP_F2FS_ZERO, false, F2FS_I_SB(inode)); return f2fs_kzalloc(sbi, size, GFP_NOFS); } @@ -1228,7 +1229,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, fio.version = ni.version; - cic = kmem_cache_zalloc(cic_entry_slab, GFP_NOFS); + cic = f2fs_kmem_cache_alloc(cic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!cic) goto out_put_dnode; @@ -1506,7 +1507,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) pgoff_t start_idx = start_idx_of_cluster(cc); int i; - dic = kmem_cache_zalloc(dic_entry_slab, GFP_NOFS); + dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, + false, F2FS_I_SB(cc->inode)); if (!dic) return ERR_PTR(-ENOMEM); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cec084806725..12cd63603137 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -724,7 +724,7 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct bio_entry *be; - be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS, true, NULL); be->bio = bio; bio_get(bio); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c250bf46ef5e..1820e9c106f7 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -83,8 +83,8 @@ int f2fs_init_casefolded_name(const struct inode *dir, struct super_block *sb = dir->i_sb; if (IS_CASEFOLDED(dir)) { - fname->cf_name.name = kmem_cache_alloc(f2fs_cf_name_slab, - GFP_NOFS); + fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, + GFP_NOFS, false, F2FS_SB(sb)); if (!fname->cf_name.name) return -ENOMEM; fname->cf_name.len = utf8_casefold(sb->s_encoding, diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index b120589d8517..866e72b29bd5 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -239,7 +239,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, { struct extent_node *en; - en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); + en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); if (!en) return NULL; @@ -292,7 +292,8 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { - et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); + et = f2fs_kmem_cache_alloc(extent_tree_slab, + GFP_NOFS, true, NULL); f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e97b4d8c5efc..13a7cfe9b23f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -53,6 +53,7 @@ enum { FAULT_CHECKPOINT, FAULT_DISCARD, FAULT_WRITE_IO, + FAULT_SLAB_ALLOC, FAULT_MAX, }; @@ -2618,7 +2619,7 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); } -static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, +static inline void *f2fs_kmem_cache_alloc_nofail(struct kmem_cache *cachep, gfp_t flags) { void *entry; @@ -2629,6 +2630,20 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } +static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags, bool nofail, struct f2fs_sb_info *sbi) +{ + if (nofail) + return f2fs_kmem_cache_alloc_nofail(cachep, flags); + + if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) { + f2fs_show_injection_info(sbi, FAULT_SLAB_ALLOC); + return NULL; + } + + return kmem_cache_alloc(cachep, flags); +} + static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type) { if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9dce44619069..3bc0f0162e31 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -371,7 +371,8 @@ static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi, struct atgc_management *am = &sbi->am; struct victim_entry *ve; - ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS); + ve = f2fs_kmem_cache_alloc(victim_entry_slab, + GFP_NOFS, true, NULL); ve->mtime = mtime; ve->segno = segno; @@ -849,7 +850,8 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) iput(inode); return; } - new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS); + new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, + GFP_NOFS, true, NULL); new_ie->inode = inode; f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9d838a7929fb..161173de5a2d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -162,14 +162,13 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) +static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi, + nid_t nid, bool no_fail) { struct nat_entry *new; - if (no_fail) - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); - else - new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_F2FS_ZERO, no_fail, sbi); if (new) { nat_set_nid(new, nid); nat_reset_flag(new); @@ -242,7 +241,8 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { - head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, + GFP_NOFS, true, NULL); INIT_LIST_HEAD(&head->entry_list); INIT_LIST_HEAD(&head->set_list); @@ -329,7 +329,8 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, unsigned long flags; unsigned int seq_id; - fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS); + fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, + GFP_NOFS, true, NULL); get_page(page); fn->page = page; @@ -428,7 +429,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *new, *e; - new = __alloc_nat_entry(nid, false); + new = __alloc_nat_entry(sbi, nid, false); if (!new) return; @@ -451,7 +452,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - struct nat_entry *new = __alloc_nat_entry(ni->nid, true); + struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); @@ -2252,7 +2253,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, if (unlikely(f2fs_check_nid_range(sbi, nid))) return false; - i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL); i->nid = nid; i->state = FREE_NID; @@ -2842,7 +2843,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = __alloc_nat_entry(nid, true); + ne = __alloc_nat_entry(sbi, nid, true); __init_nat_entry(nm_i, ne, &raw_ne, true); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 695eacfe776c..04655511d7f5 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -91,7 +91,8 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, goto err_out; } - entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, + GFP_F2FS_ZERO, true, NULL); entry->inode = inode; list_add_tail(&entry->list, head); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ca9876a6d396..b4dd22134a73 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -188,7 +188,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) set_page_private_atomic(page); - new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); + new = f2fs_kmem_cache_alloc(inmem_entry_slab, + GFP_NOFS, true, NULL); /* add atomic page indices to the list */ new->page = page; @@ -1001,7 +1002,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, pend_list = &dcc->pend_list[plist_idx(len)]; - dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; dc->lstart = lstart; @@ -1962,7 +1963,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!de) { de = f2fs_kmem_cache_alloc(discard_entry_slab, - GFP_F2FS_ZERO); + GFP_F2FS_ZERO, true, NULL); de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); list_add_tail(&de->list, head); } @@ -4099,7 +4100,8 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = - f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS); + f2fs_kmem_cache_alloc(sit_entry_set_slab, + GFP_NOFS, true, NULL); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9e0e3c998142..b556ca38f0fb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -56,6 +56,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_CHECKPOINT] = "checkpoint error", [FAULT_DISCARD] = "discard error", [FAULT_WRITE_IO] = "write IO error", + [FAULT_SLAB_ALLOC] = "slab alloc", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -1300,7 +1301,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO); + fi = f2fs_kmem_cache_alloc(f2fs_inode_cachep, + GFP_F2FS_ZERO, false, F2FS_SB(sb)); if (!fi) return NULL; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c8f34decbf8e..1d2d29dcd41c 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -27,7 +27,8 @@ static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline) { if (likely(size == sbi->inline_xattr_slab_size)) { *is_inline = true; - return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS); + return f2fs_kmem_cache_alloc(sbi->inline_xattr_slab, + GFP_F2FS_ZERO, false, sbi); } *is_inline = false; return f2fs_kzalloc(sbi, size, GFP_NOFS); -- cgit v1.2.3-59-g8ed1b From b96d9b3b09f0427b289332c6f6bfbf747a19b654 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Aug 2021 10:45:20 +0800 Subject: f2fs: fix to keep compatibility of fault injection interface The value of FAULT_* macros and its description in f2fs.rst became inconsistent, fix this to keep compatibility of fault injection interface. Fixes: 67883ade7a98 ("f2fs: remove FAULT_ALLOC_BIO") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/f2fs.h | 1 + 2 files changed, 2 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 21d40e3cfd7a..09de6ebbbdfa 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -185,6 +185,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_KVMALLOC 0x000000002 FAULT_PAGE_ALLOC 0x000000004 FAULT_PAGE_GET 0x000000008 + FAULT_ALLOC_BIO 0x000000010 (obsolete) FAULT_ALLOC_NID 0x000000020 FAULT_ORPHAN 0x000000040 FAULT_BLOCK 0x000000080 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 13a7cfe9b23f..4b6ea498a1e0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -43,6 +43,7 @@ enum { FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, + FAULT_ALLOC_BIO, /* it's obsolete due to bio_alloc() will never fail */ FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, -- cgit v1.2.3-59-g8ed1b From 491f7f71e18411463f348beac1b6d99eb8f2c6c1 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 14 Aug 2021 18:37:01 +0800 Subject: f2fs: convert S_IRUGO to 0444 To fix: WARNING: Symbolic permissions 'S_IRUGO' are not preferred. Consider using octal permissions '0444'. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- fs/f2fs/sysfs.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index d8c09346545d..8c50518475a9 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -642,7 +642,7 @@ void __init f2fs_create_root_stats(void) #ifdef CONFIG_DEBUG_FS f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); - debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, + debugfs_create_file("status", 0444, f2fs_debugfs_root, NULL, &stat_fops); #endif } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 0954761341d7..36d7e40bf12e 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1253,13 +1253,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); if (sbi->s_proc) { - proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc, + proc_create_single_data("segment_info", 0444, sbi->s_proc, segment_info_seq_show, sb); - proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc, + proc_create_single_data("segment_bits", 0444, sbi->s_proc, segment_bits_seq_show, sb); - proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc, + proc_create_single_data("iostat_info", 0444, sbi->s_proc, iostat_info_seq_show, sb); - proc_create_single_data("victim_bits", S_IRUGO, sbi->s_proc, + proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); } return 0; -- cgit v1.2.3-59-g8ed1b From b35d71b969096422f1c57d7c1394fe8b1cad9710 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sun, 15 Aug 2021 01:58:40 +0800 Subject: f2fs: fix description about main_blkaddr node Don't leave a blank line, to keep the style consistent with other node descriptions. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 73211f77d11e..f627e705e663 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -41,8 +41,7 @@ Description: This parameter controls the number of prefree segments to be What: /sys/fs/f2fs//main_blkaddr Date: November 2019 Contact: "Ramon Pantin" -Description: - Shows first block address of MAIN area. +Description: Shows first block address of MAIN area. What: /sys/fs/f2fs//ipu_policy Date: November 2013 -- cgit v1.2.3-59-g8ed1b From bbe1da7e34ac5a830163bfdfa09cbe3dadfda3ce Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Aug 2021 08:02:50 +0800 Subject: f2fs: compress: do sanity check on cluster This patch adds f2fs_sanity_check_cluster() to support doing sanity check on cluster of compressed file, it will be triggered from below two paths: - __f2fs_cluster_blocks() - f2fs_map_blocks(F2FS_GET_BLOCK_FIEMAP) And it can detect below three kind of cluster insanity status. C: COMPRESS_ADDR N: NULL_ADDR or NEW_ADDR V: valid blkaddr *: any value 1. [*|C|*|*] 2. [C|*|C|*] 3. [C|N|N|V] Signed-off-by: Chao Yu [Nathan Chancellor: fix missing inline warning] Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/data.c | 7 +++++++ fs/f2fs/f2fs.h | 2 ++ 3 files changed, 62 insertions(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index afb79480c6a3..ec70a0a32327 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -899,6 +899,54 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc) return false; } +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + bool compressed = dn->data_blkaddr == COMPRESS_ADDR; + int cluster_end = 0; + int i; + char *reason = ""; + + if (!compressed) + return false; + + /* [..., COMPR_ADDR, ...] */ + if (dn->ofs_in_node % cluster_size) { + reason = "[*|C|*|*]"; + goto out; + } + + for (i = 1; i < cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + /* [COMPR_ADDR, ..., COMPR_ADDR] */ + if (blkaddr == COMPRESS_ADDR) { + reason = "[C|*|C|*]"; + goto out; + } + if (compressed) { + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; + } + } + } + return false; +out: + f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", + dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return true; +} + static int __f2fs_cluster_blocks(struct inode *inode, unsigned int cluster_idx, bool compr) { @@ -916,6 +964,11 @@ static int __f2fs_cluster_blocks(struct inode *inode, goto fail; } + if (f2fs_sanity_check_cluster(&dn)) { + ret = -EFSCORRUPTED; + goto fail; + } + if (dn.data_blkaddr == COMPRESS_ADDR) { int i; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 12cd63603137..e4e4eb800d2b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1552,6 +1552,13 @@ next_block: map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { + if (f2fs_compressed_file(inode) && + f2fs_sanity_check_cluster(&dn) && + (flag != F2FS_GET_BLOCK_FIEMAP || + IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { + err = -EFSCORRUPTED; + goto sync_out; + } if (flag == F2FS_GET_BLOCK_BMAP) { map->m_pblk = 0; goto sync_out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4b6ea498a1e0..fe5f280f6ac0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4090,6 +4090,7 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, block_t blkaddr); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); int f2fs_write_multi_pages(struct compress_ctx *cc, int *submitted, @@ -4161,6 +4162,7 @@ static inline void f2fs_put_page_dic(struct page *page) WARN_ON_ONCE(1); } static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; } +static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; } static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } -- cgit v1.2.3-59-g8ed1b From 521187439abfb3e1c946796dc2187c443e5457ab Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 19 Aug 2021 20:52:28 -0700 Subject: f2fs: separate out iostat feature Added F2FS_IOSTAT config option to support getting IO statistics through sysfs and printing out periodic IO statistics tracepoint events and moved I/O statistics related codes into separate files for better maintenance. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu [Jaegeuk Kim: set default=y] Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 9 +++ fs/f2fs/Makefile | 1 + fs/f2fs/checkpoint.c | 1 + fs/f2fs/data.c | 1 + fs/f2fs/f2fs.h | 59 +++-------------- fs/f2fs/file.c | 1 + fs/f2fs/gc.c | 1 + fs/f2fs/iostat.c | 154 ++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/iostat.h | 27 ++++++++ fs/f2fs/node.c | 1 + fs/f2fs/segment.c | 1 + fs/f2fs/super.c | 10 +-- fs/f2fs/sysfs.c | 106 ++++-------------------------- include/trace/events/f2fs.h | 2 + 14 files changed, 225 insertions(+), 149 deletions(-) create mode 100644 fs/f2fs/iostat.c create mode 100644 fs/f2fs/iostat.h diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 2ac026fc3564..7eea3cfd894d 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -133,3 +133,12 @@ config F2FS_FS_ZSTD default y help Support ZSTD compress algorithm, if unsure, say Y. + +config F2FS_IOSTAT + bool "F2FS IO statistics information" + depends on F2FS_FS + default y + help + Support getting IO statistics through sysfs and printing out periodic + IO statistics tracepoint events. You have to turn on "iostat_enable" + sysfs node to enable this feature. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index e5295746208b..8a7322d229e4 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -9,3 +9,4 @@ f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_FS_VERITY) += verity.o f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o +f2fs-$(CONFIG_F2FS_IOSTAT) += iostat.o diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 41960c55c343..3962cfeb4a57 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -18,6 +18,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "iostat.h" #include #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e4e4eb800d2b..fd16c4fc4507 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -25,6 +25,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "iostat.h" #include #define NUM_PREALLOC_POST_READ_CTXS 128 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fe5f280f6ac0..12ecf6ee9cb5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1713,14 +1713,6 @@ struct f2fs_sb_info { #endif spinlock_t stat_lock; /* lock for stat operations */ - /* For app/fs IO statistics */ - spinlock_t iostat_lock; - unsigned long long rw_iostat[NR_IO_TYPE]; - unsigned long long prev_rw_iostat[NR_IO_TYPE]; - bool iostat_enable; - unsigned long iostat_next_period; - unsigned int iostat_period_ms; - /* to attach REQ_META|REQ_FUA flags */ unsigned int data_io_flag; unsigned int node_io_flag; @@ -1780,6 +1772,16 @@ struct f2fs_sb_info { unsigned int compress_watermark; /* cache page watermark */ atomic_t compress_page_hit; /* cache hit count */ #endif + +#ifdef CONFIG_F2FS_IOSTAT + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long rw_iostat[NR_IO_TYPE]; + unsigned long long prev_rw_iostat[NR_IO_TYPE]; + bool iostat_enable; + unsigned long iostat_next_period; + unsigned int iostat_period_ms; +#endif }; struct f2fs_private_dio { @@ -3257,47 +3259,6 @@ static inline int get_inline_xattr_addrs(struct inode *inode) sizeof((f2fs_inode)->field)) \ <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ -#define DEFAULT_IOSTAT_PERIOD_MS 3000 -#define MIN_IOSTAT_PERIOD_MS 100 -/* maximum period of iostat tracing is 1 day */ -#define MAX_IOSTAT_PERIOD_MS 8640000 - -static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) -{ - int i; - - spin_lock(&sbi->iostat_lock); - for (i = 0; i < NR_IO_TYPE; i++) { - sbi->rw_iostat[i] = 0; - sbi->prev_rw_iostat[i] = 0; - } - spin_unlock(&sbi->iostat_lock); -} - -extern void f2fs_record_iostat(struct f2fs_sb_info *sbi); - -static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, - enum iostat_type type, unsigned long long io_bytes) -{ - if (!sbi->iostat_enable) - return; - spin_lock(&sbi->iostat_lock); - sbi->rw_iostat[type] += io_bytes; - - if (type == APP_WRITE_IO || type == APP_DIRECT_IO) - sbi->rw_iostat[APP_BUFFERED_IO] = - sbi->rw_iostat[APP_WRITE_IO] - - sbi->rw_iostat[APP_DIRECT_IO]; - - if (type == APP_READ_IO || type == APP_DIRECT_READ_IO) - sbi->rw_iostat[APP_BUFFERED_READ_IO] = - sbi->rw_iostat[APP_READ_IO] - - sbi->rw_iostat[APP_DIRECT_READ_IO]; - spin_unlock(&sbi->iostat_lock); - - f2fs_record_iostat(sbi); -} - #define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) #define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d4fc5e0d2ffe..ab4ea2ddcc8b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -31,6 +31,7 @@ #include "xattr.h" #include "acl.h" #include "gc.h" +#include "iostat.h" #include #include diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3bc0f0162e31..2c18443972b6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -19,6 +19,7 @@ #include "node.h" #include "segment.h" #include "gc.h" +#include "iostat.h" #include static struct kmem_cache *victim_entry_slab; diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c new file mode 100644 index 000000000000..21c29e121a86 --- /dev/null +++ b/fs/f2fs/iostat.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs iostat support + * + * Copyright 2021 Google LLC + * Author: Daeho Jeong + */ + +#include +#include +#include + +#include "f2fs.h" +#include "iostat.h" +#include + +int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app write IOs */ + seq_puts(seq, "[WRITE]\n"); + seq_printf(seq, "app buffered: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->rw_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_IO]); + + /* print fs write IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->rw_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->rw_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->rw_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->rw_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->rw_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->rw_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->rw_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->rw_iostat[FS_CP_META_IO]); + + /* print app read IOs */ + seq_puts(seq, "[READ]\n"); + seq_printf(seq, "app buffered: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_READ_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->rw_iostat[APP_DIRECT_READ_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_READ_IO]); + + /* print fs read IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->rw_iostat[FS_DATA_READ_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->rw_iostat[FS_GDATA_READ_IO]); + seq_printf(seq, "fs compr_data: %-16llu\n", + sbi->rw_iostat[FS_CDATA_READ_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->rw_iostat[FS_NODE_READ_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->rw_iostat[FS_META_READ_IO]); + + /* print other IOs */ + seq_puts(seq, "[OTHER]\n"); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->rw_iostat[FS_DISCARD]); + + return 0; +} + +static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) +{ + unsigned long long iostat_diff[NR_IO_TYPE]; + int i; + + if (time_is_after_jiffies(sbi->iostat_next_period)) + return; + + /* Need double check under the lock */ + spin_lock(&sbi->iostat_lock); + if (time_is_after_jiffies(sbi->iostat_next_period)) { + spin_unlock(&sbi->iostat_lock); + return; + } + sbi->iostat_next_period = jiffies + + msecs_to_jiffies(sbi->iostat_period_ms); + + for (i = 0; i < NR_IO_TYPE; i++) { + iostat_diff[i] = sbi->rw_iostat[i] - + sbi->prev_rw_iostat[i]; + sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; + } + spin_unlock(&sbi->iostat_lock); + + trace_f2fs_iostat(sbi, iostat_diff); +} + +void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) { + sbi->rw_iostat[i] = 0; + sbi->prev_rw_iostat[i] = 0; + } + spin_unlock(&sbi->iostat_lock); +} + +void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + + spin_lock(&sbi->iostat_lock); + sbi->rw_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->rw_iostat[APP_BUFFERED_IO] = + sbi->rw_iostat[APP_WRITE_IO] - + sbi->rw_iostat[APP_DIRECT_IO]; + + if (type == APP_READ_IO || type == APP_DIRECT_READ_IO) + sbi->rw_iostat[APP_BUFFERED_READ_IO] = + sbi->rw_iostat[APP_READ_IO] - + sbi->rw_iostat[APP_DIRECT_READ_IO]; + spin_unlock(&sbi->iostat_lock); + + f2fs_record_iostat(sbi); +} + +int f2fs_init_iostat(struct f2fs_sb_info *sbi) +{ + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; + + return 0; +} diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h new file mode 100644 index 000000000000..46e4a36fc8e9 --- /dev/null +++ b/fs/f2fs/iostat.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2021 Google LLC + * Author: Daeho Jeong + */ +#ifndef __F2FS_IOSTAT_H__ +#define __F2FS_IOSTAT_H__ + +#ifdef CONFIG_F2FS_IOSTAT + +#define DEFAULT_IOSTAT_PERIOD_MS 3000 +#define MIN_IOSTAT_PERIOD_MS 100 +/* maximum period of iostat tracing is 1 day */ +#define MAX_IOSTAT_PERIOD_MS 8640000 + +extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, + void *offset); +extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes); +extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); +#else +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) {} +static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; } +#endif +#endif /* __F2FS_IOSTAT_H__ */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 161173de5a2d..043cb831b289 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -17,6 +17,7 @@ #include "node.h" #include "segment.h" #include "xattr.h" +#include "iostat.h" #include #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b4dd22134a73..73abec9988e9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -20,6 +20,7 @@ #include "segment.h" #include "node.h" #include "gc.h" +#include "iostat.h" #include #define __reverse_ffz(x) __reverse_ffs(~(x)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b556ca38f0fb..a23926d1a77b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -33,6 +33,7 @@ #include "segment.h" #include "xattr.h" #include "gc.h" +#include "iostat.h" #define CREATE_TRACE_POINTS #include @@ -3964,11 +3965,6 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - /* init iostat info */ - spin_lock_init(&sbi->iostat_lock); - sbi->iostat_enable = false; - sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; - for (i = 0; i < NR_PAGE_TYPE; i++) { int n = (i == META) ? 1 : NR_TEMP_TYPE; int j; @@ -3999,6 +3995,10 @@ try_onemore: init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); + err = f2fs_init_iostat(sbi); + if (err) + goto free_bio_info; + err = init_percpu_info(sbi); if (err) goto free_bio_info; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 36d7e40bf12e..a1a3e0f6d658 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -17,6 +17,7 @@ #include "f2fs.h" #include "segment.h" #include "gc.h" +#include "iostat.h" #include static struct proc_dir_entry *f2fs_proc_root; @@ -477,6 +478,7 @@ out: return count; } +#ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; if (!sbi->iostat_enable) @@ -492,6 +494,7 @@ out: spin_unlock(&sbi->iostat_lock); return count; } +#endif #ifdef CONFIG_F2FS_FS_COMPRESSION if (!strcmp(a->attr.name, "compr_written_block") || @@ -700,8 +703,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); +#ifdef CONFIG_F2FS_IOSTAT F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms); +#endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); @@ -807,8 +812,10 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), ATTR_LIST(umount_discard_timeout), +#ifdef CONFIG_F2FS_IOSTAT ATTR_LIST(iostat_enable), ATTR_LIST(iostat_period_ms), +#endif ATTR_LIST(readdir_ra), ATTR_LIST(max_io_bytes), ATTR_LIST(gc_pin_file_thresh), @@ -1076,101 +1083,6 @@ static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, return 0; } -void f2fs_record_iostat(struct f2fs_sb_info *sbi) -{ - unsigned long long iostat_diff[NR_IO_TYPE]; - int i; - - if (time_is_after_jiffies(sbi->iostat_next_period)) - return; - - /* Need double check under the lock */ - spin_lock(&sbi->iostat_lock); - if (time_is_after_jiffies(sbi->iostat_next_period)) { - spin_unlock(&sbi->iostat_lock); - return; - } - sbi->iostat_next_period = jiffies + - msecs_to_jiffies(sbi->iostat_period_ms); - - for (i = 0; i < NR_IO_TYPE; i++) { - iostat_diff[i] = sbi->rw_iostat[i] - - sbi->prev_rw_iostat[i]; - sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; - } - spin_unlock(&sbi->iostat_lock); - - trace_f2fs_iostat(sbi, iostat_diff); -} - -static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, - void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - time64_t now = ktime_get_real_seconds(); - - if (!sbi->iostat_enable) - return 0; - - seq_printf(seq, "time: %-16llu\n", now); - - /* print app write IOs */ - seq_puts(seq, "[WRITE]\n"); - seq_printf(seq, "app buffered: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_IO]); - seq_printf(seq, "app direct: %-16llu\n", - sbi->rw_iostat[APP_DIRECT_IO]); - seq_printf(seq, "app mapped: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_IO]); - - /* print fs write IOs */ - seq_printf(seq, "fs data: %-16llu\n", - sbi->rw_iostat[FS_DATA_IO]); - seq_printf(seq, "fs node: %-16llu\n", - sbi->rw_iostat[FS_NODE_IO]); - seq_printf(seq, "fs meta: %-16llu\n", - sbi->rw_iostat[FS_META_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", - sbi->rw_iostat[FS_GC_DATA_IO]); - seq_printf(seq, "fs gc node: %-16llu\n", - sbi->rw_iostat[FS_GC_NODE_IO]); - seq_printf(seq, "fs cp data: %-16llu\n", - sbi->rw_iostat[FS_CP_DATA_IO]); - seq_printf(seq, "fs cp node: %-16llu\n", - sbi->rw_iostat[FS_CP_NODE_IO]); - seq_printf(seq, "fs cp meta: %-16llu\n", - sbi->rw_iostat[FS_CP_META_IO]); - - /* print app read IOs */ - seq_puts(seq, "[READ]\n"); - seq_printf(seq, "app buffered: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_READ_IO]); - seq_printf(seq, "app direct: %-16llu\n", - sbi->rw_iostat[APP_DIRECT_READ_IO]); - seq_printf(seq, "app mapped: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_READ_IO]); - - /* print fs read IOs */ - seq_printf(seq, "fs data: %-16llu\n", - sbi->rw_iostat[FS_DATA_READ_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", - sbi->rw_iostat[FS_GDATA_READ_IO]); - seq_printf(seq, "fs compr_data: %-16llu\n", - sbi->rw_iostat[FS_CDATA_READ_IO]); - seq_printf(seq, "fs node: %-16llu\n", - sbi->rw_iostat[FS_NODE_READ_IO]); - seq_printf(seq, "fs meta: %-16llu\n", - sbi->rw_iostat[FS_META_READ_IO]); - - /* print other IOs */ - seq_puts(seq, "[OTHER]\n"); - seq_printf(seq, "fs discard: %-16llu\n", - sbi->rw_iostat[FS_DISCARD]); - - return 0; -} - static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, void *offset) { @@ -1257,8 +1169,10 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) segment_info_seq_show, sb); proc_create_single_data("segment_bits", 0444, sbi->s_proc, segment_bits_seq_show, sb); +#ifdef CONFIG_F2FS_IOSTAT proc_create_single_data("iostat_info", 0444, sbi->s_proc, iostat_info_seq_show, sb); +#endif proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); } @@ -1278,7 +1192,9 @@ put_sb_kobj: void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { if (sbi->s_proc) { +#ifdef CONFIG_F2FS_IOSTAT remove_proc_entry("iostat_info", sbi->s_proc); +#endif remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 56b113e3cd6a..3eaf19aa89af 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1818,6 +1818,7 @@ DEFINE_EVENT(f2fs_zip_end, f2fs_decompress_pages_end, TP_ARGS(inode, cluster_idx, compressed_size, ret) ); +#ifdef CONFIG_F2FS_IOSTAT TRACE_EVENT(f2fs_iostat, TP_PROTO(struct f2fs_sb_info *sbi, unsigned long long *iostat), @@ -1893,6 +1894,7 @@ TRACE_EVENT(f2fs_iostat, __entry->app_mrio, __entry->fs_drio, __entry->fs_gdrio, __entry->fs_cdrio, __entry->fs_nrio, __entry->fs_mrio) ); +#endif TRACE_EVENT(f2fs_bmap, -- cgit v1.2.3-59-g8ed1b From a4b6817625e71d5d4aee16cacf7a7fec077c6dbe Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 20 Aug 2021 15:29:09 -0700 Subject: f2fs: introduce periodic iostat io latency traces Whenever we notice some sluggish issues on our machines, we are always curious about how well all types of I/O in the f2fs filesystem are handled. But, it's hard to get this kind of real data. First of all, we need to reproduce the issue while turning on the profiling tool like blktrace, but the issue doesn't happen again easily. Second, with the intervention of any tools, the overall timing of the issue will be slightly changed and it sometimes makes us hard to figure it out. So, I added the feature printing out IO latency statistics tracepoint events, which are minimal things to understand filesystem's I/O related behaviors, into F2FS_IOSTAT kernel config. With "iostat_enable" sysfs node on, we can get this statistics info in a periodic way and it would cause the least overhead. [samples] f2fs_ckpt-254:1-507 [003] .... 2842.439683: f2fs_iostat_latency: dev = (254,11), iotype [peak lat.(ms)/avg lat.(ms)/count], rd_data [136/1/801], rd_node [136/1/1704], rd_meta [4/2/4], wr_sync_data [164/16/3331], wr_sync_node [152/3/648], wr_sync_meta [160/2/4243], wr_async_data [24/13/15], wr_async_node [0/0/0], wr_async_meta [0/0/0] f2fs_ckpt-254:1-507 [002] .... 2845.450514: f2fs_iostat_latency: dev = (254,11), iotype [peak lat.(ms)/avg lat.(ms)/count], rd_data [60/3/456], rd_node [60/3/1258], rd_meta [0/0/1], wr_sync_data [120/12/2285], wr_sync_node [88/5/428], wr_sync_meta [52/6/2990], wr_async_data [4/1/3], wr_async_node [0/0/0], wr_async_meta [0/0/0] Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 19 +++++-- fs/f2fs/f2fs.h | 4 ++ fs/f2fs/iostat.c | 133 ++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/iostat.h | 57 +++++++++++++++++++ fs/f2fs/super.c | 13 ++++- include/trace/events/f2fs.h | 95 +++++++++++++++++++++++++++++++ 6 files changed, 315 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fd16c4fc4507..5e4120b92f59 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -271,7 +271,10 @@ static void f2fs_post_read_work(struct work_struct *work) static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); - struct bio_post_read_ctx *ctx = bio->bi_private; + struct bio_post_read_ctx *ctx; + + iostat_update_and_unbind_ctx(bio, 0); + ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) { f2fs_show_injection_info(sbi, FAULT_READ_IO); @@ -293,10 +296,13 @@ static void f2fs_read_end_io(struct bio *bio) static void f2fs_write_end_io(struct bio *bio) { - struct f2fs_sb_info *sbi = bio->bi_private; + struct f2fs_sb_info *sbi; struct bio_vec *bvec; struct bvec_iter_all iter_all; + iostat_update_and_unbind_ctx(bio, 1); + sbi = bio->bi_private; + if (time_to_inject(sbi, FAULT_WRITE_IO)) { f2fs_show_injection_info(sbi, FAULT_WRITE_IO); bio->bi_status = BLK_STS_IOERR; @@ -400,6 +406,8 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, fio->type, fio->temp); } + iostat_alloc_and_bind_ctx(sbi, bio, NULL); + if (fio->io_wbc) wbc_init_bio(fio->io_wbc, bio); @@ -481,6 +489,8 @@ submit_io: trace_f2fs_submit_read_bio(sbi->sb, type, bio); else trace_f2fs_submit_write_bio(sbi->sb, type, bio); + + iostat_update_submit_ctx(bio, type); submit_bio(bio); } @@ -972,7 +982,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; - struct bio_post_read_ctx *ctx; + struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, @@ -1008,6 +1018,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->fs_blkaddr = blkaddr; bio->bi_private = ctx; } + iostat_alloc_and_bind_ctx(sbi, bio, ctx); return bio; } @@ -2253,7 +2264,7 @@ submit_and_realloc: if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; - ctx = bio->bi_private; + ctx = get_post_read_ctx(bio); ctx->enabled_steps |= STEP_DECOMPRESS; refcount_inc(&dic->refcnt); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 12ecf6ee9cb5..26d084a1fea8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1781,6 +1781,10 @@ struct f2fs_sb_info { bool iostat_enable; unsigned long iostat_next_period; unsigned int iostat_period_ms; + + /* For io latency related statistics info in one iostat period */ + spinlock_t iostat_lat_lock; + struct iostat_lat_info *iostat_io_lat; #endif }; diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index 21c29e121a86..cdcf54ae0db8 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -14,6 +14,10 @@ #include "iostat.h" #include +#define NUM_PREALLOC_IOSTAT_CTXS 128 +static struct kmem_cache *bio_iostat_ctx_cache; +static mempool_t *bio_iostat_ctx_pool; + int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; @@ -81,6 +85,32 @@ int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) return 0; } +static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) +{ + int io, idx = 0; + unsigned int cnt; + struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + + spin_lock_irq(&sbi->iostat_lat_lock); + for (idx = 0; idx < MAX_IO_TYPE; idx++) { + for (io = 0; io < NR_PAGE_TYPE; io++) { + cnt = io_lat->bio_cnt[idx][io]; + iostat_lat[idx][io].peak_lat = + jiffies_to_msecs(io_lat->peak_lat[idx][io]); + iostat_lat[idx][io].cnt = cnt; + iostat_lat[idx][io].avg_lat = cnt ? + jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0; + io_lat->sum_lat[idx][io] = 0; + io_lat->peak_lat[idx][io] = 0; + io_lat->bio_cnt[idx][io] = 0; + } + } + spin_unlock_irq(&sbi->iostat_lat_lock); + + trace_f2fs_iostat_latency(sbi, iostat_lat); +} + static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) { unsigned long long iostat_diff[NR_IO_TYPE]; @@ -106,10 +136,13 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) spin_unlock(&sbi->iostat_lock); trace_f2fs_iostat(sbi, iostat_diff); + + __record_iostat_latency(sbi); } void f2fs_reset_iostat(struct f2fs_sb_info *sbi) { + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int i; spin_lock(&sbi->iostat_lock); @@ -118,6 +151,10 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) sbi->prev_rw_iostat[i] = 0; } spin_unlock(&sbi->iostat_lock); + + spin_lock_irq(&sbi->iostat_lat_lock); + memset(io_lat, 0, sizeof(struct iostat_lat_info)); + spin_unlock_irq(&sbi->iostat_lat_lock); } void f2fs_update_iostat(struct f2fs_sb_info *sbi, @@ -143,12 +180,108 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, f2fs_record_iostat(sbi); } +static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, + int rw, bool is_sync) +{ + unsigned long ts_diff; + unsigned int iotype = iostat_ctx->type; + unsigned long flags; + struct f2fs_sb_info *sbi = iostat_ctx->sbi; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + int idx; + + if (!sbi->iostat_enable) + return; + + ts_diff = jiffies - iostat_ctx->submit_ts; + if (iotype >= META_FLUSH) + iotype = META; + + if (rw == 0) { + idx = READ_IO; + } else { + if (is_sync) + idx = WRITE_SYNC_IO; + else + idx = WRITE_ASYNC_IO; + } + + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + io_lat->sum_lat[idx][iotype] += ts_diff; + io_lat->bio_cnt[idx][iotype]++; + if (ts_diff > io_lat->peak_lat[idx][iotype]) + io_lat->peak_lat[idx][iotype] = ts_diff; + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); +} + +void iostat_update_and_unbind_ctx(struct bio *bio, int rw) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + bool is_sync = bio->bi_opf & REQ_SYNC; + + if (rw == 0) + bio->bi_private = iostat_ctx->post_read_ctx; + else + bio->bi_private = iostat_ctx->sbi; + __update_iostat_latency(iostat_ctx, rw, is_sync); + mempool_free(iostat_ctx, bio_iostat_ctx_pool); +} + +void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) +{ + struct bio_iostat_ctx *iostat_ctx; + /* Due to the mempool, this never fails. */ + iostat_ctx = mempool_alloc(bio_iostat_ctx_pool, GFP_NOFS); + iostat_ctx->sbi = sbi; + iostat_ctx->submit_ts = 0; + iostat_ctx->type = 0; + iostat_ctx->post_read_ctx = ctx; + bio->bi_private = iostat_ctx; +} + +int __init f2fs_init_iostat_processing(void) +{ + bio_iostat_ctx_cache = + kmem_cache_create("f2fs_bio_iostat_ctx", + sizeof(struct bio_iostat_ctx), 0, 0, NULL); + if (!bio_iostat_ctx_cache) + goto fail; + bio_iostat_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_IOSTAT_CTXS, + bio_iostat_ctx_cache); + if (!bio_iostat_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_iostat_ctx_cache); +fail: + return -ENOMEM; +} + +void f2fs_destroy_iostat_processing(void) +{ + mempool_destroy(bio_iostat_ctx_pool); + kmem_cache_destroy(bio_iostat_ctx_cache); +} + int f2fs_init_iostat(struct f2fs_sb_info *sbi) { /* init iostat info */ spin_lock_init(&sbi->iostat_lock); + spin_lock_init(&sbi->iostat_lat_lock); sbi->iostat_enable = false; sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; + sbi->iostat_io_lat = f2fs_kzalloc(sbi, sizeof(struct iostat_lat_info), + GFP_KERNEL); + if (!sbi->iostat_io_lat) + return -ENOMEM; return 0; } + +void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) +{ + kfree(sbi->iostat_io_lat); +} diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h index 46e4a36fc8e9..22a2d01f57ef 100644 --- a/fs/f2fs/iostat.h +++ b/fs/f2fs/iostat.h @@ -6,6 +6,8 @@ #ifndef __F2FS_IOSTAT_H__ #define __F2FS_IOSTAT_H__ +struct bio_post_read_ctx; + #ifdef CONFIG_F2FS_IOSTAT #define DEFAULT_IOSTAT_PERIOD_MS 3000 @@ -13,15 +15,70 @@ /* maximum period of iostat tracing is 1 day */ #define MAX_IOSTAT_PERIOD_MS 8640000 +enum { + READ_IO, + WRITE_SYNC_IO, + WRITE_ASYNC_IO, + MAX_IO_TYPE, +}; + +struct iostat_lat_info { + unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */ + unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */ + unsigned int bio_cnt[MAX_IO_TYPE][NR_PAGE_TYPE]; /* bio count */ +}; + extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset); extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, enum iostat_type type, unsigned long long io_bytes); + +struct bio_iostat_ctx { + struct f2fs_sb_info *sbi; + unsigned long submit_ts; + enum page_type type; + struct bio_post_read_ctx *post_read_ctx; +}; + +static inline void iostat_update_submit_ctx(struct bio *bio, + enum page_type type) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + iostat_ctx->submit_ts = jiffies; + iostat_ctx->type = type; +} + +static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + return iostat_ctx->post_read_ctx; +} + +extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw); +extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx); +extern int f2fs_init_iostat_processing(void); +extern void f2fs_destroy_iostat_processing(void); extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); #else static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, enum iostat_type type, unsigned long long io_bytes) {} +static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} +static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) {} +static inline void iostat_update_submit_ctx(struct bio *bio, + enum page_type type) {} +static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + return bio->bi_private; +} +static inline int f2fs_init_iostat_processing(void) { return 0; } +static inline void f2fs_destroy_iostat_processing(void) {} static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) {} #endif #endif /* __F2FS_IOSTAT_H__ */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a23926d1a77b..f5148f2fd884 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1574,6 +1574,7 @@ static void f2fs_put_super(struct super_block *sb) #endif fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); destroy_percpu_info(sbi); + f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); #ifdef CONFIG_UNICODE @@ -4001,7 +4002,7 @@ try_onemore: err = init_percpu_info(sbi); if (err) - goto free_bio_info; + goto free_iostat; if (F2FS_IO_ALIGNED(sbi)) { sbi->write_io_dummy = @@ -4334,6 +4335,8 @@ free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_percpu: destroy_percpu_info(sbi); +free_iostat: + f2fs_destroy_iostat(sbi); free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); @@ -4476,9 +4479,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_post_read_processing(); if (err) goto free_root_stats; - err = f2fs_init_bio_entry_cache(); + err = f2fs_init_iostat_processing(); if (err) goto free_post_read; + err = f2fs_init_bio_entry_cache(); + if (err) + goto free_iostat; err = f2fs_init_bioset(); if (err) goto free_bio_enrty_cache; @@ -4500,6 +4506,8 @@ free_bioset: f2fs_destroy_bioset(); free_bio_enrty_cache: f2fs_destroy_bio_entry_cache(); +free_iostat: + f2fs_destroy_iostat_processing(); free_post_read: f2fs_destroy_post_read_processing(); free_root_stats: @@ -4534,6 +4542,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); + f2fs_destroy_iostat_processing(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 3eaf19aa89af..4e881d91c874 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1894,6 +1894,101 @@ TRACE_EVENT(f2fs_iostat, __entry->app_mrio, __entry->fs_drio, __entry->fs_gdrio, __entry->fs_cdrio, __entry->fs_nrio, __entry->fs_mrio) ); + +#ifndef __F2FS_IOSTAT_LATENCY_TYPE +#define __F2FS_IOSTAT_LATENCY_TYPE +struct f2fs_iostat_latency { + unsigned int peak_lat; + unsigned int avg_lat; + unsigned int cnt; +}; +#endif /* __F2FS_IOSTAT_LATENCY_TYPE */ + +TRACE_EVENT(f2fs_iostat_latency, + + TP_PROTO(struct f2fs_sb_info *sbi, struct f2fs_iostat_latency (*iostat_lat)[NR_PAGE_TYPE]), + + TP_ARGS(sbi, iostat_lat), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, d_rd_peak) + __field(unsigned int, d_rd_avg) + __field(unsigned int, d_rd_cnt) + __field(unsigned int, n_rd_peak) + __field(unsigned int, n_rd_avg) + __field(unsigned int, n_rd_cnt) + __field(unsigned int, m_rd_peak) + __field(unsigned int, m_rd_avg) + __field(unsigned int, m_rd_cnt) + __field(unsigned int, d_wr_s_peak) + __field(unsigned int, d_wr_s_avg) + __field(unsigned int, d_wr_s_cnt) + __field(unsigned int, n_wr_s_peak) + __field(unsigned int, n_wr_s_avg) + __field(unsigned int, n_wr_s_cnt) + __field(unsigned int, m_wr_s_peak) + __field(unsigned int, m_wr_s_avg) + __field(unsigned int, m_wr_s_cnt) + __field(unsigned int, d_wr_as_peak) + __field(unsigned int, d_wr_as_avg) + __field(unsigned int, d_wr_as_cnt) + __field(unsigned int, n_wr_as_peak) + __field(unsigned int, n_wr_as_avg) + __field(unsigned int, n_wr_as_cnt) + __field(unsigned int, m_wr_as_peak) + __field(unsigned int, m_wr_as_avg) + __field(unsigned int, m_wr_as_cnt) + ), + + TP_fast_assign( + __entry->dev = sbi->sb->s_dev; + __entry->d_rd_peak = iostat_lat[0][DATA].peak_lat; + __entry->d_rd_avg = iostat_lat[0][DATA].avg_lat; + __entry->d_rd_cnt = iostat_lat[0][DATA].cnt; + __entry->n_rd_peak = iostat_lat[0][NODE].peak_lat; + __entry->n_rd_avg = iostat_lat[0][NODE].avg_lat; + __entry->n_rd_cnt = iostat_lat[0][NODE].cnt; + __entry->m_rd_peak = iostat_lat[0][META].peak_lat; + __entry->m_rd_avg = iostat_lat[0][META].avg_lat; + __entry->m_rd_cnt = iostat_lat[0][META].cnt; + __entry->d_wr_s_peak = iostat_lat[1][DATA].peak_lat; + __entry->d_wr_s_avg = iostat_lat[1][DATA].avg_lat; + __entry->d_wr_s_cnt = iostat_lat[1][DATA].cnt; + __entry->n_wr_s_peak = iostat_lat[1][NODE].peak_lat; + __entry->n_wr_s_avg = iostat_lat[1][NODE].avg_lat; + __entry->n_wr_s_cnt = iostat_lat[1][NODE].cnt; + __entry->m_wr_s_peak = iostat_lat[1][META].peak_lat; + __entry->m_wr_s_avg = iostat_lat[1][META].avg_lat; + __entry->m_wr_s_cnt = iostat_lat[1][META].cnt; + __entry->d_wr_as_peak = iostat_lat[2][DATA].peak_lat; + __entry->d_wr_as_avg = iostat_lat[2][DATA].avg_lat; + __entry->d_wr_as_cnt = iostat_lat[2][DATA].cnt; + __entry->n_wr_as_peak = iostat_lat[2][NODE].peak_lat; + __entry->n_wr_as_avg = iostat_lat[2][NODE].avg_lat; + __entry->n_wr_as_cnt = iostat_lat[2][NODE].cnt; + __entry->m_wr_as_peak = iostat_lat[2][META].peak_lat; + __entry->m_wr_as_avg = iostat_lat[2][META].avg_lat; + __entry->m_wr_as_cnt = iostat_lat[2][META].cnt; + ), + + TP_printk("dev = (%d,%d), " + "iotype [peak lat.(ms)/avg lat.(ms)/count], " + "rd_data [%u/%u/%u], rd_node [%u/%u/%u], rd_meta [%u/%u/%u], " + "wr_sync_data [%u/%u/%u], wr_sync_node [%u/%u/%u], " + "wr_sync_meta [%u/%u/%u], wr_async_data [%u/%u/%u], " + "wr_async_node [%u/%u/%u], wr_async_meta [%u/%u/%u]", + show_dev(__entry->dev), + __entry->d_rd_peak, __entry->d_rd_avg, __entry->d_rd_cnt, + __entry->n_rd_peak, __entry->n_rd_avg, __entry->n_rd_cnt, + __entry->m_rd_peak, __entry->m_rd_avg, __entry->m_rd_cnt, + __entry->d_wr_s_peak, __entry->d_wr_s_avg, __entry->d_wr_s_cnt, + __entry->n_wr_s_peak, __entry->n_wr_s_avg, __entry->n_wr_s_cnt, + __entry->m_wr_s_peak, __entry->m_wr_s_avg, __entry->m_wr_s_cnt, + __entry->d_wr_as_peak, __entry->d_wr_as_avg, __entry->d_wr_as_cnt, + __entry->n_wr_as_peak, __entry->n_wr_as_avg, __entry->n_wr_as_cnt, + __entry->m_wr_as_peak, __entry->m_wr_as_avg, __entry->m_wr_as_cnt) +); #endif TRACE_EVENT(f2fs_bmap, -- cgit v1.2.3-59-g8ed1b From 94c821fb286b545d37549ff30a0c341e066f0d6c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 20 Aug 2021 18:54:59 +0800 Subject: f2fs: rebuild nat_bits during umount If all free_nat_bitmap are available, we can rebuild nat_bits from free_nat_bitmap entirely during umount, let's make another chance to reenable nat_bits for image. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 21 ++++++++--- fs/f2fs/f2fs.h | 32 +--------------- fs/f2fs/node.c | 101 +++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 95 insertions(+), 59 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3962cfeb4a57..83e9bc0f91ff 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1303,12 +1303,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long flags; - spin_lock_irqsave(&sbi->cp_lock, flags); + if (cpc->reason & CP_UMOUNT) { + if (le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) { + clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Disable nat_bits due to no space"); + } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && + f2fs_nat_bitmap_enabled(sbi)) { + f2fs_enable_nat_bits(sbi); + set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Rebuild and enable nat_bits"); + } + } - if ((cpc->reason & CP_UMOUNT) && - le32_to_cpu(ckpt->cp_pack_total_block_count) > - sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) - disable_nat_bits(sbi, false); + spin_lock_irqsave(&sbi->cp_lock, flags); if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); @@ -1494,7 +1502,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); /* write nat bits */ - if (enabled_nat_bits(sbi, cpc)) { + if ((cpc->reason & CP_UMOUNT) && + is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) { __u64 cp_ver = cur_cp_version(ckpt); block_t blk; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 26d084a1fea8..30134307a404 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2073,36 +2073,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock_irqrestore(&sbi->cp_lock, flags); } -static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) -{ - unsigned long flags; - unsigned char *nat_bits; - - /* - * In order to re-enable nat_bits we need to call fsck.f2fs by - * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, - * so let's rely on regular fsck or unclean shutdown. - */ - - if (lock) - spin_lock_irqsave(&sbi->cp_lock, flags); - __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); - nat_bits = NM_I(sbi)->nat_bits; - NM_I(sbi)->nat_bits = NULL; - if (lock) - spin_unlock_irqrestore(&sbi->cp_lock, flags); - - kvfree(nat_bits); -} - -static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, - struct cp_control *cpc) -{ - bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - - return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -3429,6 +3399,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); +bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); @@ -3453,6 +3424,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); +void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 043cb831b289..e863136081b4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2213,6 +2213,24 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } +bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i; + bool ret = true; + + down_read(&nm_i->nat_tree_lock); + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) { + ret = false; + break; + } + } + up_read(&nm_i->nat_tree_lock); + + return ret; +} + static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set, bool build) { @@ -2884,7 +2902,23 @@ add_out: list_add_tail(&nes->set_list, head); } -static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs, + unsigned int valid) +{ + if (valid == 0) { + __set_bit_le(nat_ofs, nm_i->empty_nat_bits); + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_ofs, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_ofs, nm_i->full_nat_bits); + else + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); +} + +static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -2893,7 +2927,7 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, int valid = 0; int i = 0; - if (!enabled_nat_bits(sbi, NULL)) + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) return; if (nat_index == 0) { @@ -2904,17 +2938,36 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) valid++; } - if (valid == 0) { - __set_bit_le(nat_index, nm_i->empty_nat_bits); - __clear_bit_le(nat_index, nm_i->full_nat_bits); - return; + + __update_nat_bits(nm_i, nat_index, valid); +} + +void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs; + + down_read(&nm_i->nat_tree_lock); + + for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { + unsigned int valid = 0, nid_ofs = 0; + + /* handle nid zero due to it should never be used */ + if (unlikely(nat_ofs == 0)) { + valid = 1; + nid_ofs = 1; + } + + for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) { + if (!test_bit_le(nid_ofs, + nm_i->free_nid_bitmap[nat_ofs])) + valid++; + } + + __update_nat_bits(nm_i, nat_ofs, valid); } - __clear_bit_le(nat_index, nm_i->empty_nat_bits); - if (valid == NAT_ENTRY_PER_BLOCK) - __set_bit_le(nat_index, nm_i->full_nat_bits); - else - __clear_bit_le(nat_index, nm_i->full_nat_bits); + up_read(&nm_i->nat_tree_lock); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -2933,7 +2986,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (enabled_nat_bits(sbi, cpc) || + if ((cpc->reason & CP_UMOUNT) || !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; @@ -2980,7 +3033,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - __update_nat_bits(sbi, start_nid, page); + update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); } @@ -3011,7 +3064,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * during unmount, let's flush nat_bits before checking * nat_cnt[DIRTY_NAT]. */ - if (enabled_nat_bits(sbi, cpc)) { + if (cpc->reason & CP_UMOUNT) { down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); up_write(&nm_i->nat_tree_lock); @@ -3027,7 +3080,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and merge them * into nat entry set. */ - if (enabled_nat_bits(sbi, cpc) || + if (cpc->reason & CP_UMOUNT || !__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); @@ -3064,15 +3117,18 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) __u64 cp_ver = cur_cp_version(ckpt); block_t nat_bits_addr; - if (!enabled_nat_bits(sbi, NULL)) - return 0; - nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kvzalloc(sbi, nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + return 0; + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { @@ -3089,13 +3145,12 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) cp_ver |= (cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { - disable_nat_bits(sbi, true); + clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)", + cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits)); return 0; } - nm_i->full_nat_bits = nm_i->nat_bits + 8; - nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; - f2fs_notice(sbi, "Found nat_bits in checkpoint"); return 0; } @@ -3106,7 +3161,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) unsigned int i = 0; nid_t nid, last_nid; - if (!enabled_nat_bits(sbi, NULL)) + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) return; for (i = 0; i < nm_i->nat_blocks; i++) { -- cgit v1.2.3-59-g8ed1b From 4d67490498acb4ffcef5ba7bc44990d46e66a44c Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Thu, 19 Aug 2021 16:02:37 +0800 Subject: f2fs: Don't create discard thread when device doesn't support realtime discard Don't create discard thread when device doesn't support realtime discard or user specifies nodiscard mount option. Signed-off-by: Fengnan Chang Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 25 +++++++++++++++++++------ fs/f2fs/super.c | 27 ++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 30134307a404..c24f03e054cb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3448,6 +3448,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +int f2fs_start_discard_thread(struct f2fs_sb_info *sbi); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 73abec9988e9..a135d2247415 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2115,9 +2115,25 @@ wakeup: wake_up_discard_thread(sbi, false); } -static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int err = 0; + + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) + err = PTR_ERR(dcc->f2fs_issue_discard); + + return err; +} + +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ struct discard_cmd_control *dcc; int err = 0, i; @@ -2155,13 +2171,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: - dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, - "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); - if (IS_ERR(dcc->f2fs_issue_discard)) { - err = PTR_ERR(dcc->f2fs_issue_discard); + err = f2fs_start_discard_thread(sbi); + if (err) { kfree(dcc); SM_I(sbi)->dcc_info = NULL; - return err; } return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f5148f2fd884..d18271f3d190 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2105,12 +2105,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false, need_stop_gc = false; bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; + bool need_restart_discard = false, need_stop_discard = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); + bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); + struct discard_cmd_control *dcc; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2282,11 +2285,26 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_flush = true; } + if (no_discard == !!test_opt(sbi, DISCARD)) { + if (test_opt(sbi, DISCARD)) { + err = f2fs_start_discard_thread(sbi); + if (err) + goto restore_flush; + need_stop_discard = true; + } else { + dcc = SM_I(sbi)->dcc_info; + f2fs_stop_discard_thread(sbi); + if (atomic_read(&dcc->discard_cmd_cnt)) + f2fs_issue_discard_timeout(sbi); + need_restart_discard = true; + } + } + if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) - goto restore_flush; + goto restore_discard; } else { f2fs_enable_checkpoint(sbi); } @@ -2306,6 +2324,13 @@ skip: adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; +restore_discard: + if (need_restart_discard) { + if (f2fs_start_discard_thread(sbi)) + f2fs_warn(sbi, "discard has been stopped"); + } else if (need_stop_discard) { + f2fs_stop_discard_thread(sbi); + } restore_flush: if (need_restart_flush) { if (f2fs_create_flush_cmd_control(sbi)) -- cgit v1.2.3-59-g8ed1b From d75da8c8a4c5c761936e4a51403f5f21e3aba935 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Aug 2021 08:11:38 +0800 Subject: f2fs: adjust unlock order for cleanup This patch adjusts unlock order of .i_mmap_sem and .i_gc_rwsem for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ab4ea2ddcc8b..cc2080866c54 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3497,8 +3497,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) released_blocks += ret; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); out: inode_unlock(inode); @@ -3650,8 +3650,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) reserved_blocks += ret; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); -- cgit v1.2.3-59-g8ed1b From ad126ebddecbf696e0cf214ff56c7b170fa9f0f7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Aug 2021 08:12:08 +0800 Subject: f2fs: fix to account missing .skipped_gc_rwsem There is a missing place we forgot to account .skipped_gc_rwsem, fix it. Fixes: 6f8d4455060d ("f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2c18443972b6..77391e3b7d68 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1500,8 +1500,10 @@ next_step: int err; if (S_ISREG(inode->i_mode)) { - if (!down_write_trylock(&fi->i_gc_rwsem[READ])) + if (!down_write_trylock(&fi->i_gc_rwsem[READ])) { + sbi->skipped_gc_rwsem++; continue; + } if (!down_write_trylock( &fi->i_gc_rwsem[WRITE])) { sbi->skipped_gc_rwsem++; -- cgit v1.2.3-59-g8ed1b From adf9ea89c719c1d23794e363f631e376b3ff8cbc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Aug 2021 10:03:15 +0800 Subject: f2fs: fix unexpected ENOENT comes from f2fs_map_blocks() In below path, it will return ENOENT if filesystem is shutdown: - f2fs_map_blocks - f2fs_get_dnode_of_data - f2fs_get_node_page - __get_node_page - read_node_page - is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) return -ENOENT - force return value from ENOENT to 0 It should be fine for read case, since it indicates a hole condition, and caller could use .m_next_pgofs to skip the hole and continue the lookup. However it may cause confusing for write case, since leaving a hole there, and said nothing was wrong doesn't help. There is at least one case from dax_iomap_actor() will complain that, so fix this in prior to supporting dax in f2fs. xfstest generic/388 reports below warning: ubuntu godown: xfstests-induced forced shutdown of /mnt/scratch_f2fs: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 485833 at fs/dax.c:1127 dax_iomap_actor+0x339/0x370 Call Trace: iomap_apply+0x1c4/0x7b0 ? dax_iomap_rw+0x1c0/0x1c0 dax_iomap_rw+0xad/0x1c0 ? dax_iomap_rw+0x1c0/0x1c0 f2fs_file_write_iter+0x5ab/0x970 [f2fs] do_iter_readv_writev+0x273/0x2e0 do_iter_write+0xab/0x1f0 vfs_iter_write+0x21/0x40 iter_file_splice_write+0x287/0x540 do_splice+0x37c/0xa60 __x64_sys_splice+0x15f/0x3a0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae ubuntu godown: xfstests-induced forced shutdown of /mnt/scratch_f2fs: ------------[ cut here ]------------ RIP: 0010:dax_iomap_pte_fault.isra.0+0x72e/0x14a0 Call Trace: dax_iomap_fault+0x44/0x70 f2fs_dax_huge_fault+0x155/0x400 [f2fs] f2fs_dax_fault+0x18/0x30 [f2fs] __do_fault+0x4e/0x120 do_fault+0x3cf/0x7a0 __handle_mm_fault+0xa8c/0xf20 ? find_held_lock+0x39/0xd0 handle_mm_fault+0x1b6/0x480 do_user_addr_fault+0x320/0xcd0 ? rcu_read_lock_sched_held+0x67/0xc0 exc_page_fault+0x77/0x3f0 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 Fixes: 83a3bfdb5a8a ("f2fs: indicate shutdown f2fs to allow unmount successfully") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5e4120b92f59..8e8824605f83 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1504,7 +1504,21 @@ next_dnode: if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; + if (err == -ENOENT) { + /* + * There is one exceptional case that read_node_page() + * may return -ENOENT due to filesystem has been + * shutdown or cp_error, so force to convert error + * number to EIO for such case. + */ + if (map->m_may_create && + (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || + f2fs_cp_error(sbi))) { + err = -EIO; + goto unlock_out; + } + err = 0; if (map->m_next_pgofs) *map->m_next_pgofs = -- cgit v1.2.3-59-g8ed1b From c8dc3047c48540183744f959412d44b08c5435e1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Aug 2021 19:34:19 +0800 Subject: f2fs: fix to unmap pages from userspace process in punch_hole() We need to unmap pages from userspace process before removing pagecache in punch_hole() like we did in f2fs_setattr(). Similar change: commit 5e44f8c374dc ("ext4: hole-punch use truncate_pagecache_range") Fixes: fbfa2cc58d53 ("f2fs: add file operations") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cc2080866c54..7fca3ac75e9f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1107,7 +1107,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) } if (pg_start < pg_end) { - struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1119,8 +1118,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); - truncate_inode_pages_range(mapping, blk_start, - blk_end - 1); + truncate_pagecache_range(inode, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = f2fs_truncate_hole(inode, pg_start, pg_end); -- cgit v1.2.3-59-g8ed1b From dddd3d65293a52c2c3850c19b1e5115712e534d8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 Aug 2021 14:00:57 -0700 Subject: f2fs: guarantee to write dirty data when enabling checkpoint back We must flush all the dirty data when enabling checkpoint back. Let's guarantee that first by adding a retry logic on sync_inodes_sb(). In addition to that, this patch adds to flush data in fsync when checkpoint is disabled, which can mitigate the sync_inodes_sb() failures in advance. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++--- fs/f2fs/super.c | 11 ++++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7fca3ac75e9f..f30b841d4e5a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -263,8 +263,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, }; unsigned int seq_id = 0; - if (unlikely(f2fs_readonly(inode->i_sb) || - is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + if (unlikely(f2fs_readonly(inode->i_sb))) return 0; trace_f2fs_sync_file_enter(inode); @@ -278,7 +277,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, ret = file_write_and_wait_range(file, start, end); clear_inode_flag(inode, FI_NEED_IPU); - if (ret) { + if (ret || is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); return ret; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d18271f3d190..703fe8991838 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2083,8 +2083,17 @@ restore_flag: static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { + int retry = DEFAULT_RETRY_IO_COUNT; + /* we should flush all the data to keep data consistency */ - sync_inodes_sb(sbi->sb); + do { + sync_inodes_sb(sbi->sb); + cond_resched(); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); + + if (unlikely(retry < 0)) + f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); -- cgit v1.2.3-59-g8ed1b From f7db8dd6981e0d94e5e35b45c1d288c94357de52 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 30 Aug 2021 08:35:33 +0800 Subject: f2fs: enable realtime discard iff device supports discard Let's only enable realtime discard if and only if device supports discard functionality. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 703fe8991838..a29e6714c610 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -661,10 +661,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; break; case Opt_discard: + if (!f2fs_hw_support_discard(sbi)) { + f2fs_warn(sbi, "device does not support discard"); + break; + } set_opt(sbi, DISCARD); break; case Opt_nodiscard: - if (f2fs_sb_has_blkzoned(sbi)) { + if (f2fs_hw_should_discard(sbi)) { f2fs_warn(sbi, "discard is required for zoned block devices"); return -EINVAL; } @@ -2001,7 +2005,8 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - set_opt(sbi, DISCARD); + if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) + set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; -- cgit v1.2.3-59-g8ed1b From 827f02842e40ea2e00f401e8f4cb1bccf3b8cd86 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 30 Aug 2021 11:37:32 -0700 Subject: f2fs: deallocate compressed pages when error happens In f2fs_write_multi_pages(), f2fs_compress_pages() allocates pages for compression work in cc->cpages[]. Then, f2fs_write_compressed_pages() initiates bio submission. But, if there's any error before submitting the IOs like early f2fs_cp_error(), previously it didn't free cpages by f2fs_compress_free_page(). Let's fix memory leak by putting that just before deallocating cc->cpages. Fixes: 4c8ff7095bef ("f2fs: support data compression") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index ec70a0a32327..c1bf9ad4c220 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1394,12 +1394,6 @@ out_destroy_crypt: for (--i; i >= 0; i--) fscrypt_finalize_bounce_page(&cc->cpages[i]); - for (i = 0; i < cc->nr_cpages; i++) { - if (!cc->cpages[i]) - continue; - f2fs_compress_free_page(cc->cpages[i]); - cc->cpages[i] = NULL; - } out_put_cic: kmem_cache_free(cic_entry_slab, cic); out_put_dnode: @@ -1410,6 +1404,12 @@ out_unlock_op: else f2fs_unlock_op(sbi); out_free: + for (i = 0; i < cc->nr_cpages; i++) { + if (!cc->cpages[i]) + continue; + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; + } page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; return -EAGAIN; -- cgit v1.2.3-59-g8ed1b From 9605f75cf36e0bcc0f4ada07b5be712d30107607 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 30 Aug 2021 13:30:45 -0700 Subject: f2fs: should put a page beyond EOF when preparing a write The prepare_compress_overwrite() gets/locks a page to prepare a read, and calls f2fs_read_multi_pages() which checks EOF first. If there's any page beyond EOF, we unlock the page and set cc->rpages[i] = NULL, which we can't put the page anymore. This makes page leak, so let's fix by putting that page. Fixes: a949dc5f2c5c ("f2fs: compress: fix race condition of overwrite vs truncate") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8e8824605f83..41d29382eced 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2183,6 +2183,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, continue; } unlock_page(page); + if (for_write) + put_page(page); cc->rpages[i] = NULL; cc->nr_rpages--; } -- cgit v1.2.3-59-g8ed1b