From 7f2ecdd837ae0a27149a0387b2534e11d955c5f8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 28 Jun 2018 19:34:40 -0700 Subject: f2fs: flush journal nat entries for nat_bits during unmount Let's flush journal nat entries for speed up in the next run. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 10643b11bd59..b0267d3823b4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2582,6 +2582,13 @@ void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) nid_t set_idx = 0; LIST_HEAD(sets); + /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ + if (enabled_nat_bits(sbi, cpc)) { + down_write(&nm_i->nat_tree_lock); + remove_nats_in_journal(sbi); + up_write(&nm_i->nat_tree_lock); + } + if (!nm_i->dirty_nat_cnt) return; -- cgit v1.2.3-59-g8ed1b From 8a56dd9685d6531d09b370ab22a61b9687131875 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 29 Jun 2018 18:55:12 -0700 Subject: f2fs: allow wrong configured dio to buffered write This fixes to support dio having unaligned buffers as buffered writes. 
xfs_io -f -d -c "pwrite 0 512" $testfile -> okay xfs_io -f -d -c "pwrite 1 512" $testfile -> EINVAL Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8f931d699287..5e53d210e222 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2371,14 +2371,20 @@ unlock_out: static int check_direct_IO(struct inode *inode, struct iov_iter *iter, loff_t offset) { - unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; - - if (offset & blocksize_mask) - return -EINVAL; - - if (iov_iter_alignment(iter) & blocksize_mask) - return -EINVAL; - + unsigned i_blkbits = READ_ONCE(inode->i_blkbits); + unsigned blkbits = i_blkbits; + unsigned blocksize_mask = (1 << blkbits) - 1; + unsigned long align = offset | iov_iter_alignment(iter); + struct block_device *bdev = inode->i_sb->s_bdev; + + if (align & blocksize_mask) { + if (bdev) + blkbits = blksize_bits(bdev_logical_block_size(bdev)); + blocksize_mask = (1 << blkbits) - 1; + if (align & blocksize_mask) + return -EINVAL; + return 1; + } return 0; } @@ -2396,7 +2402,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) err = check_direct_IO(inode, iter, offset); if (err) - return err; + return err < 0 ? err : 0; if (f2fs_force_buffered_io(inode, rw)) return 0; -- cgit v1.2.3-59-g8ed1b From 1cb50f87e10696e8cc61fb62d0d948e11b0e6dc1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 Jul 2018 16:47:34 -0700 Subject: f2fs: do checkpoint in kill_sb When unmounting f2fs in force mode, we can get it stuck by io_schedule() by some pending IOs in meta_inode. 
io_schedule+0xd/0x30 wait_on_page_bit_common+0xc6/0x130 __filemap_fdatawait_range+0xbd/0x100 filemap_fdatawait_keep_errors+0x15/0x40 sync_inodes_sb+0x1cf/0x240 sync_filesystem+0x52/0x90 generic_shutdown_super+0x1d/0x110 kill_f2fs_super+0x28/0x80 [f2fs] deactivate_locked_super+0x35/0x60 cleanup_mnt+0x36/0x70 task_work_run+0x79/0xa0 exit_to_usermode_loop+0x62/0x70 do_syscall_64+0xdb/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 0xffffffffffffffff Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3995e926ba3a..1dc6809fac38 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3089,9 +3089,19 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { if (sb->s_root) { - set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); - f2fs_stop_gc_thread(F2FS_SB(sb)); - f2fs_stop_discard_thread(F2FS_SB(sb)); + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); + + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + f2fs_write_checkpoint(sbi, &cpc); + } } kill_block_super(sb); } -- cgit v1.2.3-59-g8ed1b From af697c0f5c5b8798832e651baf23460d588393de Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jul 2018 18:30:42 -0700 Subject: f2fs: keep meta pages in cp_error state It turns out losing meta pages in shutdown period makes f2fs very unstable so that I could see many unexpected error conditions. Let's keep meta pages for fault injection and sudden power-off tests. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9f1c96caebda..fe92d2372f4a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -242,11 +242,8 @@ static int __f2fs_write_meta_page(struct page *page, trace_f2fs_writepage(page, META); - if (unlikely(f2fs_cp_error(sbi))) { - dec_page_count(sbi, F2FS_DIRTY_META); - unlock_page(page); - return 0; - } + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) @@ -1129,6 +1126,9 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; + if (unlikely(f2fs_cp_error(sbi))) + break; + io_schedule_timeout(5*HZ); } finish_wait(&sbi->cp_wait, &wait); @@ -1202,8 +1202,12 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, /* writeout cp pack 2 page */ err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO); - f2fs_bug_on(sbi, err); + if (unlikely(err && f2fs_cp_error(sbi))) { + f2fs_put_page(page, 1); + return; + } + f2fs_bug_on(sbi, err); f2fs_put_page(page, 0); /* submit checkpoint (with barrier if NOBARRIER is not set) */ @@ -1229,7 +1233,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) while (get_pages(sbi, F2FS_DIRTY_META)) { f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + break; } /* @@ -1309,7 +1313,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + break; } } @@ -1350,9 +1354,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ 
wait_on_all_pages_writeback(sbi); - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; - /* flush all device cache */ err = f2fs_flush_device_cache(sbi); if (err) @@ -1364,9 +1365,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_release_ino_entry(sbi, false); - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; - clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); __set_cp_next_pack(sbi); @@ -1381,7 +1379,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); - return 0; + return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0; } /* -- cgit v1.2.3-59-g8ed1b From 83a3bfdb5a8a086290dff2c13409c7380b683a96 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 21 Jun 2018 13:46:23 -0700 Subject: f2fs: indicate shutdown f2fs to allow unmount successfully Once we shut down f2fs, we have to flush stale pages in order to unmount the system. In order to make it stable, we need to stop fault injection as well.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 + fs/f2fs/f2fs.h | 7 +++++++ fs/f2fs/file.c | 4 ++++ fs/f2fs/inode.c | 3 +++ fs/f2fs/node.c | 3 ++- fs/f2fs/super.c | 5 +---- 6 files changed, 18 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fe92d2372f4a..1a3ec978f1a6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -28,6 +28,7 @@ struct kmem_cache *f2fs_inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { + f2fs_build_fault_attr(sbi, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); if (!end_io) f2fs_flush_merged_writes(sbi); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d8b1de83143..fe80eb637075 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1066,6 +1066,7 @@ enum { SBI_POR_DOING, /* recovery is doing or not */ SBI_NEED_SB_WRITE, /* need to recover superblock */ SBI_NEED_CP, /* need to checkpoint */ + SBI_IS_SHUTDOWN, /* shutdown by ioctl */ }; enum { @@ -3373,4 +3374,10 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) F2FS_I_SB(inode)->s_ndevs); } +#ifdef CONFIG_F2FS_FAULT_INJECTION +extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate); +#else +#define f2fs_build_fault_attr(sbi, rate) do { } while (0) +#endif + #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6880c6f78d58..8af6683e022b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1889,6 +1889,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) } if (sb) { f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); thaw_bdev(sb->s_bdev, sb); } break; @@ -1898,13 +1899,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) goto out; f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case 
F2FS_GOING_DOWN_METAFLUSH: f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; default: ret = -EINVAL; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index f121c864f4c0..f91dd017a65c 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -159,6 +159,9 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_inode *ri; __u32 provided, calculated; + if (unlikely(is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN))) + return true; + if (!f2fs_enable_inode_chksum(sbi, page) || PageDirty(page) || PageWriteback(page)) return true; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b0267d3823b4..1061dd18b09c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1146,7 +1146,8 @@ static int read_node_page(struct page *page, int op_flags) f2fs_get_node_info(sbi, page->index, &ni); - if (unlikely(ni.blk_addr == NULL_ADDR)) { + if (unlikely(ni.blk_addr == NULL_ADDR) || + is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { ClearPageUptodate(page); return -ENOENT; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1dc6809fac38..1cb5d1e4fcfd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -57,8 +57,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_CHECKPOINT] = "checkpoint error", }; -static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, - unsigned int rate) +void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; @@ -1379,9 +1378,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, POSIX_ACL); #endif -#ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, 0); -#endif } #ifdef CONFIG_QUOTA -- cgit v1.2.3-59-g8ed1b From a1933c09ef84c2fd187e05b560ddc6e1267d6508 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 15 Jul 2018 09:58:08 +0900 Subject: f2fs: avoid potential deadlock in f2fs_sbi_store [ 155.018460] ====================================================== [ 155.021431] 
WARNING: possible circular locking dependency detected [ 155.024339] 4.18.0-rc3+ #5 Tainted: G OE [ 155.026879] ------------------------------------------------------ [ 155.029783] umount/2901 is trying to acquire lock: [ 155.032187] 00000000c4282f1f (kn->count#130){++++}, at: kernfs_remove+0x1f/0x30 [ 155.035439] [ 155.035439] but task is already holding lock: [ 155.038892] 0000000056e4307b (&type->s_umount_key#41){++++}, at: deactivate_super+0x33/0x50 [ 155.042602] [ 155.042602] which lock already depends on the new lock. [ 155.042602] [ 155.047465] [ 155.047465] the existing dependency chain (in reverse order) is: [ 155.051354] [ 155.051354] -> #1 (&type->s_umount_key#41){++++}: [ 155.054768] f2fs_sbi_store+0x61/0x460 [f2fs] [ 155.057083] kernfs_fop_write+0x113/0x1a0 [ 155.059277] __vfs_write+0x36/0x180 [ 155.061250] vfs_write+0xbe/0x1b0 [ 155.063179] ksys_write+0x55/0xc0 [ 155.065068] do_syscall_64+0x60/0x1b0 [ 155.067071] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 155.069529] [ 155.069529] -> #0 (kn->count#130){++++}: [ 155.072421] __kernfs_remove+0x26f/0x2e0 [ 155.074452] kernfs_remove+0x1f/0x30 [ 155.076342] kobject_del.part.5+0xe/0x40 [ 155.078354] f2fs_put_super+0x12d/0x290 [f2fs] [ 155.080500] generic_shutdown_super+0x6c/0x110 [ 155.082655] kill_block_super+0x21/0x50 [ 155.084634] kill_f2fs_super+0x9c/0xc0 [f2fs] [ 155.086726] deactivate_locked_super+0x3f/0x70 [ 155.088826] cleanup_mnt+0x3b/0x70 [ 155.090584] task_work_run+0x93/0xc0 [ 155.092367] exit_to_usermode_loop+0xf0/0x100 [ 155.094466] do_syscall_64+0x162/0x1b0 [ 155.096312] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 155.098603] [ 155.098603] other info that might help us debug this: [ 155.098603] [ 155.102418] Possible unsafe locking scenario: [ 155.102418] [ 155.105134] CPU0 CPU1 [ 155.107037] ---- ---- [ 155.108910] lock(&type->s_umount_key#41); [ 155.110674] lock(kn->count#130); [ 155.113010] lock(&type->s_umount_key#41); [ 155.115608] lock(kn->count#130); Reviewed-by: Chao Yu Signed-off-by: 
Jaegeuk Kim --- fs/f2fs/sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 2e7e611deaef..d3d0266a49da 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -286,8 +286,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") || a->struct_type == GC_THREAD); - if (gc_entry) - down_read(&sbi->sb->s_umount); + if (gc_entry) { + if (!down_read_trylock(&sbi->sb->s_umount)) + return -EAGAIN; + } ret = __sbi_store(a, sbi, buf, count); if (gc_entry) up_read(&sbi->sb->s_umount); -- cgit v1.2.3-59-g8ed1b From 76a45e3c45002836b2ea0711a130403dcd7d8f52 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Jun 2018 23:55:01 +0800 Subject: f2fs: don't acquire orphan ino during recovery During orphan inode recovery, checkpoint should never succeed due to the SBI_POR_DOING flag, so we don't need to acquire an orphan ino, which is only used by checkpoint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1a3ec978f1a6..502d7afd71a2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -570,12 +570,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; struct node_info ni; - int err = f2fs_acquire_orphan_inode(sbi); - - if (err) - goto err_out; - - __add_ino_entry(sbi, ino, 0, ORPHAN_INO); + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { @@ -605,7 +600,6 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) err = -EIO; goto err_out; } - __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; err_out: -- cgit v1.2.3-59-g8ed1b From 0aa7e0f8c0a0246ea7f407e32b7277ec9c084c15 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Jun 2018 23:55:02 +0800 Subject: f2fs: move s_res{u,g}id initialization to
default_options() Let default_options() initialize s_res{u,g}id with default values like other options. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1cb5d1e4fcfd..a87a398af77f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1354,6 +1354,8 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).test_dummy_encryption = false; + F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); sbi->readdir_ra = 1; set_opt(sbi, BG_GC); @@ -2697,9 +2699,6 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; - F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); - F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); - /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sb)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, -- cgit v1.2.3-59-g8ed1b From 4cac90d5491c9a157909b9d45128997d35624e01 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 11 Jun 2018 18:02:01 +0800 Subject: f2fs: relocate readdir_ra configure initialization readdir_ra is a sysfs configuration instead of a mount option, so it should not be initialized in default_options(); otherwise, after remount, it can be reset to enabled, which may not be what the user wishes, so let's move it to f2fs_tuning_parameters().
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a87a398af77f..89a347243e9a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1356,7 +1356,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); - sbi->readdir_ra = 1; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -2650,6 +2649,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; } + + sbi->readdir_ra = 1; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) -- cgit v1.2.3-59-g8ed1b From 4e423832a675af671eab9d8a3d96636aa63faa9e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 11 Jun 2018 18:02:02 +0800 Subject: f2fs: fix error path of fill_super In fill_super, if root inode's attribute is incorrect, we need to call f2fs_destroy_stats to release stats memory. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 89a347243e9a..01d1cb6081fc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2911,7 +2911,7 @@ try_onemore: if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); err = -EINVAL; - goto free_node_inode; + goto free_stats; } sb->s_root = d_make_root(root); /* allocate root dentry */ -- cgit v1.2.3-59-g8ed1b From 54c55c4e4fc7ec35f96a3b6a626314b0b7256137 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Fri, 9 Mar 2018 23:10:21 +0800 Subject: f2fs: support in-memory inode checksum when checking consistency Enable in-memory inode checksum to protect metadata blocks from in-memory scribbles when checking consistency, which has no performance requirements. Signed-off-by: Weichao Guo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 8 ++++++++ fs/f2fs/node.c | 10 +++++++++- fs/f2fs/node.h | 4 ++++ 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index f91dd017a65c..ec672c7ac52c 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -162,8 +162,12 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN))) return true; +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_enable_inode_chksum(sbi, page)) +#else if (!f2fs_enable_inode_chksum(sbi, page) || PageDirty(page) || PageWriteback(page)) +#endif return true; ri = &F2FS_NODE(page)->i; @@ -477,6 +481,10 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) F2FS_I(inode)->i_disk_time[1] = timespec64_to_timespec(inode->i_ctime); F2FS_I(inode)->i_disk_time[2] = timespec64_to_timespec(inode->i_mtime); F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); 
+#endif } void f2fs_update_inode_page(struct inode *inode) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1061dd18b09c..3d1240949d87 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1141,8 +1141,12 @@ static int read_node_page(struct page *page, int op_flags) .encrypted_page = NULL, }; - if (PageUptodate(page)) + if (PageUptodate(page)) { +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_bug_on(sbi, !f2fs_inode_chksum_verify(sbi, page)); +#endif return LOCKED_PAGE; + } f2fs_get_node_info(sbi, page->index, &ni); @@ -1775,6 +1779,10 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); +#ifdef CONFIG_F2FS_CHECK_FS + if (IS_INODE(page)) + f2fs_inode_chksum_set(F2FS_P_SB(page), page); +#endif if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index b95e49e4a928..8f34bdffde93 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -444,6 +444,10 @@ static inline void set_mark(struct page *page, int mark, int type) else flag &= ~(0x1 << type); rn->footer.flag = cpu_to_le32(flag); + +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_P_SB(page), page); +#endif } #define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) #define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) -- cgit v1.2.3-59-g8ed1b From e2374015f27fe5ee5d5c37966e2faf396cdaaa65 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 15 Jun 2018 14:45:57 +0800 Subject: f2fs: fix to propagate return value of scan_nat_page() As Anatoly Trosinenko reported in bugzilla: How to reproduce: 1. Compile the 73fcb1a370c76 version of the kernel using the config attached 2. Unpack and mount the attached filesystem image as F2FS 3. 
The kernel will BUG() on mount (BUGs are explicitly enabled in config) [ 2.233612] F2FS-fs (sda): Found nat_bits in checkpoint [ 2.248422] ------------[ cut here ]------------ [ 2.248857] kernel BUG at fs/f2fs/node.c:1967! [ 2.249760] invalid opcode: 0000 [#1] SMP NOPTI [ 2.250219] Modules linked in: [ 2.251848] CPU: 0 PID: 944 Comm: mount Not tainted 4.17.0-rc5+ #1 [ 2.252331] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 2.253305] RIP: 0010:build_free_nids+0x337/0x3f0 [ 2.253672] RSP: 0018:ffffae7fc0857c50 EFLAGS: 00000246 [ 2.254080] RAX: 00000000ffffffff RBX: 0000000000000123 RCX: 0000000000000001 [ 2.254638] RDX: ffff9aa7063d5c00 RSI: 0000000000000122 RDI: ffff9aa705852e00 [ 2.255190] RBP: ffff9aa705852e00 R08: 0000000000000001 R09: ffff9aa7059090c0 [ 2.255719] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9aa705852e00 [ 2.256242] R13: ffff9aa7063ad000 R14: ffff9aa705919000 R15: 0000000000000123 [ 2.256809] FS: 00000000023078c0(0000) GS:ffff9aa707800000(0000) knlGS:0000000000000000 [ 2.258654] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2.259153] CR2: 00000000005511ae CR3: 0000000005872000 CR4: 00000000000006f0 [ 2.259801] Call Trace: [ 2.260583] build_node_manager+0x5cd/0x600 [ 2.260963] f2fs_fill_super+0x66a/0x17c0 [ 2.261300] ? f2fs_commit_super+0xe0/0xe0 [ 2.261622] mount_bdev+0x16e/0x1a0 [ 2.261899] mount_fs+0x30/0x150 [ 2.262398] vfs_kern_mount.part.28+0x4f/0xf0 [ 2.262743] do_mount+0x5d0/0xc60 [ 2.263010] ? _copy_from_user+0x37/0x60 [ 2.263313] ? 
memdup_user+0x39/0x60 [ 2.263692] ksys_mount+0x7b/0xd0 [ 2.263960] __x64_sys_mount+0x1c/0x20 [ 2.264268] do_syscall_64+0x43/0xf0 [ 2.264560] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 2.265095] RIP: 0033:0x48d31a [ 2.265502] RSP: 002b:00007ffc6fe60a08 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 [ 2.266089] RAX: ffffffffffffffda RBX: 0000000000008000 RCX: 000000000048d31a [ 2.266607] RDX: 00007ffc6fe62fa5 RSI: 00007ffc6fe62f9d RDI: 00007ffc6fe62f94 [ 2.267130] RBP: 00000000023078a0 R08: 0000000000000000 R09: 0000000000000000 [ 2.267670] R10: 0000000000008000 R11: 0000000000000246 R12: 0000000000000000 [ 2.268192] R13: 0000000000000000 R14: 00007ffc6fe60c78 R15: 0000000000000000 [ 2.268767] Code: e8 5f c3 ff ff 83 c3 01 41 83 c7 01 81 fb c7 01 00 00 74 48 44 39 7d 04 76 42 48 63 c3 48 8d 04 c0 41 8b 44 06 05 83 f8 ff 75 c1 <0f> 0b 49 8b 45 50 48 8d b8 b0 00 00 00 e8 37 59 69 00 b9 01 00 [ 2.270434] RIP: build_free_nids+0x337/0x3f0 RSP: ffffae7fc0857c50 [ 2.271426] ---[ end trace ab20c06cd3c8fde4 ]--- While loading NAT entries, we do a sanity check; once the entry info is found to be corrupted, it triggers BUG_ON directly to protect user data from being overwritten. In this case, it is better to just return failure on mount() instead of panicking, so that the user can get a hint from kmsg and try fsck for recovery immediately rather than after an abnormal reboot.
https://bugzilla.kernel.org/show_bug.cgi?id=199769 Reported-by: Anatoly Trosinenko Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 42 ++++++++++++++++++++++++++++++------------ 2 files changed, 31 insertions(+), 13 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fe80eb637075..0acf8889a58f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2813,7 +2813,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type); -void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3d1240949d87..1d590c64bc85 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1977,7 +1977,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -static void scan_nat_page(struct f2fs_sb_info *sbi, +static int scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1995,7 +1995,10 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); - f2fs_bug_on(sbi, blk_addr == NEW_ADDR); + + if (blk_addr == NEW_ADDR) + return -EINVAL; + if (blk_addr == NULL_ADDR) { add_free_nid(sbi, start_nid, true, true); } else { @@ -2004,6 +2007,8 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, spin_unlock(&NM_I(sbi)->nid_list_lock); } } + + return 0; } static void scan_curseg_cache(struct f2fs_sb_info *sbi) @@ -2059,11 +2064,11 @@ out: up_read(&nm_i->nat_tree_lock); } -static void 
__f2fs_build_free_nids(struct f2fs_sb_info *sbi, +static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); - int i = 0; + int i = 0, ret; nid_t nid = nm_i->next_scan_nid; if (unlikely(nid >= nm_i->max_nid)) @@ -2071,17 +2076,17 @@ static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, /* Enough entries */ if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) - return; + return 0; if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS)) - return; + return 0; if (!mount) { /* try to find free nids in free_nid_bitmap */ scan_free_nid_bits(sbi); if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) - return; + return 0; } /* readahead nat pages to be scanned */ @@ -2095,8 +2100,16 @@ static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, nm_i->nat_block_bitmap)) { struct page *page = get_current_nat_page(sbi, nid); - scan_nat_page(sbi, page, nid); + ret = scan_nat_page(sbi, page, nid); f2fs_put_page(page, 1); + + if (ret) { + up_read(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, !mount); + f2fs_msg(sbi->sb, KERN_ERR, + "NAT is corrupt, run fsck to fix it"); + return -EINVAL; + } } nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); @@ -2117,13 +2130,19 @@ static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); + + return 0; } -void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { + int ret; + mutex_lock(&NM_I(sbi)->build_lock); - __f2fs_build_free_nids(sbi, sync, mount); + ret = __f2fs_build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); + + return ret; } /* @@ -2817,8 +2836,7 @@ int f2fs_build_node_manager(struct f2fs_sb_info *sbi) /* load free nid status from nat_bits table */ load_free_nid_bitmap(sbi); - f2fs_build_free_nids(sbi, true, true); - return 0; + return 
f2fs_build_free_nids(sbi, true, true); } void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) -- cgit v1.2.3-59-g8ed1b From 5a6154920faf9d3d5a39de1da49c66af9831d270 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 20 Jun 2018 21:27:21 -0700 Subject: f2fs: don't issue discard commands in online discard is on Actually, we don't need to issue discard commands, if discard is on, as mentioned in the comment. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9efce174c51a..6dc8828b4d87 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2469,23 +2469,24 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (err) goto out; - start_block = START_BLOCK(sbi, start_segno); - end_block = START_BLOCK(sbi, end_segno + 1); - - __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); - __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - /* * We filed discard candidates, but actually we don't need to wait for * all of them, since they'll be issued in idle time along with runtime * discard option. User configuration looks like using runtime discard * or periodic fstrim instead of it. 
*/ - if (!test_opt(sbi, DISCARD)) { - trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + if (test_opt(sbi, DISCARD)) + goto out; + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, end_segno + 1); + + __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + + trimmed = __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - range->len = F2FS_BLK_TO_BYTES(trimmed); - } + range->len = F2FS_BLK_TO_BYTES(trimmed); out: return err; } -- cgit v1.2.3-59-g8ed1b From 2a96d8ad94ce57cb0072f7a660b1039720c47716 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Jun 2018 13:39:53 +0300 Subject: f2fs: Fix uninitialized return in f2fs_ioc_shutdown() "ret" can be uninitialized on the success path when "in == F2FS_GOING_DOWN_FULLSYNC". Fixes: 60b2b4ee2bc0 ("f2fs: Fix deadlock in shutdown ioctl") Signed-off-by: Dan Carpenter Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8af6683e022b..eed8aef51dad 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1866,7 +1866,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; __u32 in; - int ret; + int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3-59-g8ed1b From 2a63531a612f776ea754fee6272fc8f16a06b9d6 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 21 Jun 2018 14:49:06 +0800 Subject: f2fs: fix a hungtask problem caused by congestion_wait This patch fixes a hungtask problem which can be reproduced as follows: Thread 0~3: while true do touch /xxx/test/file_xxx done Thread 4 writes a new checkpoint every three seconds. In the meantime, fio starts 16 threads for randwrite.
With my debug info, the number of loop cycles in f2fs_sync_dirty_inodes exceeds 1000, and most of those cycles drop into congestion_wait() and sleep for more than 20ms. With this patch, the number of cycles is reduced to 3. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 502d7afd71a2..e5cf2ff5b39d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -972,12 +972,10 @@ retry: iput(inode); /* We need to give cpu to another writers. */ - if (ino == cur_ino) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + if (ino == cur_ino) cond_resched(); - } else { + else ino = cur_ino; - } } else { /* * We should submit bio, since it exists several -- cgit v1.2.3-59-g8ed1b From e2e59414aae2c8036dfaa57cf6a578e1694945e8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 21 Jun 2018 11:29:43 -0700 Subject: f2fs: assign REQ_RAHEAD to bio for ->readpages As Jens reported, we'd better assign REQ_RAHEAD to bio by the fact that ->readpages is called only from read-ahead. In Documentation/filesystems/vfs.txt, readpages: called by the VM to read pages associated with the address_space object. This is essentially just a vector version of readpage. Instead of just one page, several pages are requested. readpages is only used for read-ahead, so read errors are ignored. If anything goes wrong, feel free to give up.
Signed-off-by: Jens Axboe Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5e53d210e222..b6a90c853221 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -534,7 +534,7 @@ out: } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) + unsigned nr_pages, unsigned op_flag) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -546,7 +546,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, return ERR_PTR(-ENOMEM); f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (f2fs_encrypted_file(inode)) post_read_steps |= 1 << STEP_DECRYPT; @@ -571,7 +571,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr) { - struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -1421,10 +1421,15 @@ out: /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. + * + * Note that the aops->readpages() function is ONLY used for read-ahead. If + * this function ever deviates from doing just read-ahead, it should either + * use ->readpage() or do the necessary surgery to decouple ->readpages() + * from read-ahead. 
*/ static int f2fs_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, - unsigned nr_pages) + unsigned nr_pages, bool is_readahead) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -1514,7 +1519,8 @@ submit_and_realloc: bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, + is_readahead ? REQ_RAHEAD : 0); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; @@ -1558,7 +1564,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1); + ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1, false); return ret; } @@ -1575,7 +1581,7 @@ static int f2fs_read_data_pages(struct file *file, if (f2fs_has_inline_data(inode)) return 0; - return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); + return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); } static int encrypt_one_page(struct f2fs_io_info *fio) -- cgit v1.2.3-59-g8ed1b From 6aead1617b3adf2b7e2c56f0f13e4e0ee42ebb4a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 21 Jun 2018 22:38:28 +0800 Subject: f2fs: fix to wait on page writeback before updating page In error path of f2fs_move_rehashed_dirents, inode page could be writeback state, so we should wait on inode page writeback before updating it. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 043830be5662..9a245d2d5b7c 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -477,6 +477,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); + f2fs_wait_on_page_writeback(ipage, NODE, true); memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); -- cgit v1.2.3-59-g8ed1b From 24b81dfcb73f2dc21c61502512d1422f15a579dc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Jun 2018 10:02:19 +0200 Subject: f2fs: use timespec64 for inode timestamps The on-disk representation and the vfs both use 64-bit tv_sec values, so let's change the last missing piece in the middle. Signed-off-by: Arnd Bergmann Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 16 ++++++---------- fs/f2fs/inode.c | 12 ++++++------ fs/f2fs/namei.c | 2 +- 3 files changed, 13 insertions(+), 17 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0acf8889a58f..15caa790f54e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -669,8 +669,8 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ - struct timespec i_crtime; /* inode creation time */ - struct timespec i_disk_time[4]; /* inode disk times */ + struct timespec64 i_crtime; /* inode creation time */ + struct timespec64 i_disk_time[4];/* inode disk times */ }; static inline void get_extent_info(struct extent_info *ext, @@ -2519,7 +2519,6 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { - struct timespec ts; bool ret; if (dsync) { @@ -2535,16 
+2534,13 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) i_size_read(inode) & ~PAGE_MASK) return false; - ts = timespec64_to_timespec(inode->i_atime); - if (!timespec_equal(F2FS_I(inode)->i_disk_time, &ts)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) return false; - ts = timespec64_to_timespec(inode->i_ctime); - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &ts)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) return false; - ts = timespec64_to_timespec(inode->i_mtime); - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &ts)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) return false; - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3, + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, &F2FS_I(inode)->i_crtime)) return false; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ec672c7ac52c..2076225787d1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -304,9 +304,9 @@ static int do_read_inode(struct inode *inode) fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); } - F2FS_I(inode)->i_disk_time[0] = timespec64_to_timespec(inode->i_atime); - F2FS_I(inode)->i_disk_time[1] = timespec64_to_timespec(inode->i_ctime); - F2FS_I(inode)->i_disk_time[2] = timespec64_to_timespec(inode->i_mtime); + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; f2fs_put_page(node_page, 1); @@ -477,9 +477,9 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_inline_node(node_page); - F2FS_I(inode)->i_disk_time[0] = timespec64_to_timespec(inode->i_atime); - F2FS_I(inode)->i_disk_time[1] = timespec64_to_timespec(inode->i_ctime); - F2FS_I(inode)->i_disk_time[2] = timespec64_to_timespec(inode->i_mtime); + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + 
F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; #ifdef CONFIG_F2FS_CHECK_FS diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 231b7f3ea7d3..2ea0de4cbe76 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -51,7 +51,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - F2FS_I(inode)->i_crtime = timespec64_to_timespec(inode->i_mtime); + F2FS_I(inode)->i_crtime = inode->i_mtime; inode->i_generation = sbi->s_next_generation++; if (S_ISDIR(inode->i_mode)) -- cgit v1.2.3-59-g8ed1b From e1da7872f6eda977bd812346bf588c35e4495a1e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 5 Jun 2018 17:44:11 +0800 Subject: f2fs: introduce and spread verify_blkaddr This patch introduces verify_blkaddr to check meta/data block address with valid range to detect bug earlier. In addition, once we encounter an invalid blkaddr, notice user to run fsck to fix, and let the kernel panic. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 ++++++++-- fs/f2fs/data.c | 8 ++++---- fs/f2fs/f2fs.h | 33 +++++++++++++++++++++++++++++---- fs/f2fs/file.c | 9 +++++---- fs/f2fs/inode.c | 7 ++++--- fs/f2fs/node.c | 4 ++-- fs/f2fs/recovery.c | 6 +++--- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 8 +++----- 9 files changed, 60 insertions(+), 29 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e5cf2ff5b39d..94552286ac12 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -120,7 +120,7 @@ struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) return __get_meta_page(sbi, index, false); } -bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { switch (type) { @@ -141,10 +141,16 @@ bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, return false; break; case META_POR: + case DATA_GENERIC: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) return false; break; + case META_GENERIC: + if (unlikely(blkaddr < SEG0_BLKADDR(sbi) || + blkaddr >= MAIN_BLKADDR(sbi))) + return false; + break; default: BUG(); } @@ -177,7 +183,7 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { - if (!f2fs_is_valid_meta_blkaddr(sbi, blkno, type)) + if (!f2fs_is_valid_blkaddr(sbi, blkno, type)) goto out; switch (type) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b6a90c853221..635a98db5d65 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -485,7 +485,7 @@ next: spin_unlock(&io->io_lock); } - if (is_valid_blkaddr(fio->old_blkaddr)) + if (__is_valid_data_blkaddr(fio->old_blkaddr)) verify_block_addr(fio, fio->old_blkaddr); verify_block_addr(fio, fio->new_blkaddr); @@ -1045,7 +1045,7 @@ next_dnode: next_block: blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if 
(!is_valid_blkaddr(blkaddr)) { + if (!is_valid_data_blkaddr(sbi, blkaddr)) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1700,7 +1700,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; - if (is_valid_blkaddr(fio->old_blkaddr)) { + if (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr)) { ipu_force = true; fio->need_lock = LOCK_DONE; goto got_it; @@ -1727,7 +1727,7 @@ got_it: * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (ipu_force || (is_valid_blkaddr(fio->old_blkaddr) && + if (ipu_force || (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr) && need_inplace_update(fio))) { err = encrypt_one_page(fio); if (err) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 15caa790f54e..5e236e26641e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -194,7 +194,7 @@ struct cp_control { }; /* - * For CP/NAT/SIT/SSA readahead + * indicate meta/data type */ enum { META_CP, @@ -202,6 +202,8 @@ enum { META_SIT, META_SSA, META_POR, + DATA_GENERIC, + META_GENERIC, }; /* for the list of ino */ @@ -2666,13 +2668,36 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, spin_unlock(&sbi->iostat_lock); } -static inline bool is_valid_blkaddr(block_t blkaddr) +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); +static inline void verify_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) { + f2fs_msg(sbi->sb, KERN_ERR, + "invalid blkaddr: %u, type: %d, run fsck to fix.", + blkaddr, type); + f2fs_bug_on(sbi, 1); + } +} + +static inline bool __is_valid_data_blkaddr(block_t blkaddr) { if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return false; return true; } +static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, + 
block_t blkaddr) +{ + if (!__is_valid_data_blkaddr(blkaddr)) + return false; + verify_blkaddr(sbi, blkaddr, DATA_GENERIC); + return true; +} + /* * file.c */ @@ -2896,8 +2921,8 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); -bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, - block_t blkaddr, int type); +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index eed8aef51dad..0d1ee20912b9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -350,13 +350,13 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, return pgofs; } -static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, - int whence) +static bool __found_offset(struct f2fs_sb_info *sbi, block_t blkaddr, + pgoff_t dirty, pgoff_t pgofs, int whence) { switch (whence) { case SEEK_DATA: if ((blkaddr == NEW_ADDR && dirty == pgofs) || - is_valid_blkaddr(blkaddr)) + is_valid_data_blkaddr(sbi, blkaddr)) return true; break; case SEEK_HOLE: @@ -420,7 +420,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if (__found_offset(blkaddr, dirty, pgofs, whence)) { + if (__found_offset(F2FS_I_SB(inode), blkaddr, dirty, + pgofs, whence)) { f2fs_put_dnode(&dn); goto found; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2076225787d1..e9cfcdbbe24c 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -68,11 +68,12 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) } } -static 
bool __written_first_block(struct f2fs_inode *ri) +static bool __written_first_block(struct f2fs_sb_info *sbi, + struct f2fs_inode *ri) { block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); - if (is_valid_blkaddr(addr)) + if (is_valid_data_blkaddr(sbi, addr)) return true; return false; } @@ -282,7 +283,7 @@ static int do_read_inode(struct inode *inode) /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); - if (__written_first_block(ri)) + if (__written_first_block(sbi, ri)) set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1d590c64bc85..8e58990b9120 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -371,7 +371,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, new_blkaddr == NULL_ADDR); f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - f2fs_bug_on(sbi, is_valid_blkaddr(nat_get_blkaddr(e)) && + f2fs_bug_on(sbi, is_valid_data_blkaddr(sbi, nat_get_blkaddr(e)) && new_blkaddr == NEW_ADDR); /* increment version no as node is removed */ @@ -382,7 +382,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, /* change address */ nat_set_blkaddr(e, new_blkaddr); - if (!is_valid_blkaddr(new_blkaddr)) + if (!is_valid_data_blkaddr(sbi, new_blkaddr)) set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 38f25f0b193a..3051a5e5dfc7 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -252,7 +252,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, while (1) { struct fsync_inode_entry *entry; - if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) return 0; page = f2fs_get_tmp_page(sbi, blkaddr); @@ -507,7 +507,7 @@ retry_dn: } /* dest is valid block, try to recover from src to dest */ - if 
(f2fs_is_valid_meta_blkaddr(sbi, dest, META_POR)) { + if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { err = f2fs_reserve_new_block(&dn); @@ -568,7 +568,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, while (1) { struct fsync_inode_entry *entry; - if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) break; f2fs_ra_meta_pages_cond(sbi, blkaddr); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6dc8828b4d87..654091ec9cfe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1919,7 +1919,7 @@ bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) struct seg_entry *se; bool is_cp = false; - if (!is_valid_blkaddr(blkaddr)) + if (!is_valid_data_blkaddr(sbi, blkaddr)) return true; down_read(&sit_i->sentry_lock); @@ -2993,7 +2993,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; - if (!is_valid_blkaddr(blkaddr)) + if (!is_valid_data_blkaddr(sbi, blkaddr)) return; cpage = find_lock_page(META_MAPPING(sbi), blkaddr); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index f18fc82fbe99..a7460da9af43 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -85,7 +85,7 @@ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - ((!is_valid_blkaddr(blk_addr)) ? \ + ((!is_valid_data_blkaddr(sbi, blk_addr)) ? 
\ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ @@ -647,11 +647,9 @@ static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) if (PAGE_TYPE_OF_BIO(fio->type) == META && (!is_read_io(fio->op) || fio->is_meta)) - BUG_ON(blk_addr < SEG0_BLKADDR(sbi) || - blk_addr >= MAIN_BLKADDR(sbi)); + verify_blkaddr(sbi, blk_addr, META_GENERIC); else - BUG_ON(blk_addr < MAIN_BLKADDR(sbi) || - blk_addr >= MAX_BLKADDR(sbi)); + verify_blkaddr(sbi, blk_addr, DATA_GENERIC); } /* -- cgit v1.2.3-59-g8ed1b From 67fce70ba341f772073cac9c3044aa98c69b24fb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Jun 2018 16:06:59 +0800 Subject: f2fs: disable f2fs_check_rb_tree_consistence If there is millions of discard entries cached in rb tree, each sanity check of it can cause very long latency as held cmd_lock blocking other lock grabbers. In other aspect, we have enabled the check very long time, as we see, there is no such inconsistent condition caused by bugs. But still we do not choose to kill it directly, instead, adding an flag to disable the check now, if there is related code change, we can reuse it to detect bugs. 
Signed-off-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5e236e26641e..efcb45d0e7d3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -311,6 +311,7 @@ struct discard_cmd_control { atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root root; /* root of discard rb-tree */ + bool rbtree_check; /* config for consistence check */ }; /* for the list of fsync inodes, used only during recovery */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 654091ec9cfe..d63d89287c53 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1199,8 +1199,9 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&dcc->cmd_lock); if (list_empty(pend_list)) goto next; - f2fs_bug_on(sbi, - !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &dcc->root)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1752,6 +1753,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; dcc->undiscard_blks = 0; dcc->root = RB_ROOT; + dcc->rbtree_check = false; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; @@ -2381,7 +2383,9 @@ next: issued = 0; mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &dcc->root)); dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, start, -- cgit v1.2.3-59-g8ed1b From 42bf546c1fe3f3654bdf914e977acbc2b80a5be5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 23 Jun 2018 00:12:36 
+0800 Subject: f2fs: fix to do sanity check with secs_per_zone As Wen Xu reported in below link: https://bugzilla.kernel.org/show_bug.cgi?id=200183 - Overview Divide zero in reset_curseg() when mounting a crafted f2fs image - Reproduce - Kernel message [ 588.281510] divide error: 0000 [#1] SMP KASAN PTI [ 588.282701] CPU: 0 PID: 1293 Comm: mount Not tainted 4.18.0-rc1+ #4 [ 588.284000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 588.286178] RIP: 0010:reset_curseg+0x94/0x1a0 [ 588.298166] RSP: 0018:ffff8801e88d7940 EFLAGS: 00010246 [ 588.299360] RAX: 0000000000000014 RBX: ffff8801e1d46d00 RCX: ffffffffb88bf60b [ 588.300809] RDX: 0000000000000000 RSI: dffffc0000000000 RDI: ffff8801e1d46d64 [ 588.305272] R13: 0000000000000000 R14: 0000000000000014 R15: 0000000000000000 [ 588.306822] FS: 00007fad85008840(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 588.308456] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 588.309623] CR2: 0000000001705078 CR3: 00000001f30f8000 CR4: 00000000000006f0 [ 588.311085] Call Trace: [ 588.311637] f2fs_build_segment_manager+0x103f/0x3410 [ 588.316136] ? f2fs_commit_super+0x1b0/0x1b0 [ 588.317031] ? set_blocksize+0x90/0x140 [ 588.319473] f2fs_mount+0x15/0x20 [ 588.320166] mount_fs+0x60/0x1a0 [ 588.320847] ? alloc_vfsmnt+0x309/0x360 [ 588.321647] vfs_kern_mount+0x6b/0x1a0 [ 588.322432] do_mount+0x34a/0x18c0 [ 588.323175] ? strndup_user+0x46/0x70 [ 588.323937] ? copy_mount_string+0x20/0x20 [ 588.324793] ? memcg_kmem_put_cache+0x1b/0xa0 [ 588.325702] ? kasan_check_write+0x14/0x20 [ 588.326562] ? _copy_from_user+0x6a/0x90 [ 588.327375] ? 
memdup_user+0x42/0x60 [ 588.328118] ksys_mount+0x83/0xd0 [ 588.328808] __x64_sys_mount+0x67/0x80 [ 588.329607] do_syscall_64+0x78/0x170 [ 588.330400] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 588.331461] RIP: 0033:0x7fad848e8b9a [ 588.336022] RSP: 002b:00007ffd7c5b6be8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 588.337547] RAX: ffffffffffffffda RBX: 00000000016f8030 RCX: 00007fad848e8b9a [ 588.338999] RDX: 00000000016f8210 RSI: 00000000016f9f30 RDI: 0000000001700ec0 [ 588.340442] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 588.341887] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 0000000001700ec0 [ 588.343341] R13: 00000000016f8210 R14: 0000000000000000 R15: 0000000000000003 [ 588.354891] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 588.355862] RIP: 0010:reset_curseg+0x94/0x1a0 [ 588.360742] RSP: 0018:ffff8801e88d7940 EFLAGS: 00010246 [ 588.361812] RAX: 0000000000000014 RBX: ffff8801e1d46d00 RCX: ffffffffb88bf60b [ 588.363485] RDX: 0000000000000000 RSI: dffffc0000000000 RDI: ffff8801e1d46d64 [ 588.365213] RBP: ffff8801e88d7968 R08: ffffed003c32266f R09: ffffed003c32266f [ 588.366661] R10: 0000000000000001 R11: ffffed003c32266e R12: ffff8801f0337700 [ 588.368110] R13: 0000000000000000 R14: 0000000000000014 R15: 0000000000000000 [ 588.370057] FS: 00007fad85008840(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 588.372099] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 588.373291] CR2: 0000000001705078 CR3: 00000001f30f8000 CR4: 00000000000006f0 - Location https://elixir.bootlin.com/linux/latest/source/fs/f2fs/segment.c#L2147 curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); If secs_per_zone is corrupted due to fuzzing test, it will cause divide zero operation when using GET_ZONE_FROM_SEG macro, so we should do more sanity check with secs_per_zone during mount to avoid this issue. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 01d1cb6081fc..a041ee20492d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2227,9 +2227,9 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } - if (secs_per_zone > total_sections) { + if (secs_per_zone > total_sections || !secs_per_zone) { f2fs_msg(sb, KERN_INFO, - "Wrong secs_per_zone (%u > %u)", + "Wrong secs_per_zone / total_sections (%u, %u)", secs_per_zone, total_sections); return 1; } -- cgit v1.2.3-59-g8ed1b From c77ec61ca0a49544ca81881cc5d5529858f7e196 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 23 Jun 2018 11:25:19 +0800 Subject: f2fs: fix to do sanity check with {sit,nat}_ver_bitmap_bytesize This patch adds to do sanity check with {sit,nat}_ver_bitmap_bytesize during mount, in order to avoid accessing across cache boundary with this abnormal bitmap size. - Overview buffer overrun in build_sit_info() when mounting a crafted f2fs image - Reproduce - Kernel message [ 548.580867] F2FS-fs (loop0): Invalid log blocks per segment (8201) [ 548.580877] F2FS-fs (loop0): Can't find valid F2FS filesystem in 1th superblock [ 548.584979] ================================================================== [ 548.586568] BUG: KASAN: use-after-free in kmemdup+0x36/0x50 [ 548.587715] Read of size 64 at addr ffff8801e9c265ff by task mount/1295 [ 548.589428] CPU: 1 PID: 1295 Comm: mount Not tainted 4.18.0-rc1+ #4 [ 548.589432] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 548.589438] Call Trace: [ 548.589474] dump_stack+0x7b/0xb5 [ 548.589487] print_address_description+0x70/0x290 [ 548.589492] kasan_report+0x291/0x390 [ 548.589496] ? 
kmemdup+0x36/0x50 [ 548.589509] check_memory_region+0x139/0x190 [ 548.589514] memcpy+0x23/0x50 [ 548.589518] kmemdup+0x36/0x50 [ 548.589545] f2fs_build_segment_manager+0x8fa/0x3410 [ 548.589551] ? __asan_loadN+0xf/0x20 [ 548.589560] ? f2fs_sanity_check_ckpt+0x1be/0x240 [ 548.589566] ? f2fs_flush_sit_entries+0x10c0/0x10c0 [ 548.589587] ? __put_user_ns+0x40/0x40 [ 548.589604] ? find_next_bit+0x57/0x90 [ 548.589610] f2fs_fill_super+0x194b/0x2b40 [ 548.589617] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.589637] ? set_blocksize+0x90/0x140 [ 548.589651] mount_bdev+0x1c5/0x210 [ 548.589655] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.589667] f2fs_mount+0x15/0x20 [ 548.589672] mount_fs+0x60/0x1a0 [ 548.589683] ? alloc_vfsmnt+0x309/0x360 [ 548.589688] vfs_kern_mount+0x6b/0x1a0 [ 548.589699] do_mount+0x34a/0x18c0 [ 548.589710] ? lockref_put_or_lock+0xcf/0x160 [ 548.589716] ? copy_mount_string+0x20/0x20 [ 548.589728] ? memcg_kmem_put_cache+0x1b/0xa0 [ 548.589734] ? kasan_check_write+0x14/0x20 [ 548.589740] ? _copy_from_user+0x6a/0x90 [ 548.589744] ? 
memdup_user+0x42/0x60 [ 548.589750] ksys_mount+0x83/0xd0 [ 548.589755] __x64_sys_mount+0x67/0x80 [ 548.589781] do_syscall_64+0x78/0x170 [ 548.589797] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 548.589820] RIP: 0033:0x7f76fc331b9a [ 548.589821] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 548.589880] RSP: 002b:00007ffd4f0a0e48 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 548.589890] RAX: ffffffffffffffda RBX: 000000000146c030 RCX: 00007f76fc331b9a [ 548.589892] RDX: 000000000146c210 RSI: 000000000146df30 RDI: 0000000001474ec0 [ 548.589895] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 548.589897] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 0000000001474ec0 [ 548.589900] R13: 000000000146c210 R14: 0000000000000000 R15: 0000000000000003 [ 548.590242] The buggy address belongs to the page: [ 548.591243] page:ffffea0007a70980 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 548.592886] flags: 0x2ffff0000000000() [ 548.593665] raw: 02ffff0000000000 dead000000000100 dead000000000200 0000000000000000 [ 548.595258] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 548.603713] page dumped because: kasan: bad access detected [ 548.605203] Memory state around the buggy address: [ 548.606198] ffff8801e9c26480: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.607676] ffff8801e9c26500: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.609157] >ffff8801e9c26580: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.610629] ^ [ 548.612088] ffff8801e9c26600: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.613674] ffff8801e9c26680: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 548.615141] ================================================================== [ 548.616613] Disabling lock debugging due to kernel taint [ 548.622871] WARNING: CPU: 1 PID: 1295 
at mm/page_alloc.c:4065 __alloc_pages_slowpath+0xe4a/0x1420 [ 548.622878] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 548.623217] CPU: 1 PID: 1295 Comm: mount Tainted: G B 4.18.0-rc1+ #4 [ 548.623219] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 548.623226] RIP: 0010:__alloc_pages_slowpath+0xe4a/0x1420 [ 548.623227] Code: ff ff 01 89 85 c8 fe ff ff e9 91 fc ff ff 41 89 c5 e9 5c fc ff ff 0f 0b 89 f8 25 ff ff f7 ff 89 85 8c fe ff ff e9 d5 f2 ff ff <0f> 0b e9 65 f2 ff ff 65 8b 05 38 81 d2 47 f6 c4 01 74 1c 65 48 8b [ 548.623281] RSP: 0018:ffff8801f28c7678 EFLAGS: 00010246 [ 548.623284] RAX: 0000000000000000 RBX: 00000000006040c0 RCX: ffffffffb82f73b7 [ 548.623287] RDX: 1ffff1003e518eeb RSI: 000000000000000c RDI: 0000000000000000 [ 548.623290] RBP: ffff8801f28c7880 R08: 0000000000000000 R09: ffffed0047fff2c5 [ 548.623292] R10: 0000000000000001 R11: ffffed0047fff2c4 R12: ffff8801e88de040 [ 548.623295] R13: 00000000006040c0 R14: 000000000000000c R15: ffff8801f28c7938 [ 548.623299] FS: 00007f76fca51840(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 548.623302] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 548.623304] CR2: 00007f19b9171760 CR3: 00000001ed952000 CR4: 00000000000006e0 [ 548.623317] Call Trace: [ 548.623325] ? kasan_check_read+0x11/0x20 [ 548.623330] ? __zone_watermark_ok+0x92/0x240 [ 548.623336] ? get_page_from_freelist+0x1c3/0x1d90 [ 548.623347] ? _raw_spin_lock_irqsave+0x2a/0x60 [ 548.623353] ? 
warn_alloc+0x250/0x250 [ 548.623358] ? save_stack+0x46/0xd0 [ 548.623361] ? kasan_kmalloc+0xad/0xe0 [ 548.623366] ? __isolate_free_page+0x2a0/0x2a0 [ 548.623370] ? mount_fs+0x60/0x1a0 [ 548.623374] ? vfs_kern_mount+0x6b/0x1a0 [ 548.623378] ? do_mount+0x34a/0x18c0 [ 548.623383] ? ksys_mount+0x83/0xd0 [ 548.623387] ? __x64_sys_mount+0x67/0x80 [ 548.623391] ? do_syscall_64+0x78/0x170 [ 548.623396] ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 548.623401] __alloc_pages_nodemask+0x3c5/0x400 [ 548.623407] ? __alloc_pages_slowpath+0x1420/0x1420 [ 548.623412] ? __mutex_lock_slowpath+0x20/0x20 [ 548.623417] ? kvmalloc_node+0x31/0x80 [ 548.623424] alloc_pages_current+0x75/0x110 [ 548.623436] kmalloc_order+0x24/0x60 [ 548.623442] kmalloc_order_trace+0x24/0xb0 [ 548.623448] __kmalloc_track_caller+0x207/0x220 [ 548.623455] ? f2fs_build_node_manager+0x399/0xbb0 [ 548.623460] kmemdup+0x20/0x50 [ 548.623465] f2fs_build_node_manager+0x399/0xbb0 [ 548.623470] f2fs_fill_super+0x195e/0x2b40 [ 548.623477] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.623481] ? set_blocksize+0x90/0x140 [ 548.623486] mount_bdev+0x1c5/0x210 [ 548.623489] ? f2fs_commit_super+0x1b0/0x1b0 [ 548.623495] f2fs_mount+0x15/0x20 [ 548.623498] mount_fs+0x60/0x1a0 [ 548.623503] ? alloc_vfsmnt+0x309/0x360 [ 548.623508] vfs_kern_mount+0x6b/0x1a0 [ 548.623513] do_mount+0x34a/0x18c0 [ 548.623518] ? lockref_put_or_lock+0xcf/0x160 [ 548.623523] ? copy_mount_string+0x20/0x20 [ 548.623528] ? memcg_kmem_put_cache+0x1b/0xa0 [ 548.623533] ? kasan_check_write+0x14/0x20 [ 548.623537] ? _copy_from_user+0x6a/0x90 [ 548.623542] ? 
memdup_user+0x42/0x60 [ 548.623547] ksys_mount+0x83/0xd0 [ 548.623552] __x64_sys_mount+0x67/0x80 [ 548.623557] do_syscall_64+0x78/0x170 [ 548.623562] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 548.623566] RIP: 0033:0x7f76fc331b9a [ 548.623567] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 548.623632] RSP: 002b:00007ffd4f0a0e48 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 548.623636] RAX: ffffffffffffffda RBX: 000000000146c030 RCX: 00007f76fc331b9a [ 548.623639] RDX: 000000000146c210 RSI: 000000000146df30 RDI: 0000000001474ec0 [ 548.623641] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 548.623643] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 0000000001474ec0 [ 548.623646] R13: 000000000146c210 R14: 0000000000000000 R15: 0000000000000003 [ 548.623650] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 548.623656] F2FS-fs (loop0): Failed to initialize F2FS node manager [ 548.627936] F2FS-fs (loop0): Invalid log blocks per segment (8201) [ 548.627940] F2FS-fs (loop0): Can't find valid F2FS filesystem in 1th superblock [ 548.635835] F2FS-fs (loop0): Failed to initialize F2FS node manager - Location https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.c#L3578 sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); Buffer overrun happens when doing memcpy. I suspect there is missing (inconsistent) checks on bitmap_size. Reported by Wen Xu (wen.xu@gatech.edu) from SSLab, Gatech. 
Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a041ee20492d..7fb51885a240 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2280,12 +2280,17 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int ovp_segments, reserved_segments; unsigned int main_segs, blocks_per_seg; + unsigned int sit_segs, nat_segs; + unsigned int sit_bitmap_size, nat_bitmap_size; + unsigned int log_blocks_per_seg; int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); - fsmeta += le32_to_cpu(raw_super->segment_count_sit); - fsmeta += le32_to_cpu(raw_super->segment_count_nat); + sit_segs = le32_to_cpu(raw_super->segment_count_sit); + fsmeta += sit_segs; + nat_segs = le32_to_cpu(raw_super->segment_count_nat); + fsmeta += nat_segs; fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); fsmeta += le32_to_cpu(raw_super->segment_count_ssa); @@ -2316,6 +2321,18 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); + log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + + if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 || + nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong bitmap size: sit: %u, nat:%u", + sit_bitmap_size, nat_bitmap_size); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; -- cgit v1.2.3-59-g8ed1b From 01f9cf6db70f97d92d5e601ad397921014a999ca Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Jun 2018 20:33:24 +0800 Subject: f2fs: fix to correct return value of f2fs_trim_fs We should 
account trimmed block number from __wait_all_discard_cmd in __issue_discard_cmd_range, otherwise trimmed blocks returned by f2fs_trim_fs will be wrong, this patch fixes it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d63d89287c53..47b6595a078c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1320,21 +1320,22 @@ next: return trimmed; } -static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, +static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { struct discard_policy dp; + unsigned int discard_blks; - if (dpolicy) { - __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); - return; - } + if (dpolicy) + return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); /* wait all */ __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1); - __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1); - __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + + return discard_blks; } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -2368,7 +2369,7 @@ bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, return has_candidate; } -static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, +static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, unsigned int start, unsigned int end) { @@ -2378,6 +2379,7 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct blk_plug plug; int issued; + unsigned int trimmed = 0; next: issued = 0; @@ -2415,7 +2417,7 @@ next: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); - __wait_all_discard_cmd(sbi, NULL); + 
trimmed += __wait_all_discard_cmd(sbi, NULL); congestion_wait(BLK_RW_ASYNC, HZ/50); goto next; } @@ -2429,6 +2431,8 @@ skip: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); + + return trimmed; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) @@ -2486,9 +2490,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) end_block = START_BLOCK(sbi, end_segno + 1); __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); - __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + trimmed = __issue_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); - trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + trimmed += __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); range->len = F2FS_BLK_TO_BYTES(trimmed); out: -- cgit v1.2.3-59-g8ed1b From 76d56d4ab4f2a9e4f085c7d77172194ddaccf7d2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Jun 2018 23:29:49 +0800 Subject: f2fs: fix to do sanity check with extra_attr feature If FI_EXTRA_ATTR is set in an inode by fuzzing, inode.i_addr[0] will be parsed as inode.i_extra_isize; then, in __recover_inline_status, the inline data address will be beyond the boundary of the page, resulting in access to invalid memory. So in this condition, while reading the inode page, let's do a sanity check between the EXTRA_ATTR feature of the fs and the extra_attr bit of the inode; if they're inconsistent, refuse to load this inode. - Overview Out-of-bound access in f2fs_iget() when mounting a corrupted f2fs image - Reproduce The following message is produced by a KASAN build of the 4.18 upstream kernel.
[ 819.392227] ================================================================== [ 819.393901] BUG: KASAN: slab-out-of-bounds in f2fs_iget+0x736/0x1530 [ 819.395329] Read of size 4 at addr ffff8801f099c968 by task mount/1292 [ 819.397079] CPU: 1 PID: 1292 Comm: mount Not tainted 4.18.0-rc1+ #4 [ 819.397082] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 819.397088] Call Trace: [ 819.397124] dump_stack+0x7b/0xb5 [ 819.397154] print_address_description+0x70/0x290 [ 819.397159] kasan_report+0x291/0x390 [ 819.397163] ? f2fs_iget+0x736/0x1530 [ 819.397176] check_memory_region+0x139/0x190 [ 819.397182] __asan_loadN+0xf/0x20 [ 819.397185] f2fs_iget+0x736/0x1530 [ 819.397197] f2fs_fill_super+0x1b4f/0x2b40 [ 819.397202] ? f2fs_fill_super+0x1b4f/0x2b40 [ 819.397208] ? f2fs_commit_super+0x1b0/0x1b0 [ 819.397227] ? set_blocksize+0x90/0x140 [ 819.397241] mount_bdev+0x1c5/0x210 [ 819.397245] ? f2fs_commit_super+0x1b0/0x1b0 [ 819.397252] f2fs_mount+0x15/0x20 [ 819.397256] mount_fs+0x60/0x1a0 [ 819.397267] ? alloc_vfsmnt+0x309/0x360 [ 819.397272] vfs_kern_mount+0x6b/0x1a0 [ 819.397282] do_mount+0x34a/0x18c0 [ 819.397300] ? lockref_put_or_lock+0xcf/0x160 [ 819.397306] ? copy_mount_string+0x20/0x20 [ 819.397318] ? memcg_kmem_put_cache+0x1b/0xa0 [ 819.397324] ? kasan_check_write+0x14/0x20 [ 819.397334] ? _copy_from_user+0x6a/0x90 [ 819.397353] ? 
memdup_user+0x42/0x60 [ 819.397359] ksys_mount+0x83/0xd0 [ 819.397365] __x64_sys_mount+0x67/0x80 [ 819.397388] do_syscall_64+0x78/0x170 [ 819.397403] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 819.397422] RIP: 0033:0x7f54c667cb9a [ 819.397424] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 819.397483] RSP: 002b:00007ffd8f46cd08 EFLAGS: 00000202 ORIG_RAX: 00000000000000a5 [ 819.397496] RAX: ffffffffffffffda RBX: 0000000000dfa030 RCX: 00007f54c667cb9a [ 819.397498] RDX: 0000000000dfa210 RSI: 0000000000dfbf30 RDI: 0000000000e02ec0 [ 819.397501] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 819.397503] R10: 00000000c0ed0000 R11: 0000000000000202 R12: 0000000000e02ec0 [ 819.397505] R13: 0000000000dfa210 R14: 0000000000000000 R15: 0000000000000003 [ 819.397866] Allocated by task 139: [ 819.398702] save_stack+0x46/0xd0 [ 819.398705] kasan_kmalloc+0xad/0xe0 [ 819.398709] kasan_slab_alloc+0x11/0x20 [ 819.398713] kmem_cache_alloc+0xd1/0x1e0 [ 819.398717] dup_fd+0x50/0x4c0 [ 819.398740] copy_process.part.37+0xbed/0x32e0 [ 819.398744] _do_fork+0x16e/0x590 [ 819.398748] __x64_sys_clone+0x69/0x80 [ 819.398752] do_syscall_64+0x78/0x170 [ 819.398756] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 819.399097] Freed by task 159: [ 819.399743] save_stack+0x46/0xd0 [ 819.399747] __kasan_slab_free+0x13c/0x1a0 [ 819.399750] kasan_slab_free+0xe/0x10 [ 819.399754] kmem_cache_free+0x89/0x1e0 [ 819.399757] put_files_struct+0x132/0x150 [ 819.399761] exit_files+0x62/0x70 [ 819.399766] do_exit+0x47b/0x1390 [ 819.399770] do_group_exit+0x86/0x130 [ 819.399774] __x64_sys_exit_group+0x2c/0x30 [ 819.399778] do_syscall_64+0x78/0x170 [ 819.399782] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 819.400115] The buggy address belongs to the object at ffff8801f099c680 which belongs to the cache files_cache of size 704 [ 819.403234] The 
buggy address is located 40 bytes to the right of 704-byte region [ffff8801f099c680, ffff8801f099c940) [ 819.405689] The buggy address belongs to the page: [ 819.406709] page:ffffea0007c26700 count:1 mapcount:0 mapping:ffff8801f69a3340 index:0xffff8801f099d380 compound_mapcount: 0 [ 819.408984] flags: 0x2ffff0000008100(slab|head) [ 819.409932] raw: 02ffff0000008100 ffffea00077fb600 0000000200000002 ffff8801f69a3340 [ 819.411514] raw: ffff8801f099d380 0000000080130000 00000001ffffffff 0000000000000000 [ 819.413073] page dumped because: kasan: bad access detected [ 819.414539] Memory state around the buggy address: [ 819.415521] ffff8801f099c800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 819.416981] ffff8801f099c880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 819.418454] >ffff8801f099c900: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 819.419921] ^ [ 819.421265] ffff8801f099c980: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb [ 819.422745] ffff8801f099ca00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 819.424206] ================================================================== [ 819.425668] Disabling lock debugging due to kernel taint [ 819.457463] F2FS-fs (loop0): Mounted with checkpoint version = 3 The kernel still mounts the image. 
If you run the following program on the mounted folder mnt, (poc.c) static void activity(char *mpoint) { char *foo_bar_baz; int err; static int buf[8192]; memset(buf, 0, sizeof(buf)); err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint); int fd = open(foo_bar_baz, O_RDONLY, 0); if (fd >= 0) { read(fd, (char *)buf, 11); close(fd); } } int main(int argc, char *argv[]) { activity(argv[1]); return 0; } You can get kernel crash: [ 819.457463] F2FS-fs (loop0): Mounted with checkpoint version = 3 [ 918.028501] BUG: unable to handle kernel paging request at ffffed0048000d82 [ 918.044020] PGD 23ffee067 P4D 23ffee067 PUD 23fbef067 PMD 0 [ 918.045207] Oops: 0000 [#1] SMP KASAN PTI [ 918.046048] CPU: 0 PID: 1309 Comm: poc Tainted: G B 4.18.0-rc1+ #4 [ 918.047573] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 918.049552] RIP: 0010:check_memory_region+0x5e/0x190 [ 918.050565] Code: f8 49 c1 e8 03 49 89 db 49 c1 eb 03 4d 01 cb 4d 01 c1 4d 8d 63 01 4c 89 c8 4d 89 e2 4d 29 ca 49 83 fa 10 7f 3d 4d 85 d2 74 32 <41> 80 39 00 75 23 48 b8 01 00 00 00 00 fc ff df 4d 01 d1 49 01 c0 [ 918.054322] RSP: 0018:ffff8801e3a1f258 EFLAGS: 00010202 [ 918.055400] RAX: ffffed0048000d82 RBX: ffff880240006c11 RCX: ffffffffb8867d14 [ 918.056832] RDX: 0000000000000000 RSI: 0000000000000002 RDI: ffff880240006c10 [ 918.058253] RBP: ffff8801e3a1f268 R08: 1ffff10048000d82 R09: ffffed0048000d82 [ 918.059717] R10: 0000000000000001 R11: ffffed0048000d82 R12: ffffed0048000d83 [ 918.061159] R13: ffff8801e3a1f390 R14: 0000000000000000 R15: ffff880240006c08 [ 918.062614] FS: 00007fac9732c700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 918.064246] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 918.065412] CR2: ffffed0048000d82 CR3: 00000001df77a000 CR4: 00000000000006f0 [ 918.066882] Call Trace: [ 918.067410] __asan_loadN+0xf/0x20 [ 918.068149] f2fs_find_target_dentry+0xf4/0x270 [ 918.069083] ? 
__get_node_page+0x331/0x5b0 [ 918.069925] f2fs_find_in_inline_dir+0x24b/0x310 [ 918.070881] ? f2fs_recover_inline_data+0x4c0/0x4c0 [ 918.071905] ? unwind_next_frame.part.5+0x34f/0x490 [ 918.072901] ? unwind_dump+0x290/0x290 [ 918.073695] ? is_bpf_text_address+0xe/0x20 [ 918.074566] __f2fs_find_entry+0x599/0x670 [ 918.075408] ? kasan_unpoison_shadow+0x36/0x50 [ 918.076315] ? kasan_kmalloc+0xad/0xe0 [ 918.077100] ? memcg_kmem_put_cache+0x55/0xa0 [ 918.077998] ? f2fs_find_target_dentry+0x270/0x270 [ 918.079006] ? d_set_d_op+0x30/0x100 [ 918.079749] ? __d_lookup_rcu+0x69/0x2e0 [ 918.080556] ? __d_alloc+0x275/0x450 [ 918.081297] ? kasan_check_write+0x14/0x20 [ 918.082135] ? memset+0x31/0x40 [ 918.082820] ? fscrypt_setup_filename+0x1ec/0x4c0 [ 918.083782] ? d_alloc_parallel+0x5bb/0x8c0 [ 918.084640] f2fs_find_entry+0xe9/0x110 [ 918.085432] ? __f2fs_find_entry+0x670/0x670 [ 918.086308] ? kasan_check_write+0x14/0x20 [ 918.087163] f2fs_lookup+0x297/0x590 [ 918.087902] ? f2fs_link+0x2b0/0x2b0 [ 918.088646] ? legitimize_path.isra.29+0x61/0xa0 [ 918.089589] __lookup_slow+0x12e/0x240 [ 918.090371] ? may_delete+0x2b0/0x2b0 [ 918.091123] ? __nd_alloc_stack+0xa0/0xa0 [ 918.091944] lookup_slow+0x44/0x60 [ 918.092642] walk_component+0x3ee/0xa40 [ 918.093428] ? is_bpf_text_address+0xe/0x20 [ 918.094283] ? pick_link+0x3e0/0x3e0 [ 918.095047] ? in_group_p+0xa5/0xe0 [ 918.095771] ? generic_permission+0x53/0x1e0 [ 918.096666] ? security_inode_permission+0x1d/0x70 [ 918.097646] ? inode_permission+0x7a/0x1f0 [ 918.098497] link_path_walk+0x2a2/0x7b0 [ 918.099298] ? apparmor_capget+0x3d0/0x3d0 [ 918.100140] ? walk_component+0xa40/0xa40 [ 918.100958] ? path_init+0x2e6/0x580 [ 918.101695] path_openat+0x1bb/0x2160 [ 918.102471] ? __save_stack_trace+0x92/0x100 [ 918.103352] ? save_stack+0xb5/0xd0 [ 918.104070] ? vfs_unlink+0x250/0x250 [ 918.104822] ? save_stack+0x46/0xd0 [ 918.105538] ? kasan_slab_alloc+0x11/0x20 [ 918.106370] ? kmem_cache_alloc+0xd1/0x1e0 [ 918.107213] ? 
getname_flags+0x76/0x2c0 [ 918.107997] ? getname+0x12/0x20 [ 918.108677] ? do_sys_open+0x14b/0x2c0 [ 918.109450] ? __x64_sys_open+0x4c/0x60 [ 918.110255] ? do_syscall_64+0x78/0x170 [ 918.111083] ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 918.112148] ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 918.113204] ? f2fs_empty_inline_dir+0x1e0/0x1e0 [ 918.114150] ? timespec64_trunc+0x5c/0x90 [ 918.114993] ? wb_io_lists_depopulated+0x1a/0xc0 [ 918.115937] ? inode_io_list_move_locked+0x102/0x110 [ 918.116949] do_filp_open+0x12b/0x1d0 [ 918.117709] ? may_open_dev+0x50/0x50 [ 918.118475] ? kasan_kmalloc+0xad/0xe0 [ 918.119246] do_sys_open+0x17c/0x2c0 [ 918.119983] ? do_sys_open+0x17c/0x2c0 [ 918.120751] ? filp_open+0x60/0x60 [ 918.121463] ? task_work_run+0x4d/0xf0 [ 918.122237] __x64_sys_open+0x4c/0x60 [ 918.123001] do_syscall_64+0x78/0x170 [ 918.123759] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 918.124802] RIP: 0033:0x7fac96e3e040 [ 918.125537] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 09 27 2d 00 00 75 10 b8 02 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 7e e0 01 00 48 89 04 24 [ 918.129341] RSP: 002b:00007fff1b37f848 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 [ 918.130870] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fac96e3e040 [ 918.132295] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000122d080 [ 918.133748] RBP: 00007fff1b37f9b0 R08: 00007fac9710bbd8 R09: 0000000000000001 [ 918.135209] R10: 000000000000069d R11: 0000000000000246 R12: 0000000000400c20 [ 918.136650] R13: 00007fff1b37fab0 R14: 0000000000000000 R15: 0000000000000000 [ 918.138093] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too 
crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 918.147924] CR2: ffffed0048000d82 [ 918.148619] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 918.149563] RIP: 0010:check_memory_region+0x5e/0x190 [ 918.150576] Code: f8 49 c1 e8 03 49 89 db 49 c1 eb 03 4d 01 cb 4d 01 c1 4d 8d 63 01 4c 89 c8 4d 89 e2 4d 29 ca 49 83 fa 10 7f 3d 4d 85 d2 74 32 <41> 80 39 00 75 23 48 b8 01 00 00 00 00 fc ff df 4d 01 d1 49 01 c0 [ 918.154360] RSP: 0018:ffff8801e3a1f258 EFLAGS: 00010202 [ 918.155411] RAX: ffffed0048000d82 RBX: ffff880240006c11 RCX: ffffffffb8867d14 [ 918.156833] RDX: 0000000000000000 RSI: 0000000000000002 RDI: ffff880240006c10 [ 918.158257] RBP: ffff8801e3a1f268 R08: 1ffff10048000d82 R09: ffffed0048000d82 [ 918.159722] R10: 0000000000000001 R11: ffffed0048000d82 R12: ffffed0048000d83 [ 918.161149] R13: ffff8801e3a1f390 R14: 0000000000000000 R15: ffff880240006c08 [ 918.162587] FS: 00007fac9732c700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 918.164203] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 918.165356] CR2: ffffed0048000d82 CR3: 00000001df77a000 CR4: 00000000000006f0 Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e9cfcdbbe24c..22810d30c054 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -205,6 +205,16 @@ static bool sanity_check_inode(struct inode *inode) __func__, inode->i_ino); return false; } + + if (f2fs_has_extra_attr(inode) && + !f2fs_sb_has_extra_attr(sbi->sb)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) is with extra_attr, " + "but extra_attr feature is off", + __func__, inode->i_ino); + return false; + } return true; } @@ -257,6 +267,11 @@ static int 
do_read_inode(struct inode *inode) get_inline_info(inode, ri); + if (!sanity_check_inode(inode)) { + f2fs_put_page(node_page, 1); + return -EINVAL; + } + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? le16_to_cpu(ri->i_extra_isize) : 0; @@ -338,10 +353,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) ret = do_read_inode(inode); if (ret) goto bad_inode; - if (!sanity_check_inode(inode)) { - ret = -EINVAL; - goto bad_inode; - } make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; -- cgit v1.2.3-59-g8ed1b From 9dc956b2c8523aed39d1e6508438be9fea28c8fc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 Jun 2018 18:05:54 +0800 Subject: f2fs: fix to do sanity check with user_block_count This patch fixes this by adding a sanity check on user_block_count. - Overview Divide by zero in utilization when a corrupted f2fs image is passed to mount() - Reproduce (4.18 upstream kernel) - Kernel message [ 564.099503] F2FS-fs (loop0): invalid crc value [ 564.101991] divide error: 0000 [#1] SMP KASAN PTI [ 564.103103] CPU: 1 PID: 1298 Comm: f2fs_discard-7: Not tainted 4.18.0-rc1+ #4 [ 564.104584] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 564.106624] RIP: 0010:issue_discard_thread+0x248/0x5c0 [ 564.107692] Code: ff ff 48 8b bd e8 fe ff ff 41 8b 9d 4c 04 00 00 e8 cd b8 ad ff 41 8b 85 50 04 00 00 31 d2 48 8d 04 80 48 8d 04 80 48 c1 e0 02 <48> f7 f3 83 f8 50 7e 16 41 c7 86 7c ff ff ff 01 00 00 00 41 c7 86 [ 564.111686] RSP: 0018:ffff8801f3117dc0 EFLAGS: 00010206 [ 564.112775] RAX: 0000000000000384 RBX: 0000000000000000 RCX: ffffffffb88c1e03 [ 564.114250] RDX: 0000000000000000 RSI: dffffc0000000000 RDI: ffff8801e3aa4850 [ 564.115706] RBP: ffff8801f3117f00 R08: 1ffffffff751a1d0 R09: fffffbfff751a1d0 [ 564.117177] R10: 0000000000000001 R11: fffffbfff751a1d0 R12: 00000000fffffffc [ 564.118634] R13: ffff8801e3aa4400 R14: ffff8801f3117ed8 R15: ffff8801e2050000 [ 564.120094] FS: 0000000000000000(0000)
GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 564.121748] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 564.122923] CR2: 000000000202b078 CR3: 00000001f11ac000 CR4: 00000000000006e0 [ 564.124383] Call Trace: [ 564.124924] ? __issue_discard_cmd+0x480/0x480 [ 564.125882] ? __sched_text_start+0x8/0x8 [ 564.126756] ? __kthread_parkme+0xcb/0x100 [ 564.127620] ? kthread_blkcg+0x70/0x70 [ 564.128412] kthread+0x180/0x1d0 [ 564.129105] ? __issue_discard_cmd+0x480/0x480 [ 564.130029] ? kthread_associate_blkcg+0x150/0x150 [ 564.131033] ret_from_fork+0x35/0x40 [ 564.131794] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 564.141798] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 564.142773] RIP: 0010:issue_discard_thread+0x248/0x5c0 [ 564.143885] Code: ff ff 48 8b bd e8 fe ff ff 41 8b 9d 4c 04 00 00 e8 cd b8 ad ff 41 8b 85 50 04 00 00 31 d2 48 8d 04 80 48 8d 04 80 48 c1 e0 02 <48> f7 f3 83 f8 50 7e 16 41 c7 86 7c ff ff ff 01 00 00 00 41 c7 86 [ 564.147776] RSP: 0018:ffff8801f3117dc0 EFLAGS: 00010206 [ 564.148856] RAX: 0000000000000384 RBX: 0000000000000000 RCX: ffffffffb88c1e03 [ 564.150424] RDX: 0000000000000000 RSI: dffffc0000000000 RDI: ffff8801e3aa4850 [ 564.151906] RBP: ffff8801f3117f00 R08: 1ffffffff751a1d0 R09: fffffbfff751a1d0 [ 564.153463] R10: 0000000000000001 R11: fffffbfff751a1d0 R12: 00000000fffffffc [ 564.154915] R13: ffff8801e3aa4400 R14: ffff8801f3117ed8 R15: ffff8801e2050000 [ 564.156405] FS: 0000000000000000(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 564.158070] CS: 0010 
DS: 0000 ES: 0000 CR0: 0000000080050033 [ 564.159279] CR2: 000000000202b078 CR3: 00000001f11ac000 CR4: 00000000000006e0 [ 564.161043] ================================================================== [ 564.162587] BUG: KASAN: stack-out-of-bounds in from_kuid_munged+0x1d/0x50 [ 564.163994] Read of size 4 at addr ffff8801f3117c84 by task f2fs_discard-7:/1298 [ 564.165852] CPU: 1 PID: 1298 Comm: f2fs_discard-7: Tainted: G D 4.18.0-rc1+ #4 [ 564.167593] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 564.169522] Call Trace: [ 564.170057] dump_stack+0x7b/0xb5 [ 564.170778] print_address_description+0x70/0x290 [ 564.171765] kasan_report+0x291/0x390 [ 564.172540] ? from_kuid_munged+0x1d/0x50 [ 564.173408] __asan_load4+0x78/0x80 [ 564.174148] from_kuid_munged+0x1d/0x50 [ 564.174962] do_notify_parent+0x1f5/0x4f0 [ 564.175808] ? send_sigqueue+0x390/0x390 [ 564.176639] ? css_set_move_task+0x152/0x340 [ 564.184197] do_exit+0x1290/0x1390 [ 564.184950] ? __issue_discard_cmd+0x480/0x480 [ 564.185884] ? mm_update_next_owner+0x380/0x380 [ 564.186829] ? __sched_text_start+0x8/0x8 [ 564.187672] ? __kthread_parkme+0xcb/0x100 [ 564.188528] ? kthread_blkcg+0x70/0x70 [ 564.189333] ? kthread+0x180/0x1d0 [ 564.190052] ? 
__issue_discard_cmd+0x480/0x480 [ 564.190983] rewind_stack_do_exit+0x17/0x20 [ 564.192190] The buggy address belongs to the page: [ 564.193213] page:ffffea0007cc45c0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 564.194856] flags: 0x2ffff0000000000() [ 564.195644] raw: 02ffff0000000000 0000000000000000 dead000000000200 0000000000000000 [ 564.197247] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 564.198826] page dumped because: kasan: bad access detected [ 564.200299] Memory state around the buggy address: [ 564.201306] ffff8801f3117b80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 564.202779] ffff8801f3117c00: 00 00 00 00 00 00 00 00 00 00 00 f3 f3 f3 f3 f3 [ 564.204252] >ffff8801f3117c80: f3 f3 f3 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 [ 564.205742] ^ [ 564.206424] ffff8801f3117d00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 564.207908] ffff8801f3117d80: f3 f3 f3 f3 f3 f3 f3 f3 00 00 00 00 00 00 00 00 [ 564.209389] ================================================================== [ 564.231795] F2FS-fs (loop0): Mounted with checkpoint version = 2 - Location https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L586 return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); Missing checks on sbi->user_block_count. 
Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7fb51885a240..4d0de436bbee 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2283,6 +2283,8 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int sit_segs, nat_segs; unsigned int sit_bitmap_size, nat_bitmap_size; unsigned int log_blocks_per_seg; + unsigned int segment_count_main; + block_t user_block_count; int i; total = le32_to_cpu(raw_super->segment_count); @@ -2307,6 +2309,16 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + user_block_count = le64_to_cpu(ckpt->user_block_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main); + log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + if (!user_block_count || user_block_count >= + segment_count_main << log_blocks_per_seg) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong user_block_count: %u", user_block_count); + return 1; + } + main_segs = le32_to_cpu(raw_super->segment_count_main); blocks_per_seg = sbi->blocks_per_seg; @@ -2323,7 +2335,6 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); - log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 || nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) { -- cgit v1.2.3-59-g8ed1b From e15d54d5009688ccb2a5312f3b70d631615329c9 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 27 Jun 2018 14:46:21 +0800 Subject: f2fs: Allocate and stat mem used by free nid bitmap more accurately This patch used f2fs_bitmap_size macro to calculate mem used by free nid bitmap, and stat used mem including aligned part. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 3 ++- fs/f2fs/node.c | 2 +- include/linux/f2fs_fs.h | 5 ----- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 2d65e77ae5cf..214a968962a1 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -215,7 +215,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); - si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; + si->base_mem += NM_I(sbi)->nat_blocks * + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK); si->base_mem += NM_I(sbi)->nat_blocks / 8; si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8e58990b9120..142b34130749 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2797,7 +2797,7 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) for (i = 0; i < nm_i->nat_blocks; i++) { nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, - NAT_ENTRY_BITMAP_SIZE_ALIGNED, GFP_KERNEL); + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK), GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index aa5db8b5521a..f70f8ac9c4f4 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -304,11 +304,6 @@ struct f2fs_node { * For NAT entries */ #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) -#define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) -#define NAT_ENTRY_BITMAP_SIZE_ALIGNED \ - ((NAT_ENTRY_BITMAP_SIZE + BITS_PER_LONG - 1) / \ - BITS_PER_LONG * BITS_PER_LONG) - struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ -- cgit v1.2.3-59-g8ed1b From e34438c903b653daca2b2a7de95aed46226f8ed3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Jun 2018 13:55:22 
+0800 Subject: f2fs: fix to do sanity check with node footer and iblocks This patch adds sanity checks on the following inode fields to avoid the reported panic. - node footer - iblocks https://bugzilla.kernel.org/show_bug.cgi?id=200223 - Overview BUG() triggered in f2fs_truncate_inode_blocks() when un-mounting a mounted f2fs image after writing to it - Reproduce - POC (poc.c) static void activity(char *mpoint) { char *foo_bar_baz; int err; static int buf[8192]; memset(buf, 0, sizeof(buf)); err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint); // open / write / read int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777); if (fd >= 0) { write(fd, (char *)buf, 517); write(fd, (char *)buf, sizeof(buf)); close(fd); } } int main(int argc, char *argv[]) { activity(argv[1]); return 0; } - Kernel message [ 552.479723] F2FS-fs (loop0): Mounted with checkpoint version = 2 [ 556.451891] ------------[ cut here ]------------ [ 556.451899] kernel BUG at fs/f2fs/node.c:987! [ 556.452920] invalid opcode: 0000 [#1] SMP KASAN PTI [ 556.453936] CPU: 1 PID: 1310 Comm: umount Not tainted 4.18.0-rc1+ #4 [ 556.455213] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 556.457140] RIP: 0010:f2fs_truncate_inode_blocks+0x4a7/0x6f0 [ 556.458280] Code: e8 ae ea ff ff 41 89 c7 c1 e8 1f 84 c0 74 0a 41 83 ff fe 0f 85 35 ff ff ff 81 85 b0 fe ff ff fb 03 00 00 e9 f7 fd ff ff 0f 0b <0f> 0b e8 62 b7 9a 00 48 8b bd a0 fe ff ff e8 56 54 ae ff 48 8b b5 [ 556.462015] RSP: 0018:ffff8801f292f808 EFLAGS: 00010286 [ 556.463068] RAX: ffffed003e73242d RBX: ffff8801f292f958 RCX: ffffffffb88b81bc [ 556.464479] RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff8801f3992164 [ 556.465901] RBP: ffff8801f292f980 R08: ffffed003e73242d R09: ffffed003e73242d [ 556.467311] R10: 0000000000000001 R11: ffffed003e73242c R12: 00000000fffffc64 [ 556.468706] R13: ffff8801f3992000 R14: 0000000000000058 R15: 00000000ffff8801 [ 556.470117] FS: 00007f8029297840(0000)
GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 556.471702] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 556.472838] CR2: 000055f5f57305d8 CR3: 00000001f18b0000 CR4: 00000000000006e0 [ 556.474265] Call Trace: [ 556.474782] ? f2fs_alloc_nid_failed+0xf0/0xf0 [ 556.475686] ? truncate_nodes+0x980/0x980 [ 556.476516] ? pagecache_get_page+0x21f/0x2f0 [ 556.477412] ? __asan_loadN+0xf/0x20 [ 556.478153] ? __get_node_page+0x331/0x5b0 [ 556.478992] ? reweight_entity+0x1e6/0x3b0 [ 556.479826] f2fs_truncate_blocks+0x55e/0x740 [ 556.480709] ? f2fs_truncate_data_blocks+0x20/0x20 [ 556.481689] ? __radix_tree_lookup+0x34/0x160 [ 556.482630] ? radix_tree_lookup+0xd/0x10 [ 556.483445] f2fs_truncate+0xd4/0x1a0 [ 556.484206] f2fs_evict_inode+0x5ce/0x630 [ 556.485032] evict+0x16f/0x290 [ 556.485664] iput+0x280/0x300 [ 556.486300] dentry_unlink_inode+0x165/0x1e0 [ 556.487169] __dentry_kill+0x16a/0x260 [ 556.487936] dentry_kill+0x70/0x250 [ 556.488651] shrink_dentry_list+0x125/0x260 [ 556.489504] shrink_dcache_parent+0xc1/0x110 [ 556.490379] ? shrink_dcache_sb+0x200/0x200 [ 556.491231] ? 
bit_wait_timeout+0xc0/0xc0 [ 556.492047] do_one_tree+0x12/0x40 [ 556.492743] shrink_dcache_for_umount+0x3f/0xa0 [ 556.493656] generic_shutdown_super+0x43/0x1c0 [ 556.494561] kill_block_super+0x52/0x80 [ 556.495341] kill_f2fs_super+0x62/0x70 [ 556.496105] deactivate_locked_super+0x6f/0xa0 [ 556.497004] deactivate_super+0x5e/0x80 [ 556.497785] cleanup_mnt+0x61/0xa0 [ 556.498492] __cleanup_mnt+0x12/0x20 [ 556.499218] task_work_run+0xc8/0xf0 [ 556.499949] exit_to_usermode_loop+0x125/0x130 [ 556.500846] do_syscall_64+0x138/0x170 [ 556.501609] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 556.502659] RIP: 0033:0x7f8028b77487 [ 556.503384] Code: 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 31 f6 e9 09 00 00 00 66 0f 1f 84 00 00 00 00 00 b8 a6 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e1 c9 2b 00 f7 d8 64 89 01 48 [ 556.507137] RSP: 002b:00007fff9f2e3598 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 556.508637] RAX: 0000000000000000 RBX: 0000000000ebd030 RCX: 00007f8028b77487 [ 556.510069] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000ec41e0 [ 556.511481] RBP: 0000000000ec41e0 R08: 0000000000000000 R09: 0000000000000014 [ 556.512892] R10: 00000000000006b2 R11: 0000000000000246 R12: 00007f802908083c [ 556.514320] R13: 0000000000000000 R14: 0000000000ebd210 R15: 00007fff9f2e3820 [ 556.515745] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 556.529276] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 556.530340] RIP: 0010:f2fs_truncate_inode_blocks+0x4a7/0x6f0 [ 556.531513] Code: 
e8 ae ea ff ff 41 89 c7 c1 e8 1f 84 c0 74 0a 41 83 ff fe 0f 85 35 ff ff ff 81 85 b0 fe ff ff fb 03 00 00 e9 f7 fd ff ff 0f 0b <0f> 0b e8 62 b7 9a 00 48 8b bd a0 fe ff ff e8 56 54 ae ff 48 8b b5 [ 556.535330] RSP: 0018:ffff8801f292f808 EFLAGS: 00010286 [ 556.536395] RAX: ffffed003e73242d RBX: ffff8801f292f958 RCX: ffffffffb88b81bc [ 556.537824] RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff8801f3992164 [ 556.539290] RBP: ffff8801f292f980 R08: ffffed003e73242d R09: ffffed003e73242d [ 556.540709] R10: 0000000000000001 R11: ffffed003e73242c R12: 00000000fffffc64 [ 556.542131] R13: ffff8801f3992000 R14: 0000000000000058 R15: 00000000ffff8801 [ 556.543579] FS: 00007f8029297840(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 556.545180] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 556.546338] CR2: 000055f5f57305d8 CR3: 00000001f18b0000 CR4: 00000000000006e0 [ 556.547809] ================================================================== [ 556.549248] BUG: KASAN: stack-out-of-bounds in arch_tlb_gather_mmu+0x52/0x170 [ 556.550672] Write of size 8 at addr ffff8801f292fd10 by task umount/1310 [ 556.552338] CPU: 1 PID: 1310 Comm: umount Tainted: G D 4.18.0-rc1+ #4 [ 556.553886] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 556.555756] Call Trace: [ 556.556264] dump_stack+0x7b/0xb5 [ 556.556944] print_address_description+0x70/0x290 [ 556.557903] kasan_report+0x291/0x390 [ 556.558649] ? arch_tlb_gather_mmu+0x52/0x170 [ 556.559537] __asan_store8+0x57/0x90 [ 556.560268] arch_tlb_gather_mmu+0x52/0x170 [ 556.561110] tlb_gather_mmu+0x12/0x40 [ 556.561862] exit_mmap+0x123/0x2a0 [ 556.562555] ? __ia32_sys_munmap+0x50/0x50 [ 556.563384] ? exit_aio+0x98/0x230 [ 556.564079] ? __x32_compat_sys_io_submit+0x260/0x260 [ 556.565099] ? taskstats_exit+0x1f4/0x640 [ 556.565925] ? kasan_check_read+0x11/0x20 [ 556.566739] ? 
mm_update_next_owner+0x322/0x380 [ 556.567652] mmput+0x8b/0x1d0 [ 556.568260] do_exit+0x43a/0x1390 [ 556.568937] ? mm_update_next_owner+0x380/0x380 [ 556.569855] ? deactivate_super+0x5e/0x80 [ 556.570668] ? cleanup_mnt+0x61/0xa0 [ 556.571395] ? __cleanup_mnt+0x12/0x20 [ 556.572156] ? task_work_run+0xc8/0xf0 [ 556.572917] ? exit_to_usermode_loop+0x125/0x130 [ 556.573861] rewind_stack_do_exit+0x17/0x20 [ 556.574707] RIP: 0033:0x7f8028b77487 [ 556.575428] Code: Bad RIP value. [ 556.576106] RSP: 002b:00007fff9f2e3598 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 556.577599] RAX: 0000000000000000 RBX: 0000000000ebd030 RCX: 00007f8028b77487 [ 556.579020] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000ec41e0 [ 556.580422] RBP: 0000000000ec41e0 R08: 0000000000000000 R09: 0000000000000014 [ 556.581833] R10: 00000000000006b2 R11: 0000000000000246 R12: 00007f802908083c [ 556.583252] R13: 0000000000000000 R14: 0000000000ebd210 R15: 00007fff9f2e3820 [ 556.584983] The buggy address belongs to the page: [ 556.585961] page:ffffea0007ca4bc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 556.587540] flags: 0x2ffff0000000000() [ 556.588296] raw: 02ffff0000000000 0000000000000000 dead000000000200 0000000000000000 [ 556.589822] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 556.591359] page dumped because: kasan: bad access detected [ 556.592786] Memory state around the buggy address: [ 556.593753] ffff8801f292fc00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 556.595191] ffff8801f292fc80: 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 00 00 00 [ 556.596613] >ffff8801f292fd00: 00 00 f3 00 00 00 00 f3 f3 00 00 00 00 f4 f4 f4 [ 556.598044] ^ [ 556.598797] ffff8801f292fd80: f3 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00 [ 556.600225] ffff8801f292fe00: 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 f4 f4 f4 [ 556.601647] ================================================================== - Location 
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/node.c#L987 case NODE_DIND_BLOCK: err = truncate_nodes(&dn, nofs, offset[1], 3); cont = 0; break; default: BUG(); <--- } Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 22810d30c054..f490393397d4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -193,9 +193,30 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); } -static bool sanity_check_inode(struct inode *inode) +static bool sanity_check_inode(struct inode *inode, struct page *node_page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned long long iblocks; + + iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); + if (!iblocks) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, " + "run fsck to fix.", + __func__, inode->i_ino, iblocks); + return false; + } + + if (ino_of_node(node_page) != nid_of_node(node_page)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode footer i_ino=%lx, ino,nid: " + "[%u, %u] run fsck to fix.", + __func__, inode->i_ino, + ino_of_node(node_page), nid_of_node(node_page)); + return false; + } if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) && !f2fs_has_extra_attr(inode)) { @@ -267,7 +288,7 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); - if (!sanity_check_inode(inode)) { + if (!sanity_check_inode(inode, node_page)) { f2fs_put_page(node_page, 1); return -EINVAL; } -- cgit v1.2.3-59-g8ed1b From 4dbe38dc386910c668c75ae616b99b823b59f3eb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 30 Jun 2018 18:13:40 +0800 Subject: f2fs: fix to do sanity check with reserved blkaddr of inline inode As Wen 
Xu reported in bugzilla, after image was injected with random data by fuzzing, inline inode would contain invalid reserved blkaddr, then during inline conversion, we will encounter illegal memory accessing reported by KASAN, the root cause of this is when writing out converted inline page, we will use invalid reserved blkaddr to update sit bitmap, result in accessing memory beyond sit bitmap boundary. In order to fix this issue, let's do sanity check with reserved block address of inline inode to avoid above condition. https://bugzilla.kernel.org/show_bug.cgi?id=200179 [ 1428.846352] BUG: KASAN: use-after-free in update_sit_entry+0x80/0x7f0 [ 1428.846618] Read of size 4 at addr ffff880194483540 by task a.out/2741 [ 1428.846855] CPU: 0 PID: 2741 Comm: a.out Tainted: G W 4.17.0+ #1 [ 1428.846858] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 1428.846860] Call Trace: [ 1428.846868] dump_stack+0x71/0xab [ 1428.846875] print_address_description+0x6b/0x290 [ 1428.846881] kasan_report+0x28e/0x390 [ 1428.846888] ? update_sit_entry+0x80/0x7f0 [ 1428.846898] update_sit_entry+0x80/0x7f0 [ 1428.846906] f2fs_allocate_data_block+0x6db/0xc70 [ 1428.846914] ? f2fs_get_node_info+0x14f/0x590 [ 1428.846920] do_write_page+0xc8/0x150 [ 1428.846928] f2fs_outplace_write_data+0xfe/0x210 [ 1428.846935] ? f2fs_do_write_node_page+0x170/0x170 [ 1428.846941] ? radix_tree_tag_clear+0xff/0x130 [ 1428.846946] ? __mod_node_page_state+0x22/0xa0 [ 1428.846951] ? inc_zone_page_state+0x54/0x100 [ 1428.846956] ? __test_set_page_writeback+0x336/0x5d0 [ 1428.846964] f2fs_convert_inline_page+0x407/0x6d0 [ 1428.846971] ? f2fs_read_inline_data+0x3b0/0x3b0 [ 1428.846978] ? __get_node_page+0x335/0x6b0 [ 1428.846987] f2fs_convert_inline_inode+0x41b/0x500 [ 1428.846994] ? f2fs_convert_inline_page+0x6d0/0x6d0 [ 1428.847000] ? kasan_unpoison_shadow+0x31/0x40 [ 1428.847005] ? 
kasan_kmalloc+0xa6/0xd0 [ 1428.847024] f2fs_file_mmap+0x79/0xc0 [ 1428.847029] mmap_region+0x58b/0x880 [ 1428.847037] ? arch_get_unmapped_area+0x370/0x370 [ 1428.847042] do_mmap+0x55b/0x7a0 [ 1428.847048] vm_mmap_pgoff+0x16f/0x1c0 [ 1428.847055] ? vma_is_stack_for_current+0x50/0x50 [ 1428.847062] ? __fsnotify_update_child_dentry_flags.part.1+0x160/0x160 [ 1428.847068] ? do_sys_open+0x206/0x2a0 [ 1428.847073] ? __fget+0xb4/0x100 [ 1428.847079] ksys_mmap_pgoff+0x278/0x360 [ 1428.847085] ? find_mergeable_anon_vma+0x50/0x50 [ 1428.847091] do_syscall_64+0x73/0x160 [ 1428.847098] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1428.847102] RIP: 0033:0x7fb1430766ba [ 1428.847103] Code: 89 f5 41 54 49 89 fc 55 53 74 35 49 63 e8 48 63 da 4d 89 f9 49 89 e8 4d 63 d6 48 89 da 4c 89 ee 4c 89 e7 b8 09 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 56 5b 5d 41 5c 41 5d 41 5e 41 5f c3 0f 1f 00 [ 1428.847162] RSP: 002b:00007ffc651d9388 EFLAGS: 00000246 ORIG_RAX: 0000000000000009 [ 1428.847167] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fb1430766ba [ 1428.847170] RDX: 0000000000000001 RSI: 0000000000001000 RDI: 0000000000000000 [ 1428.847173] RBP: 0000000000000003 R08: 0000000000000003 R09: 0000000000000000 [ 1428.847176] R10: 0000000000008002 R11: 0000000000000246 R12: 0000000000000000 [ 1428.847179] R13: 0000000000001000 R14: 0000000000008002 R15: 0000000000000000 [ 1428.847252] Allocated by task 2683: [ 1428.847372] kasan_kmalloc+0xa6/0xd0 [ 1428.847380] kmem_cache_alloc+0xc8/0x1e0 [ 1428.847385] getname_flags+0x73/0x2b0 [ 1428.847390] user_path_at_empty+0x1d/0x40 [ 1428.847395] vfs_statx+0xc1/0x150 [ 1428.847401] __do_sys_newlstat+0x7e/0xd0 [ 1428.847405] do_syscall_64+0x73/0x160 [ 1428.847411] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1428.847466] Freed by task 2683: [ 1428.847566] __kasan_slab_free+0x137/0x190 [ 1428.847571] kmem_cache_free+0x85/0x1e0 [ 1428.847575] filename_lookup+0x191/0x280 [ 1428.847580] vfs_statx+0xc1/0x150 [ 1428.847585] __do_sys_newlstat+0x7e/0xd0 [ 
1428.847590] do_syscall_64+0x73/0x160 [ 1428.847596] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1428.847648] The buggy address belongs to the object at ffff880194483300 which belongs to the cache names_cache of size 4096 [ 1428.847946] The buggy address is located 576 bytes inside of 4096-byte region [ffff880194483300, ffff880194484300) [ 1428.848234] The buggy address belongs to the page: [ 1428.848366] page:ffffea0006512000 count:1 mapcount:0 mapping:ffff8801f3586380 index:0x0 compound_mapcount: 0 [ 1428.848606] flags: 0x17fff8000008100(slab|head) [ 1428.848737] raw: 017fff8000008100 dead000000000100 dead000000000200 ffff8801f3586380 [ 1428.848931] raw: 0000000000000000 0000000000070007 00000001ffffffff 0000000000000000 [ 1428.849122] page dumped because: kasan: bad access detected [ 1428.849305] Memory state around the buggy address: [ 1428.849436] ffff880194483400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1428.849620] ffff880194483480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1428.849804] >ffff880194483500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1428.849985] ^ [ 1428.850120] ffff880194483580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1428.850303] ffff880194483600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1428.850498] ================================================================== Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 9a245d2d5b7c..2bcb2d36f024 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -130,6 +130,16 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; + if (unlikely(dn->data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(dn); + set_sbi_flag(fio.sbi, SBI_NEED_FSCK); + f2fs_msg(fio.sbi->sb, KERN_WARNING, + "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, " + "run fsck to 
fix.", + __func__, dn->inode->i_ino, dn->data_blkaddr); + return -EINVAL; + } + f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); f2fs_do_read_inline_data(page, dn->inode_page); @@ -363,6 +373,17 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, if (err) goto out; + if (unlikely(dn.data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(&dn); + set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); + f2fs_msg(F2FS_P_SB(page)->sb, KERN_WARNING, + "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, " + "run fsck to fix.", + __func__, dir->i_ino, dn.data_blkaddr); + err = -EINVAL; + goto out; + } + f2fs_wait_on_page_writeback(page, DATA, true); dentry_blk = page_address(page); -- cgit v1.2.3-59-g8ed1b From 2d3a58566f66465bbcd70afafbb66b001beb9d12 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 30 Jun 2018 23:57:03 +0800 Subject: f2fs: avoid the global name 'fault_name' Non-prefix global name 'fault_name' will pollute global namespace, fix it. Refer to: https://lists.01.org/pipermail/kbuild-all/2018-June/049660.html To: Jaegeuk Kim To: Chao Yu Cc: linux-f2fs-devel@lists.sourceforge.net Cc: linux-kernel@vger.kernel.org Reported-by: kbuild test robot Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/super.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index efcb45d0e7d3..75a14d27dabc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -65,7 +65,7 @@ struct f2fs_fault_info { unsigned int inject_type; }; -extern char *fault_name[FAULT_MAX]; +extern char *f2fs_fault_name[FAULT_MAX]; #define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif @@ -1283,7 +1283,7 @@ struct f2fs_sb_info { #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(type) \ printk("%sF2FS-fs : inject %s in %s of %pF\n", \ - KERN_INFO, fault_name[type], \ + KERN_INFO, f2fs_fault_name[type], \ __func__, 
__builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4d0de436bbee..5e75a2bc4a45 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -41,7 +41,7 @@ static struct kmem_cache *f2fs_inode_cachep; #ifdef CONFIG_F2FS_FAULT_INJECTION -char *fault_name[FAULT_MAX] = { +char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", -- cgit v1.2.3-59-g8ed1b From b1385478187a90d4273c328aa8dcad8d47c350a2 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 1 Jul 2018 13:57:06 -0700 Subject: f2fs: Replace strncpy with memcpy gcc 8.1.0 complains: fs/f2fs/namei.c: In function 'f2fs_update_extension_list': fs/f2fs/namei.c:257:3: warning: 'strncpy' output truncated before terminating nul copying as many bytes from a string as its length fs/f2fs/namei.c:249:3: warning: 'strncpy' output truncated before terminating nul copying as many bytes from a string as its length Using strncpy() is indeed less than perfect since the length of data to be copied has already been determined with strlen(). Replace strncpy() with memcpy() to address the warning and optimize the code a little. 
Signed-off-by: Guenter Roeck Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2ea0de4cbe76..1f67e389169f 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -246,7 +246,7 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, return -EINVAL; if (hot) { - strncpy(extlist[count], name, strlen(name)); + memcpy(extlist[count], name, strlen(name)); sbi->raw_super->hot_ext_count = hot_count + 1; } else { char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; @@ -254,7 +254,7 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, memcpy(buf, &extlist[cold_count], F2FS_EXTENSION_LEN * hot_count); memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); - strncpy(extlist[cold_count], name, strlen(name)); + memcpy(extlist[cold_count], name, strlen(name)); memcpy(&extlist[cold_count + 1], buf, F2FS_EXTENSION_LEN * hot_count); sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); -- cgit v1.2.3-59-g8ed1b From 68c43a235e8c8fa69322e8239762242cb3e752cb Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 2 Jul 2018 10:40:19 +0800 Subject: f2fs: check the right return value of memory alloc function This patch checks the right return value of the memory allocation function: the NULL test must look at the freshly allocated free_nid_bitmap[i], not at the free_nid_bitmap array pointer. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 142b34130749..6f21319d08d3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2798,7 +2798,7 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) for (i = 0; i < nm_i->nat_blocks; i++) { nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK), GFP_KERNEL); - if (!nm_i->free_nid_bitmap) + if (!nm_i->free_nid_bitmap[i]) return -ENOMEM; } -- cgit v1.2.3-59-g8ed1b From 
dc1328027b53586cc6b668c6654f9482e505699c Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Mon, 2 Jul 2018 11:37:40 +0530 Subject: f2fs: show the fsync_mode=nobarrier mount option This patch shows the fsync_mode=nobarrier mount option in f2fs_show_options(). Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5e75a2bc4a45..98bfccc1d389 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1342,6 +1342,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) seq_printf(seq, ",fsync_mode=%s", "strict"); + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) + seq_printf(seq, ",fsync_mode=%s", "nobarrier"); return 0; } -- cgit v1.2.3-59-g8ed1b From 4b270a8cc5047682f0a3f3f9af3b498408dbd2bc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Jul 2018 18:04:10 +0800 Subject: f2fs: try grabbing node page lock aggressively in sync scenario In a synchronous scenario, such as checkpoint(), we flush dirty node pages to the device synchronously. We can easily fail to write back a node page because trylock_page() fails, especially under intensive lock contention, which can cause long checkpoint() latency. So let's use lock_page() in the synchronous scenario to avoid this issue. 
Signed-off-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 6f21319d08d3..82664733f770 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1638,7 +1638,9 @@ next_step: !is_cold_node(page))) continue; lock_node: - if (!trylock_page(page)) + if (wbc->sync_mode == WB_SYNC_ALL) + lock_page(page); + else if (!trylock_page(page)) continue; if (unlikely(page->mapping != NODE_MAPPING(sbi))) { -- cgit v1.2.3-59-g8ed1b From 10d255c3540239c7920f52d2eb223756e186af56 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Jul 2018 21:20:05 +0800 Subject: f2fs: fix to skip GC if type in SSA and SIT is inconsistent If the segment type in SSA and SIT is inconsistent, we will hit the BUG_ON below during GC. To avoid this panic, let's just skip GC on such a segment. The bug is triggered with the image reported at the link below: https://bugzilla.kernel.org/show_bug.cgi?id=200223 [ 388.060262] ------------[ cut here ]------------ [ 388.060268] kernel BUG at /home/y00370721/git/devf2fs/gc.c:989! 
[ 388.061172] invalid opcode: 0000 [#1] SMP [ 388.061773] Modules linked in: f2fs(O) bluetooth ecdh_generic xt_tcpudp iptable_filter ip_tables x_tables lp ttm drm_kms_helper drm intel_rapl sb_edac crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel fb_sys_fops ppdev aes_x86_64 syscopyarea crypto_simd sysfillrect parport_pc joydev sysimgblt glue_helper parport cryptd i2c_piix4 serio_raw mac_hid btrfs hid_generic usbhid hid raid6_pq psmouse pata_acpi floppy [ 388.064247] CPU: 7 PID: 4151 Comm: f2fs_gc-7:0 Tainted: G O 4.13.0-rc1+ #26 [ 388.065306] Hardware name: Xen HVM domU, BIOS 4.1.2_115-900.260_ 11/06/2015 [ 388.066058] task: ffff880201583b80 task.stack: ffffc90004d7c000 [ 388.069948] RIP: 0010:do_garbage_collect+0xcc8/0xcd0 [f2fs] [ 388.070766] RSP: 0018:ffffc90004d7fc68 EFLAGS: 00010202 [ 388.071783] RAX: ffff8801ed227000 RBX: 0000000000000001 RCX: ffffea0007b489c0 [ 388.072700] RDX: ffff880000000000 RSI: 0000000000000001 RDI: ffffea0007b489c0 [ 388.073607] RBP: ffffc90004d7fd58 R08: 0000000000000003 R09: ffffea0007b489dc [ 388.074619] R10: 0000000000000000 R11: 0052782ab317138d R12: 0000000000000018 [ 388.075625] R13: 0000000000000018 R14: ffff880211ceb000 R15: ffff880211ceb000 [ 388.076687] FS: 0000000000000000(0000) GS:ffff880214fc0000(0000) knlGS:0000000000000000 [ 388.083277] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 388.084536] CR2: 0000000000e18c60 CR3: 00000001ecf2e000 CR4: 00000000001406e0 [ 388.085748] Call Trace: [ 388.086690] ? find_next_bit+0xb/0x10 [ 388.088091] f2fs_gc+0x1a8/0x9d0 [f2fs] [ 388.088888] ? lock_timer_base+0x7d/0xa0 [ 388.090213] ? try_to_del_timer_sync+0x44/0x60 [ 388.091698] gc_thread_func+0x342/0x4b0 [f2fs] [ 388.092892] ? wait_woken+0x80/0x80 [ 388.094098] kthread+0x109/0x140 [ 388.095010] ? f2fs_gc+0x9d0/0x9d0 [f2fs] [ 388.096043] ? 
kthread_park+0x60/0x60 [ 388.097281] ret_from_fork+0x25/0x30 [ 388.098401] Code: ff ff 48 83 e8 01 48 89 44 24 58 e9 27 f8 ff ff 48 83 e8 01 e9 78 fc ff ff 48 8d 78 ff e9 17 fb ff ff 48 83 ef 01 e9 4d f4 ff ff <0f> 0b 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 56 41 55 [ 388.100864] RIP: do_garbage_collect+0xcc8/0xcd0 [f2fs] RSP: ffffc90004d7fc68 [ 388.101810] ---[ end trace 81c73d6e6b7da61d ]--- Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9093be6e7a7d..37ab2d10a872 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -986,7 +986,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, goto next; sum = page_address(sum_page); - f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); + if (type != GET_SUM_TYPE((&sum->footer))) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent segment (%u) " + "type [%d, %d] in SSA and SIT", + segno, type, GET_SUM_TYPE((&sum->footer))); + set_sbi_flag(sbi, SBI_NEED_FSCK); + goto next; + } /* * this is to avoid deadlock: -- cgit v1.2.3-59-g8ed1b From c9b60788fc760d136211853f10ce73dc152d1f4a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 1 Aug 2018 19:13:44 +0800 Subject: f2fs: fix to do sanity check with block address in main area This patch adds sanity checks for the fields below: - cp_pack_total_block_count - blkaddr of data/node - extent info - Overview BUG() in verify_block_addr() when writing to a corrupted f2fs image - Reproduce (4.18 upstream kernel) - POC (poc.c) static void activity(char *mpoint) { char *foo_bar_baz; int err; static int buf[8192]; memset(buf, 0, sizeof(buf)); err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint); int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777); if (fd >= 0) { write(fd, (char *)buf, sizeof(buf)); fdatasync(fd); close(fd); } } int main(int argc, char *argv[]) { activity(argv[1]); return 0; } - Kernel message [ 689.349473] F2FS-fs 
(loop0): Mounted with checkpoint version = 3 [ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240 [ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4 [ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240 [ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d [ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202 [ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113 [ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540 [ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8 [ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540 [ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450 [ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0 [ 699.729171] Call Trace: [ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00 [ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0 [ 699.729238] ? 
memcg_drain_all_list_lrus+0x280/0x280 [ 699.729269] ? __radix_tree_replace+0xa3/0x120 [ 699.729276] __write_data_page+0x5c7/0xe30 [ 699.729291] ? kasan_check_read+0x11/0x20 [ 699.729310] ? page_mapped+0x8a/0x110 [ 699.729321] ? page_mkclean+0xe9/0x160 [ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00 [ 699.729331] ? invalid_page_referenced_vma+0x130/0x130 [ 699.729345] ? clear_page_dirty_for_io+0x332/0x450 [ 699.729351] f2fs_write_cache_pages+0x4ca/0x860 [ 699.729358] ? __write_data_page+0xe30/0xe30 [ 699.729374] ? percpu_counter_add_batch+0x22/0xa0 [ 699.729380] ? kasan_check_write+0x14/0x20 [ 699.729391] ? _raw_spin_lock+0x17/0x40 [ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30 [ 699.729413] ? iov_iter_advance+0x113/0x640 [ 699.729418] ? f2fs_write_end+0x133/0x2e0 [ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640 [ 699.729428] f2fs_write_data_pages+0x329/0x520 [ 699.729433] ? generic_perform_write+0x250/0x320 [ 699.729438] ? f2fs_write_cache_pages+0x860/0x860 [ 699.729454] ? current_time+0x110/0x110 [ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370 [ 699.729464] do_writepages+0x37/0xb0 [ 699.729468] ? f2fs_write_cache_pages+0x860/0x860 [ 699.729472] ? do_writepages+0x37/0xb0 [ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0 [ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0 [ 699.729496] ? __vfs_write+0x2b2/0x410 [ 699.729501] file_write_and_wait_range+0x66/0xb0 [ 699.729506] f2fs_do_sync_file+0x1f9/0xd90 [ 699.729511] ? truncate_partial_data_page+0x290/0x290 [ 699.729521] ? __sb_end_write+0x30/0x50 [ 699.729526] ? vfs_write+0x20f/0x260 [ 699.729530] f2fs_sync_file+0x9a/0xb0 [ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90 [ 699.729548] vfs_fsync_range+0x68/0x100 [ 699.729554] ? 
__fget_light+0xc9/0xe0 [ 699.729558] do_fsync+0x3d/0x70 [ 699.729562] __x64_sys_fdatasync+0x24/0x30 [ 699.729585] do_syscall_64+0x78/0x170 [ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 699.729613] RIP: 0033:0x7f9bf930d800 [ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24 [ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b [ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800 [ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003 [ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000 [ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610 [ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000 [ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 699.729782] ------------[ cut here ]------------ [ 699.729785] kernel BUG at fs/f2fs/segment.h:654! 
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI [ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4 [ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730 [ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0 [ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283 [ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef [ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c [ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55 [ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940 [ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001 [ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0 [ 699.752874] Call Trace: [ 699.753386] ? f2fs_inplace_write_data+0x93/0x240 [ 699.754341] f2fs_inplace_write_data+0xd2/0x240 [ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00 [ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0 [ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280 [ 699.758209] ? __radix_tree_replace+0xa3/0x120 [ 699.759164] __write_data_page+0x5c7/0xe30 [ 699.760002] ? kasan_check_read+0x11/0x20 [ 699.760823] ? page_mapped+0x8a/0x110 [ 699.761573] ? page_mkclean+0xe9/0x160 [ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00 [ 699.763332] ? invalid_page_referenced_vma+0x130/0x130 [ 699.764374] ? clear_page_dirty_for_io+0x332/0x450 [ 699.765347] f2fs_write_cache_pages+0x4ca/0x860 [ 699.766276] ? __write_data_page+0xe30/0xe30 [ 699.767161] ? 
percpu_counter_add_batch+0x22/0xa0 [ 699.768112] ? kasan_check_write+0x14/0x20 [ 699.768951] ? _raw_spin_lock+0x17/0x40 [ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30 [ 699.770885] ? iov_iter_advance+0x113/0x640 [ 699.771743] ? f2fs_write_end+0x133/0x2e0 [ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640 [ 699.773680] f2fs_write_data_pages+0x329/0x520 [ 699.774603] ? generic_perform_write+0x250/0x320 [ 699.775544] ? f2fs_write_cache_pages+0x860/0x860 [ 699.776510] ? current_time+0x110/0x110 [ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370 [ 699.778279] do_writepages+0x37/0xb0 [ 699.779026] ? f2fs_write_cache_pages+0x860/0x860 [ 699.779978] ? do_writepages+0x37/0xb0 [ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0 [ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0 [ 699.782820] ? __vfs_write+0x2b2/0x410 [ 699.783597] file_write_and_wait_range+0x66/0xb0 [ 699.784540] f2fs_do_sync_file+0x1f9/0xd90 [ 699.785381] ? truncate_partial_data_page+0x290/0x290 [ 699.786415] ? __sb_end_write+0x30/0x50 [ 699.787204] ? vfs_write+0x20f/0x260 [ 699.787941] f2fs_sync_file+0x9a/0xb0 [ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90 [ 699.789572] vfs_fsync_range+0x68/0x100 [ 699.790360] ? 
__fget_light+0xc9/0xe0 [ 699.791128] do_fsync+0x3d/0x70 [ 699.791779] __x64_sys_fdatasync+0x24/0x30 [ 699.792614] do_syscall_64+0x78/0x170 [ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 699.794406] RIP: 0033:0x7f9bf930d800 [ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24 [ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b [ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800 [ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003 [ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000 [ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610 [ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000 [ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]--- [ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730 [ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0 [ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283 [ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef [ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: 
ffff8801e3e7a64c [ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55 [ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940 [ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001 [ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000 [ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0 [ 699.835556] ================================================================== [ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0 [ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309 [ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4 [ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 699.843475] Call Trace: [ 699.843982] dump_stack+0x7b/0xb5 [ 699.844661] print_address_description+0x70/0x290 [ 699.845607] kasan_report+0x291/0x390 [ 699.846351] ? update_stack_state+0x38c/0x3e0 [ 699.853831] __asan_load8+0x54/0x90 [ 699.854569] update_stack_state+0x38c/0x3e0 [ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20 [ 699.856601] ? __save_stack_trace+0x5e/0x100 [ 699.857476] unwind_next_frame.part.5+0x18e/0x490 [ 699.858448] ? unwind_dump+0x290/0x290 [ 699.859217] ? clear_page_dirty_for_io+0x332/0x450 [ 699.860185] __unwind_start+0x106/0x190 [ 699.860974] __save_stack_trace+0x5e/0x100 [ 699.861808] ? __save_stack_trace+0x5e/0x100 [ 699.862691] ? unlink_anon_vmas+0xba/0x2c0 [ 699.863525] save_stack_trace+0x1f/0x30 [ 699.864312] save_stack+0x46/0xd0 [ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420 [ 699.865990] ? flush_tlb_mm_range+0x15e/0x220 [ 699.866889] ? kasan_check_write+0x14/0x20 [ 699.867724] ? __dec_node_state+0x92/0xb0 [ 699.868543] ? lock_page_memcg+0x85/0xf0 [ 699.869350] ? unlock_page_memcg+0x16/0x80 [ 699.870185] ? 
page_remove_rmap+0x198/0x520 [ 699.871048] ? mark_page_accessed+0x133/0x200 [ 699.871930] ? _cond_resched+0x1a/0x50 [ 699.872700] ? unmap_page_range+0xcd4/0xe50 [ 699.873551] ? rb_next+0x58/0x80 [ 699.874217] ? rb_next+0x58/0x80 [ 699.874895] __kasan_slab_free+0x13c/0x1a0 [ 699.875734] ? unlink_anon_vmas+0xba/0x2c0 [ 699.876563] kasan_slab_free+0xe/0x10 [ 699.877315] kmem_cache_free+0x89/0x1e0 [ 699.878095] unlink_anon_vmas+0xba/0x2c0 [ 699.878913] free_pgtables+0x101/0x1b0 [ 699.879677] exit_mmap+0x146/0x2a0 [ 699.880378] ? __ia32_sys_munmap+0x50/0x50 [ 699.881214] ? kasan_check_read+0x11/0x20 [ 699.882052] ? mm_update_next_owner+0x322/0x380 [ 699.882985] mmput+0x8b/0x1d0 [ 699.883602] do_exit+0x43a/0x1390 [ 699.884288] ? mm_update_next_owner+0x380/0x380 [ 699.885212] ? f2fs_sync_file+0x9a/0xb0 [ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90 [ 699.886877] ? vfs_fsync_range+0x68/0x100 [ 699.887694] ? __fget_light+0xc9/0xe0 [ 699.888442] ? do_fsync+0x3d/0x70 [ 699.889118] ? __x64_sys_fdatasync+0x24/0x30 [ 699.889996] rewind_stack_do_exit+0x17/0x20 [ 699.890860] RIP: 0033:0x7f9bf930d800 [ 699.891585] Code: Bad RIP value. 
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b [ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800 [ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003 [ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000 [ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610 [ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000 [ 699.901241] The buggy address belongs to the page: [ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 699.903811] flags: 0x2ffff0000000000() [ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000 [ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000 [ 699.907673] page dumped because: kasan: bad access detected [ 699.909108] Memory state around the buggy address: [ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00 [ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2 [ 699.914392] ^ [ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00 [ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00 [ 699.918634] ================================================================== - Location https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644 Reported-by Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 22 +++++++++++++++++++--- fs/f2fs/data.c | 33 +++++++++++++++++++++++++++------ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/file.c | 12 ++++++++++++ fs/f2fs/inode.c | 17 +++++++++++++++++ fs/f2fs/node.c | 4 ++++ fs/f2fs/segment.h | 3 +-- 7 files changed, 83 insertions(+), 11 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 
94552286ac12..b766f78b05f9 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -86,8 +86,10 @@ repeat: fio.page = page; if (f2fs_submit_page_bio(&fio)) { - f2fs_put_page(page, 1); - goto repeat; + memset(page_address(page), 0, PAGE_SIZE); + f2fs_stop_checkpoint(sbi, false); + f2fs_bug_on(sbi, 1); + return page; } lock_page(page); @@ -143,8 +145,14 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, case META_POR: case DATA_GENERIC: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || - blkaddr < MAIN_BLKADDR(sbi))) + blkaddr < MAIN_BLKADDR(sbi))) { + if (type == DATA_GENERIC) { + f2fs_msg(sbi->sb, KERN_WARNING, + "access invalid blkaddr:%u", blkaddr); + WARN_ON(1); + } return false; + } break; case META_GENERIC: if (unlikely(blkaddr < SEG0_BLKADDR(sbi) || @@ -771,6 +779,14 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, &cp_page_1, version); if (err) goto invalid_cp1; + + if (le32_to_cpu(cp_block->cp_pack_total_block_count) > + sbi->blocks_per_seg) { + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid cp_pack_total_block_count:%u", + le32_to_cpu(cp_block->cp_pack_total_block_count)); + goto invalid_cp1; + } pre_version = *version; cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 635a98db5d65..4064ce246c13 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -441,7 +441,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; - verify_block_addr(fio, fio->new_blkaddr); + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, + __is_meta_io(fio) ? 
META_GENERIC : DATA_GENERIC)) + return -EFAULT; + trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); @@ -1045,6 +1048,12 @@ next_dnode: next_block: blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { + err = -EFAULT; + goto sync_out; + } + if (!is_valid_data_blkaddr(sbi, blkaddr)) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { @@ -1500,6 +1509,10 @@ got_it: SetPageUptodate(page); goto confused; } + + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, + DATA_GENERIC)) + goto set_error_page; } else { zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) @@ -1700,11 +1713,13 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; - if (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr)) { - ipu_force = true; - fio->need_lock = LOCK_DONE; - goto got_it; - } + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC)) + return -EFAULT; + + ipu_force = true; + fio->need_lock = LOCK_DONE; + goto got_it; } /* Deadlock due to between page->lock and f2fs_lock_op */ @@ -1723,6 +1738,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) goto out_writepage; } got_it: + if (__is_valid_data_blkaddr(fio->old_blkaddr) && + !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC)) { + err = -EFAULT; + goto out_writepage; + } /* * If current allocation needs SSR, * it had better in-place writes for updated data. 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 75a14d27dabc..c8c865fa8450 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2669,6 +2669,9 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, spin_unlock(&sbi->iostat_lock); } +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ + (!is_read_io(fio->op) || fio->is_meta)) + bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0d1ee20912b9..5e29d4053748 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -420,6 +420,13 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), + blkaddr, DATA_GENERIC)) { + f2fs_put_dnode(&dn); + goto fail; + } + if (__found_offset(F2FS_I_SB(inode), blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); @@ -514,6 +521,11 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->data_blkaddr = NULL_ADDR; f2fs_set_data_blkaddr(dn); + + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) + continue; + f2fs_invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index f490393397d4..b52440f06fa5 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -236,6 +236,23 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) __func__, inode->i_ino); return false; } + + if (F2FS_I(inode)->extent_tree) { + struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + + if (ei->len && + (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC) || + !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, + DATA_GENERIC))) { + 
set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) extent info [%u, %u, %u] " + "is incorrect, run fsck to fix", + __func__, inode->i_ino, + ei->blk, ei->fofs, ei->len); + return false; + } + } return true; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 82664733f770..b18b7522c4d5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1401,6 +1401,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, return 0; } + if (__is_valid_data_blkaddr(ni.blk_addr) && + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) + goto redirty_out; + if (atomic && !test_opt(sbi, NOBARRIER)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a7460da9af43..b5bd3287e104 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -645,8 +645,7 @@ static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) { struct f2fs_sb_info *sbi = fio->sbi; - if (PAGE_TYPE_OF_BIO(fio->type) == META && - (!is_read_io(fio->op) || fio->is_meta)) + if (__is_meta_io(fio)) verify_blkaddr(sbi, blk_addr, META_GENERIC); else verify_blkaddr(sbi, blk_addr, DATA_GENERIC); -- cgit v1.2.3-59-g8ed1b From 82902c06bd17dbf6e8184299842ca5c68880970f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 5 Jul 2018 19:37:00 +0800 Subject: f2fs: fix to detect looped node chain correctly Below dmesg was printed when testing generic/388 of fstest: F2FS-fs (zram1): find_fsync_dnodes: detect looped node chain, blkaddr:526615, next:526616 F2FS-fs (zram1): Cannot recover all fsync data errno=-22 F2FS-fs (zram1): Mounted with checkpoint version = 22300d0e F2FS-fs (zram1): find_fsync_dnodes: detect looped node chain, blkaddr:526615, next:526616 F2FS-fs (zram1): Cannot recover all fsync data errno=-22 The reason is that we initialize free_blocks with free blocks of filesystem, so if filesystem is full, free_blocks can be zero, below condition will be true, so that, it will fail recovery. 
if (++loop_cnt >= free_blocks || blkaddr == next_blkaddr_of_node(page)) To fix this issue, initialize free_blocks with the correct value, which includes over-provision blocks. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 3051a5e5dfc7..0d927ae26c48 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -241,8 +241,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, struct page *page = NULL; block_t blkaddr; unsigned int loop_cnt = 0; - unsigned int free_blocks = sbi->user_block_count - - valid_user_blocks(sbi); + unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg - + valid_user_blocks(sbi); int err = 0; /* get node pages in the current segment */ -- cgit v1.2.3-59-g8ed1b From a39e5365835edcdb12140d423573c2b8ed39ebfb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 5 Jul 2018 14:24:11 +0800 Subject: f2fs: enable real-time discard by default f2fs is focused on flash based storage, so let's enable real-time discard by default. If the user doesn't want to enable it, the 'nodiscard' mount option should be used on mount. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 98bfccc1d389..609ea8736dbf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1367,12 +1367,12 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, NOHEAP); sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_has_blkzoned(sbi->sb)) { - set_opt_mode(sbi, F2FS_MOUNT_LFS); + if (blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev))) set_opt(sbi, DISCARD); - } else { + if (f2fs_sb_has_blkzoned(sbi->sb)) + set_opt_mode(sbi, F2FS_MOUNT_LFS); + else set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); - } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); -- cgit v1.2.3-59-g8ed1b From cb15d1e43db0a6341c1e26ac6a2c74e61b74f1aa Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 Jul 2018 20:50:57 -0700 Subject: f2fs: fix defined but not used build warnings Fix build warnings in f2fs when CONFIG_PROC_FS is not enabled by marking the unused functions as __maybe_unused. ../fs/f2fs/sysfs.c:519:12: warning: 'segment_info_seq_show' defined but not used [-Wunused-function] ../fs/f2fs/sysfs.c:546:12: warning: 'segment_bits_seq_show' defined but not used [-Wunused-function] ../fs/f2fs/sysfs.c:570:12: warning: 'iostat_info_seq_show' defined but not used [-Wunused-function] Signed-off-by: Randy Dunlap Cc: Jaegeuk Kim Cc: Chao Yu Cc: linux-f2fs-devel@lists.sourceforge.net Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d3d0266a49da..bca1236fd6fa 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -9,6 +9,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ +#include #include #include #include @@ -518,7 +519,8 @@ static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; -static int segment_info_seq_show(struct seq_file *seq, void *offset) +static int __maybe_unused segment_info_seq_show(struct seq_file *seq, + void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -545,7 +547,8 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } -static int segment_bits_seq_show(struct seq_file *seq, void *offset) +static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, + void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -569,7 +572,8 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) return 0; } -static int iostat_info_seq_show(struct seq_file *seq, void *offset) +static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, + void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); -- cgit v1.2.3-59-g8ed1b From 2482c4325dfe03b679a8bd7e0f1d14e230f019b3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Jul 2018 22:16:53 +0800 Subject: f2fs: detect bug_on in f2fs_wait_discard_bios Add bug_on to detect potential non-empty discard wait list. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 47b6595a078c..199a77a9c4a9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1388,6 +1388,8 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) /* just to make sure there is no pending discard commands */ __wait_all_discard_cmd(sbi, NULL); + + f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); return dropped; } -- cgit v1.2.3-59-g8ed1b From 4c6b56c002caf0b3a9aaaeb493c59d53daeaff40 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Jul 2018 22:16:54 +0800 Subject: f2fs: clean up with IS_INODE() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b52440f06fa5..ccdf6abde5f7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -122,7 +122,7 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page if (!f2fs_sb_has_inode_chksum(sbi->sb)) return false; - if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR)) return false; if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), -- cgit v1.2.3-59-g8ed1b From 522d1711d62c4fb87d7468ea1ef76ef5c510fd6e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Jul 2018 22:08:09 +0800 Subject: f2fs: stop issuing discard immediately if there is queued IO For background discard policy, even if there is queued user IO, still we will check max_requests times for next discard entry, it is unneeded, let's just stop this round submission immediately. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 199a77a9c4a9..478a2a87d491 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1188,7 +1188,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int i, iter = 0, issued = 0; + int i, issued = 0; bool io_interrupted = false; for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { @@ -1209,20 +1209,19 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, if (dpolicy->io_aware && i < dpolicy->io_aware_gran && !is_idle(sbi)) { io_interrupted = true; - goto skip; + break; } __submit_discard_cmd(sbi, dpolicy, dc); - issued++; -skip: - if (++iter >= dpolicy->max_requests) + + if (++issued >= dpolicy->max_requests) break; } blk_finish_plug(&plug); next: mutex_unlock(&dcc->cmd_lock); - if (iter >= dpolicy->max_requests) + if (issued >= dpolicy->max_requests || io_interrupted) break; } -- cgit v1.2.3-59-g8ed1b From 20ee4382322cd9cf6ecfcf4f429ed108c617fb4a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Jul 2018 22:11:01 +0800 Subject: f2fs: issue small discard by LBA order For small-granularity discards whose size is smaller than 64KB, if we issue those discards ordered by size, their IOs will be spread across the entire logical address space, so that in the FTL the L2P table will be updated randomly, resulting in a bad wear rate in the table. In this patch, we choose to issue small discards in LBA order; this way, we can expect that L2P table updates from adjacent discard IOs can be merged in the cache, which can reduce lifetime wear of the flash. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c8c865fa8450..ed9a1135d56c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -291,6 +291,7 @@ struct discard_policy { unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ + bool ordered; /* issue discard by lba order */ unsigned int granularity; /* discard granularity */ }; @@ -307,6 +308,7 @@ struct discard_cmd_control { unsigned int max_discards; /* max. discards to be issued */ unsigned int discard_granularity; /* discard granularity */ unsigned int undiscard_blks; /* # of undiscard blocks */ + unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 478a2a87d491..f5f04aabe338 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -934,6 +934,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; + dpolicy->ordered = false; dpolicy->granularity = granularity; dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; @@ -945,6 +946,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; dpolicy->io_aware = true; dpolicy->sync = false; + dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { dpolicy->granularity = 1; dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1181,6 +1183,63 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } +static unsigned int __issue_discard_cmd_orderly(struct 
f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + unsigned int pos = dcc->next_pos; + unsigned int issued = 0; + bool io_interrupted = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, + NULL, pos, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc) { + struct rb_node *node; + + if (dc->state != D_PREP) + goto next; + + if (dpolicy->io_aware && !is_idle(sbi)) { + io_interrupted = true; + break; + } + + dcc->next_pos = dc->lstart + dc->len; + __submit_discard_cmd(sbi, dpolicy, dc); + + if (++issued >= dpolicy->max_requests) + break; +next: + node = rb_next(&dc->rb_node); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + blk_finish_plug(&plug); + + if (!dc) + dcc->next_pos = 0; + + mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { @@ -1194,6 +1253,10 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (i + 1 < dpolicy->granularity) break; + + if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + return __issue_discard_cmd_orderly(sbi, dpolicy); + pend_list = &dcc->pend_list[i]; mutex_lock(&dcc->cmd_lock); @@ -1754,6 +1817,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->nr_discards = 0; dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; dcc->undiscard_blks = 0; + dcc->next_pos = 0; dcc->root = RB_ROOT; dcc->rbtree_check = false; -- cgit v1.2.3-59-g8ed1b From 36b877af7992893b6d1ddbe96971cab5ab9e50eb Mon Sep 17 00:00:00 2001 
From: Daniel Rosenberg Date: Mon, 9 Jul 2018 20:32:42 -0700 Subject: f2fs: Keep alloc_valid_block_count in sync If we attempt to request more blocks than we have room for, we try to instead request as much as we can, however, alloc_valid_block_count is not decremented to match the new value, allowing it to drift higher until the next checkpoint. This always decrements it when the requested amount cannot be fulfilled. Signed-off-by: Daniel Rosenberg Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ed9a1135d56c..9e6b27596b26 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1686,18 +1686,20 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, sbi->total_valid_block_count -= diff; if (!*count) { spin_unlock(&sbi->stat_lock); - percpu_counter_sub(&sbi->alloc_valid_block_count, diff); goto enospc; } } spin_unlock(&sbi->stat_lock); - if (unlikely(release)) + if (unlikely(release)) { + percpu_counter_sub(&sbi->alloc_valid_block_count, release); dquot_release_reservation_block(inode, release); + } f2fs_i_blocks_write(inode, *count, true, true); return 0; enospc: + percpu_counter_sub(&sbi->alloc_valid_block_count, release); dquot_release_reservation_block(inode, release); return -ENOSPC; } -- cgit v1.2.3-59-g8ed1b From 3611ce9911267cb93d364bd71ddea6821278d11f Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 12 Jul 2018 23:09:26 +0800 Subject: f2fs: do not set free of current section For the case when sbi->segs_per_sec > 1, take section:segment = 5 for example, if segment 1 is just used and allocate new segment 2, and the blocks of segment 1 is invalidated, at this time, the previous code will use __set_test_and_free to free the free_secmap and free_sections++, this is not correct since it is still a current section, so fix it. 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b5bd3287e104..50495515f0a0 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -448,6 +448,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; + if (IS_CURSEC(sbi, secno)) + goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { @@ -455,6 +457,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, free_i->free_sections++; } } +skip_free: spin_unlock(&free_i->segmap_lock); } -- cgit v1.2.3-59-g8ed1b From 66415cee3d341b19eb2766c118cb5f6fddda077c Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 12 Jul 2018 23:09:28 +0800 Subject: f2fs: blk_finish_plug of submit_bio in lfs mode Expand the blk_finish_plug action from blkzoned to normal lfs mode, since plug will cause the out-of-order IO submission, which is not friendly to flash in lfs mode. 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4064ce246c13..2b28f0a6f751 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -264,7 +264,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (type != DATA && type != NODE) goto submit_io; - if (f2fs_sb_has_blkzoned(sbi->sb) && current->plug) + if (test_opt(sbi, LFS) && current->plug) blk_finish_plug(current->plug); start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; -- cgit v1.2.3-59-g8ed1b From 18dd6470c2d14d10f5a2dd926925dc80dbd3abfd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Jul 2018 22:16:55 +0800 Subject: f2fs: fix to do sanity check with i_extra_isize If inode.i_extra_isize was fuzzed to an abnormal value, when calculating inline data size, the result will overflow, result in accessing invalid memory area when operating inline data. Let's do sanity check with i_extra_isize during inode loading for fixing. 
https://bugzilla.kernel.org/show_bug.cgi?id=200421 - Reproduce - POC (poc.c) #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void activity(char *mpoint) { char *foo_bar_baz; char *foo_baz; char *xattr; int err; err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint); err = asprintf(&foo_baz, "%s/foo/baz", mpoint); err = asprintf(&xattr, "%s/foo/bar/xattr", mpoint); rename(foo_bar_baz, foo_baz); char buf2[113]; memset(buf2, 0, sizeof(buf2)); listxattr(xattr, buf2, sizeof(buf2)); removexattr(xattr, "user.mime_type"); } int main(int argc, char *argv[]) { activity(argv[1]); return 0; } - Kernel message Umount the image will leave the following message [ 2910.995489] F2FS-fs (loop0): Mounted with checkpoint version = 2 [ 2918.416465] ================================================================== [ 2918.416807] BUG: KASAN: slab-out-of-bounds in f2fs_iget+0xcb9/0x1a80 [ 2918.417009] Read of size 4 at addr ffff88018efc2068 by task a.out/1229 [ 2918.417311] CPU: 1 PID: 1229 Comm: a.out Not tainted 4.17.0+ #1 [ 2918.417314] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 2918.417323] Call Trace: [ 2918.417366] dump_stack+0x71/0xab [ 2918.417401] print_address_description+0x6b/0x290 [ 2918.417407] kasan_report+0x28e/0x390 [ 2918.417411] ? f2fs_iget+0xcb9/0x1a80 [ 2918.417415] f2fs_iget+0xcb9/0x1a80 [ 2918.417422] ? f2fs_lookup+0x2e7/0x580 [ 2918.417425] f2fs_lookup+0x2e7/0x580 [ 2918.417433] ? __recover_dot_dentries+0x400/0x400 [ 2918.417447] ? legitimize_path.isra.29+0x5a/0xa0 [ 2918.417453] __lookup_slow+0x11c/0x220 [ 2918.417457] ? may_delete+0x2a0/0x2a0 [ 2918.417475] ? deref_stack_reg+0xe0/0xe0 [ 2918.417479] ? __lookup_hash+0xb0/0xb0 [ 2918.417483] lookup_slow+0x3e/0x60 [ 2918.417488] walk_component+0x3ac/0x990 [ 2918.417492] ? generic_permission+0x51/0x1e0 [ 2918.417495] ? 
inode_permission+0x51/0x1d0 [ 2918.417499] ? pick_link+0x3e0/0x3e0 [ 2918.417502] ? link_path_walk+0x4b1/0x770 [ 2918.417513] ? _raw_spin_lock_irqsave+0x25/0x50 [ 2918.417518] ? walk_component+0x990/0x990 [ 2918.417522] ? path_init+0x2e6/0x580 [ 2918.417526] path_lookupat+0x13f/0x430 [ 2918.417531] ? trailing_symlink+0x3a0/0x3a0 [ 2918.417534] ? do_renameat2+0x270/0x7b0 [ 2918.417538] ? __kasan_slab_free+0x14c/0x190 [ 2918.417541] ? do_renameat2+0x270/0x7b0 [ 2918.417553] ? kmem_cache_free+0x85/0x1e0 [ 2918.417558] ? do_renameat2+0x270/0x7b0 [ 2918.417563] filename_lookup+0x13c/0x280 [ 2918.417567] ? filename_parentat+0x2b0/0x2b0 [ 2918.417572] ? kasan_unpoison_shadow+0x31/0x40 [ 2918.417575] ? kasan_kmalloc+0xa6/0xd0 [ 2918.417593] ? strncpy_from_user+0xaa/0x1c0 [ 2918.417598] ? getname_flags+0x101/0x2b0 [ 2918.417614] ? path_listxattr+0x87/0x110 [ 2918.417619] path_listxattr+0x87/0x110 [ 2918.417623] ? listxattr+0xc0/0xc0 [ 2918.417637] ? mm_fault_error+0x1b0/0x1b0 [ 2918.417654] do_syscall_64+0x73/0x160 [ 2918.417660] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 2918.417676] RIP: 0033:0x7f2f3a3480d7 [ 2918.417677] Code: f0 ff ff 73 01 c3 48 8b 0d be dd 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 c2 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 91 dd 2b 00 f7 d8 64 89 01 48 [ 2918.417732] RSP: 002b:00007fff4095b7d8 EFLAGS: 00000206 ORIG_RAX: 00000000000000c2 [ 2918.417744] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f2f3a3480d7 [ 2918.417746] RDX: 0000000000000071 RSI: 00007fff4095b810 RDI: 000000000126a0c0 [ 2918.417749] RBP: 00007fff4095b890 R08: 000000000126a010 R09: 0000000000000000 [ 2918.417751] R10: 00000000000001ab R11: 0000000000000206 R12: 00000000004005e0 [ 2918.417753] R13: 00007fff4095b990 R14: 0000000000000000 R15: 0000000000000000 [ 2918.417853] Allocated by task 329: [ 2918.418002] kasan_kmalloc+0xa6/0xd0 [ 2918.418007] kmem_cache_alloc+0xc8/0x1e0 [ 2918.418023] mempool_init_node+0x194/0x230 [ 
2918.418027] mempool_init+0x12/0x20 [ 2918.418042] bioset_init+0x2bd/0x380 [ 2918.418052] blk_alloc_queue_node+0xe9/0x540 [ 2918.418075] dm_create+0x2c0/0x800 [ 2918.418080] dev_create+0xd2/0x530 [ 2918.418083] ctl_ioctl+0x2a3/0x5b0 [ 2918.418087] dm_ctl_ioctl+0xa/0x10 [ 2918.418092] do_vfs_ioctl+0x13e/0x8c0 [ 2918.418095] ksys_ioctl+0x66/0x70 [ 2918.418098] __x64_sys_ioctl+0x3d/0x50 [ 2918.418102] do_syscall_64+0x73/0x160 [ 2918.418106] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 2918.418204] Freed by task 0: [ 2918.418301] (stack is not available) [ 2918.418521] The buggy address belongs to the object at ffff88018efc0000 which belongs to the cache biovec-max of size 8192 [ 2918.418894] The buggy address is located 104 bytes to the right of 8192-byte region [ffff88018efc0000, ffff88018efc2000) [ 2918.419257] The buggy address belongs to the page: [ 2918.419431] page:ffffea00063bf000 count:1 mapcount:0 mapping:ffff8801f2242540 index:0x0 compound_mapcount: 0 [ 2918.419702] flags: 0x17fff8000008100(slab|head) [ 2918.419879] raw: 017fff8000008100 dead000000000100 dead000000000200 ffff8801f2242540 [ 2918.420101] raw: 0000000000000000 0000000000030003 00000001ffffffff 0000000000000000 [ 2918.420322] page dumped because: kasan: bad access detected [ 2918.420599] Memory state around the buggy address: [ 2918.420764] ffff88018efc1f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 2918.420975] ffff88018efc1f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 2918.421194] >ffff88018efc2000: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 2918.421406] ^ [ 2918.421627] ffff88018efc2080: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 2918.421838] ffff88018efc2100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 2918.422046] ================================================================== [ 2918.422264] Disabling lock debugging due to kernel taint [ 2923.901641] BUG: unable to handle kernel paging request at ffff88018f0db000 [ 2923.901884] PGD 22226a067 P4D 
22226a067 PUD 222273067 PMD 18e642063 PTE 800000018f0db061 [ 2923.902120] Oops: 0003 [#1] SMP KASAN PTI [ 2923.902274] CPU: 1 PID: 1231 Comm: umount Tainted: G B 4.17.0+ #1 [ 2923.902490] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 2923.902761] RIP: 0010:__memset+0x24/0x30 [ 2923.902906] Code: 90 90 90 90 90 90 66 66 90 66 90 49 89 f9 48 89 d1 83 e2 07 48 c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 f3 [ 2923.903446] RSP: 0018:ffff88018ddf7ae0 EFLAGS: 00010206 [ 2923.903622] RAX: 0000000000000000 RBX: ffff8801d549d888 RCX: 1ffffffffffdaffb [ 2923.903833] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88018f0daffc [ 2923.904062] RBP: ffff88018efc206c R08: 1ffff10031df840d R09: ffff88018efc206c [ 2923.904273] R10: ffffffffffffe1ee R11: ffffed0031df65fa R12: 0000000000000000 [ 2923.904485] R13: ffff8801d549dc98 R14: 00000000ffffc3db R15: ffffea00063bec80 [ 2923.904693] FS: 00007fa8b2f8a840(0000) GS:ffff8801f3b00000(0000) knlGS:0000000000000000 [ 2923.904937] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2923.910080] CR2: ffff88018f0db000 CR3: 000000018f892000 CR4: 00000000000006e0 [ 2923.914930] Call Trace: [ 2923.919724] f2fs_truncate_inline_inode+0x114/0x170 [ 2923.924487] f2fs_truncate_blocks+0x11b/0x7c0 [ 2923.929178] ? f2fs_truncate_data_blocks+0x10/0x10 [ 2923.933834] ? dqget+0x670/0x670 [ 2923.938437] ? f2fs_destroy_extent_tree+0xd6/0x270 [ 2923.943107] ? __radix_tree_lookup+0x2f/0x150 [ 2923.947772] f2fs_truncate+0xd4/0x1a0 [ 2923.952491] f2fs_evict_inode+0x5ab/0x610 [ 2923.957204] evict+0x15f/0x280 [ 2923.961898] __dentry_kill+0x161/0x250 [ 2923.966634] shrink_dentry_list+0xf3/0x250 [ 2923.971897] shrink_dcache_parent+0xa9/0x100 [ 2923.976561] ? shrink_dcache_sb+0x1f0/0x1f0 [ 2923.981177] ? wait_for_completion+0x8a/0x210 [ 2923.985781] ? 
migrate_swap_stop+0x2d0/0x2d0 [ 2923.990332] do_one_tree+0xe/0x40 [ 2923.994735] shrink_dcache_for_umount+0x3a/0xa0 [ 2923.999077] generic_shutdown_super+0x3e/0x1c0 [ 2924.003350] kill_block_super+0x4b/0x70 [ 2924.007619] deactivate_locked_super+0x65/0x90 [ 2924.011812] cleanup_mnt+0x5c/0xa0 [ 2924.015995] task_work_run+0xce/0xf0 [ 2924.020174] exit_to_usermode_loop+0x115/0x120 [ 2924.024293] do_syscall_64+0x12f/0x160 [ 2924.028479] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 2924.032709] RIP: 0033:0x7fa8b2868487 [ 2924.036888] Code: 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 31 f6 e9 09 00 00 00 66 0f 1f 84 00 00 00 00 00 b8 a6 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e1 c9 2b 00 f7 d8 64 89 01 48 [ 2924.045750] RSP: 002b:00007ffc39824d58 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 2924.050190] RAX: 0000000000000000 RBX: 00000000008ea030 RCX: 00007fa8b2868487 [ 2924.054604] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 00000000008f4360 [ 2924.058940] RBP: 00000000008f4360 R08: 0000000000000000 R09: 0000000000000014 [ 2924.063186] R10: 00000000000006b2 R11: 0000000000000246 R12: 00007fa8b2d7183c [ 2924.067418] R13: 0000000000000000 R14: 00000000008ea210 R15: 00007ffc39824fe0 [ 2924.071534] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer joydev input_leds serio_raw snd soundcore mac_hid i2c_piix4 ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi btrfs zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear 8139too qxl ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel psmouse aes_x86_64 8139cp crypto_simd cryptd mii glue_helper pata_acpi floppy [ 2924.098044] CR2: ffff88018f0db000 [ 2924.102520] ---[ end trace a8e0d899985faf31 ]--- [ 
2924.107012] RIP: 0010:__memset+0x24/0x30 [ 2924.111448] Code: 90 90 90 90 90 90 66 66 90 66 90 49 89 f9 48 89 d1 83 e2 07 48 c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 f3 [ 2924.120724] RSP: 0018:ffff88018ddf7ae0 EFLAGS: 00010206 [ 2924.125312] RAX: 0000000000000000 RBX: ffff8801d549d888 RCX: 1ffffffffffdaffb [ 2924.129931] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88018f0daffc [ 2924.134537] RBP: ffff88018efc206c R08: 1ffff10031df840d R09: ffff88018efc206c [ 2924.139175] R10: ffffffffffffe1ee R11: ffffed0031df65fa R12: 0000000000000000 [ 2924.143825] R13: ffff8801d549dc98 R14: 00000000ffffc3db R15: ffffea00063bec80 [ 2924.148500] FS: 00007fa8b2f8a840(0000) GS:ffff8801f3b00000(0000) knlGS:0000000000000000 [ 2924.153247] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2924.158003] CR2: ffff88018f0db000 CR3: 000000018f892000 CR4: 00000000000006e0 [ 2924.164641] BUG: Bad rss-counter state mm:00000000fa04621e idx:0 val:4 [ 2924.170007] BUG: Bad rss-counter state mm:00000000fa04621e idx:1 val:2 - Location https://elixir.bootlin.com/linux/v4.18-rc3/source/fs/f2fs/inline.c#L78 memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); Here the length can be negative. 
Reported-by Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ccdf6abde5f7..740988bc250d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -196,6 +196,7 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) static bool sanity_check_inode(struct inode *inode, struct page *node_page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); unsigned long long iblocks; iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); @@ -237,6 +238,17 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || + fi->i_extra_isize % sizeof(__le32)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, " + "max: %zu", + __func__, inode->i_ino, fi->i_extra_isize, + F2FS_TOTAL_EXTRA_ATTR_SIZE); + return false; + } + if (F2FS_I(inode)->extent_tree) { struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; @@ -305,11 +317,6 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); - if (!sanity_check_inode(inode, node_page)) { - f2fs_put_page(node_page, 1); - return -EINVAL; - } - fi->i_extra_isize = f2fs_has_extra_attr(inode) ? 
le16_to_cpu(ri->i_extra_isize) : 0; @@ -329,6 +336,11 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } + if (!sanity_check_inode(inode, node_page)) { + f2fs_put_page(node_page, 1); + return -EINVAL; + } + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); -- cgit v1.2.3-59-g8ed1b From 7735730d39d75e70476c1b01435b9b1f41637f0e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Jul 2018 00:02:17 +0800 Subject: f2fs: fix to propagate error from __get_meta_page() If caller of __get_meta_page() can handle error, let's propagate error from __get_meta_page(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 55 ++++++++++++++++++++++++-------- fs/f2fs/data.c | 24 ++++++++++++-- fs/f2fs/f2fs.h | 8 +++-- fs/f2fs/file.c | 7 +++- fs/f2fs/gc.c | 16 ++++++++-- fs/f2fs/inline.c | 14 +++++++- fs/f2fs/inode.c | 12 +++++-- fs/f2fs/node.c | 90 +++++++++++++++++++++++++++++++++++++++++----------- fs/f2fs/recovery.c | 13 +++++++- fs/f2fs/segment.c | 37 +++++++++++++++------ 10 files changed, 220 insertions(+), 56 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b766f78b05f9..c5fd318c06d2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -71,6 +71,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .encrypted_page = NULL, .is_meta = is_meta, }; + int err; if (unlikely(!is_meta)) fio.op_flags &= ~REQ_META; @@ -85,11 +86,10 @@ repeat: fio.page = page; - if (f2fs_submit_page_bio(&fio)) { - memset(page_address(page), 0, PAGE_SIZE); - f2fs_stop_checkpoint(sbi, false); - f2fs_bug_on(sbi, 1); - return page; + err = f2fs_submit_page_bio(&fio); + if (err) { + f2fs_put_page(page, 1); + return ERR_PTR(err); } lock_page(page); @@ -98,14 +98,9 @@ repeat: goto repeat; } - /* - * if there is any IO error when accessing device, make our filesystem - * readonly and make sure do 
not write checkpoint with non-uptodate - * meta page. - */ if (unlikely(!PageUptodate(page))) { - memset(page_address(page), 0, PAGE_SIZE); - f2fs_stop_checkpoint(sbi, false); + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); } out: return page; @@ -116,6 +111,25 @@ struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) return __get_meta_page(sbi, index, true); } +struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct page *page; + int count = 0; + +retry: + page = __get_meta_page(sbi, index, true); + if (IS_ERR(page)) { + if (PTR_ERR(page) == -EIO && + ++count <= DEFAULT_RETRY_IO_COUNT) + goto retry; + + f2fs_stop_checkpoint(sbi, false); + f2fs_bug_on(sbi, 1); + } + + return page; +} + /* for POR only */ struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) { @@ -607,7 +621,9 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); - f2fs_get_node_info(sbi, ino, &ni); + err = f2fs_get_node_info(sbi, ino, &ni); + if (err) + goto err_out; /* ENOMEM was fully retried in f2fs_evict_inode. 
*/ if (ni.blk_addr != NULL_ADDR) { @@ -655,9 +671,15 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { - struct page *page = f2fs_get_meta_page(sbi, start_blk + i); + struct page *page; struct f2fs_orphan_block *orphan_blk; + page = f2fs_get_meta_page(sbi, start_blk + i); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); @@ -748,6 +770,9 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, __u32 crc = 0; *cp_page = f2fs_get_meta_page(sbi, cp_addr); + if (IS_ERR(*cp_page)) + return PTR_ERR(*cp_page); + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); @@ -873,6 +898,8 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) unsigned char *ckpt = (unsigned char *)sbi->ckpt; cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i); + if (IS_ERR(cur_page)) + goto free_fail_no_cp; sit_bitmap_ptr = page_address(cur_page); memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); f2fs_put_page(cur_page, 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2b28f0a6f751..7f860405cd6e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -879,6 +879,10 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; + err = f2fs_get_node_info(sbi, dn->nid, &ni); + if (err) + return err; + dn->data_blkaddr = datablock_addr(dn->inode, dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) @@ -888,7 +892,6 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return err; alloc: - f2fs_get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 
f2fs_allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, @@ -1291,7 +1294,11 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + if (err) { + f2fs_put_page(page, 1); + return err; + } phys = (__u64)blk_to_logical(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + @@ -1318,7 +1325,11 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - f2fs_get_node_info(sbi, xnid, &ni); + err = f2fs_get_node_info(sbi, xnid, &ni); + if (err) { + f2fs_put_page(page, 1); + return err; + } phys = (__u64)blk_to_logical(inode, ni.blk_addr); len = inode->i_sb->s_blocksize; @@ -1705,6 +1716,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) struct inode *inode = page->mapping->host; struct dnode_of_data dn; struct extent_info ei = {0,0,0}; + struct node_info ni; bool ipu_force = false; int err = 0; @@ -1773,6 +1785,12 @@ got_it: fio->need_lock = LOCK_REQ; } + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni); + if (err) + goto out_writepage; + + fio->version = ni.version; + err = encrypt_one_page(fio); if (err) goto out_writepage; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e6b27596b26..1f692f1445d7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -513,6 +513,8 @@ enum { */ }; +#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ + #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ @@ -1020,6 +1022,7 @@ struct f2fs_io_info { bool retry; /* need to reallocate block address */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ + unsigned char version; /* version of the node */ }; #define is_read_io(rw) ((rw) == READ) @@ -2823,7 +2826,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, 
nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); -void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); @@ -2850,7 +2853,7 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_recover_inline_xattr(struct inode *inode, struct page *page); int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); -void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); @@ -2928,6 +2931,7 @@ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5e29d4053748..d41f2138b2a9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1067,7 +1067,12 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (ret) return ret; - f2fs_get_node_info(sbi, dn.nid, &ni); + ret = f2fs_get_node_info(sbi, dn.nid, &ni); + if (ret) { + f2fs_put_dnode(&dn); + return ret; + } + ilen = 
min((pgoff_t) ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 37ab2d10a872..e352fbd33848 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -517,7 +517,11 @@ next_step: continue; } - f2fs_get_node_info(sbi, nid, &ni); + if (f2fs_get_node_info(sbi, nid, &ni)) { + f2fs_put_page(node_page, 1); + continue; + } + if (ni.blk_addr != start_addr + off) { f2fs_put_page(node_page, 1); continue; @@ -576,7 +580,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (IS_ERR(node_page)) return false; - f2fs_get_node_info(sbi, nid, dni); + if (f2fs_get_node_info(sbi, nid, dni)) { + f2fs_put_page(node_page, 1); + return false; + } if (sum->version != dni->version) { f2fs_msg(sbi->sb, KERN_WARNING, @@ -655,7 +662,10 @@ static void move_data_block(struct inode *inode, block_t bidx, */ f2fs_wait_on_page_writeback(page, DATA, true); - f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + if (err) + goto put_out; + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* read page */ diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 2bcb2d36f024..115dc219344b 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -121,6 +121,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .encrypted_page = NULL, .io_type = FS_DATA_IO, }; + struct node_info ni; int dirty, err; if (!f2fs_exist_data(dn->inode)) @@ -130,6 +131,14 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni); + if (err) { + f2fs_put_dnode(dn); + return err; + } + + fio.version = ni.version; + if (unlikely(dn->data_blkaddr != NEW_ADDR)) { f2fs_put_dnode(dn); set_sbi_flag(fio.sbi, SBI_NEED_FSCK); @@ -690,7 +699,10 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; ilen -= start; - f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + err = 
f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + if (err) + goto out; + byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; byteaddr += (char *)inline_data_addr(inode, ipage) - (char *)F2FS_INODE(ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 740988bc250d..35d49528b2c1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -699,6 +699,7 @@ void f2fs_handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + int err; /* * clear nlink of inode in order to release resource of inode @@ -721,10 +722,16 @@ void f2fs_handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "May loss orphan inode, run fsck to fix."); + goto out; + } if (ni.blk_addr != NULL_ADDR) { - int err = f2fs_acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, @@ -737,6 +744,7 @@ void f2fs_handle_failed_inode(struct inode *inode) set_inode_flag(inode, FI_FREE_NID); } +out: f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b18b7522c4d5..69d0ac1b6cac 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -113,7 +113,7 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { pgoff_t index = current_nat_addr(sbi, nid); - return f2fs_get_meta_page(sbi, index); + return f2fs_get_meta_page_nofail(sbi, index); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) @@ -419,7 +419,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) /* * This function always returns success */ -void 
f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -443,7 +443,7 @@ void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); up_read(&nm_i->nat_tree_lock); - return; + return 0; } memset(&ne, 0, sizeof(struct f2fs_nat_entry)); @@ -466,6 +466,9 @@ void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, up_read(&nm_i->nat_tree_lock); page = f2fs_get_meta_page(sbi, index); + if (IS_ERR(page)) + return PTR_ERR(page); + nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); @@ -473,6 +476,7 @@ void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, cache: /* cache nat entry */ cache_nat_entry(sbi, nid, &ne); + return 0; } /* @@ -722,12 +726,15 @@ release_out: return err; } -static void truncate_node(struct dnode_of_data *dn) +static int truncate_node(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; + int err; - f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni); + if (err) + return err; /* Deallocate node address */ f2fs_invalidate_blocks(sbi, ni.blk_addr); @@ -750,11 +757,14 @@ static void truncate_node(struct dnode_of_data *dn) dn->node_page = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); + + return 0; } static int truncate_dnode(struct dnode_of_data *dn) { struct page *page; + int err; if (dn->nid == 0) return 1; @@ -770,7 +780,10 @@ static int truncate_dnode(struct dnode_of_data *dn) dn->node_page = page; dn->ofs_in_node = 0; f2fs_truncate_data_blocks(dn); - truncate_node(dn); + err = truncate_node(dn); + if (err) + return err; + return 1; } @@ -835,7 +848,9 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, if (!ofs) { /* remove current indirect node */ 
dn->node_page = page; - truncate_node(dn); + ret = truncate_node(dn); + if (ret) + goto out_err; freed++; } else { f2fs_put_page(page, 1); @@ -893,7 +908,9 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, if (offset[idx + 1] == 0) { dn->node_page = pages[idx]; dn->nid = nid[idx]; - truncate_node(dn); + err = truncate_node(dn); + if (err) + goto fail; } else { f2fs_put_page(pages[idx], 1); } @@ -1014,6 +1031,7 @@ int f2fs_truncate_xattr_node(struct inode *inode) nid_t nid = F2FS_I(inode)->i_xattr_nid; struct dnode_of_data dn; struct page *npage; + int err; if (!nid) return 0; @@ -1022,10 +1040,15 @@ int f2fs_truncate_xattr_node(struct inode *inode) if (IS_ERR(npage)) return PTR_ERR(npage); + set_new_dnode(&dn, inode, NULL, npage, nid); + err = truncate_node(&dn); + if (err) { + f2fs_put_page(npage, 1); + return err; + } + f2fs_i_xnid_write(inode, 0); - set_new_dnode(&dn, inode, NULL, npage, nid); - truncate_node(&dn); return 0; } @@ -1059,7 +1082,11 @@ int f2fs_remove_inode_page(struct inode *inode) inode->i_blocks != 0 && inode->i_blocks != 8); /* will put inode & node pages */ - truncate_node(&dn); + err = truncate_node(&dn); + if (err) { + f2fs_put_dnode(&dn); + return err; + } return 0; } @@ -1092,7 +1119,11 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - f2fs_get_node_info(sbi, dn->nid, &new_ni); + err = f2fs_get_node_info(sbi, dn->nid, &new_ni); + if (err) { + dec_valid_node_count(sbi, dn->inode, !ofs); + goto fail; + } f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); #endif new_ni.nid = dn->nid; @@ -1140,6 +1171,7 @@ static int read_node_page(struct page *page, int op_flags) .page = page, .encrypted_page = NULL, }; + int err; if (PageUptodate(page)) { #ifdef CONFIG_F2FS_CHECK_FS @@ -1148,7 +1180,9 @@ static int read_node_page(struct page *page, int op_flags) return LOCKED_PAGE; } - f2fs_get_node_info(sbi, page->index, &ni); + err = f2fs_get_node_info(sbi, page->index, &ni); + 
if (err) + return err; if (unlikely(ni.blk_addr == NULL_ADDR) || is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { @@ -1383,6 +1417,9 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); + if (f2fs_get_node_info(sbi, nid, &ni)) + goto redirty_out; + if (wbc->for_reclaim) { if (!down_read_trylock(&sbi->node_write)) goto redirty_out; @@ -1390,8 +1427,6 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, down_read(&sbi->node_write); } - f2fs_get_node_info(sbi, nid, &ni); - /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); @@ -2311,12 +2346,16 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page) struct dnode_of_data dn; struct node_info ni; struct page *xpage; + int err; if (!prev_xnid) goto recover_xnid; /* 1: invalidate the previous xattr nid */ - f2fs_get_node_info(sbi, prev_xnid, &ni); + err = f2fs_get_node_info(sbi, prev_xnid, &ni); + if (err) + return err; + f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); @@ -2351,8 +2390,11 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; + int err; - f2fs_get_node_info(sbi, ino, &old_ni); + err = f2fs_get_node_info(sbi, ino, &old_ni); + if (err) + return err; if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; @@ -2406,7 +2448,7 @@ retry: return 0; } -void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; @@ -2428,6 +2470,9 @@ void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, for (idx = addr; idx < addr + nrpages; idx++) { struct page *page = f2fs_get_tmp_page(sbi, idx); + if (IS_ERR(page)) + return PTR_ERR(page); + 
rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; @@ -2439,6 +2484,7 @@ void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, invalidate_mapping_pages(META_MAPPING(sbi), addr, addr + nrpages); } + return 0; } static void remove_nats_in_journal(struct f2fs_sb_info *sbi) @@ -2675,7 +2721,13 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { - struct page *page = f2fs_get_meta_page(sbi, nat_bits_addr++); + struct page *page; + + page = f2fs_get_meta_page(sbi, nat_bits_addr++); + if (IS_ERR(page)) { + disable_nat_bits(sbi, true); + return PTR_ERR(page); + } memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), page_address(page), F2FS_BLKSIZE); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 0d927ae26c48..956f34c87082 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -256,6 +256,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, return 0; page = f2fs_get_tmp_page(sbi, blkaddr); + if (IS_ERR(page)) { + err = PTR_ERR(page); + break; + } if (!is_recoverable_dnode(page)) break; @@ -471,7 +475,10 @@ retry_dn: f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni); + if (err) + goto err; + f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); @@ -574,6 +581,10 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, f2fs_ra_meta_pages_cond(sbi, blkaddr); page = f2fs_get_tmp_page(sbi, blkaddr); + if (IS_ERR(page)) { + err = PTR_ERR(page); + break; + } if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f5f04aabe338..65dcde1d4fb8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -250,7 +250,13 @@ retry: err = -EAGAIN; goto 
next; } - f2fs_get_node_info(sbi, dn.nid, &ni); + + err = f2fs_get_node_info(sbi, dn.nid, &ni); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + if (cur->old_addr == NEW_ADDR) { f2fs_invalidate_blocks(sbi, dn.data_blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); @@ -2051,7 +2057,7 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) */ struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); + return f2fs_get_meta_page_nofail(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, @@ -2911,11 +2917,9 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, { struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; - struct node_info ni; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); - f2fs_get_node_info(sbi, dn->nid, &ni); - set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); @@ -3077,7 +3081,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) } } -static void read_compacted_summaries(struct f2fs_sb_info *sbi) +static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; @@ -3089,6 +3093,8 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) start = start_sum_block(sbi); page = f2fs_get_meta_page(sbi, start++); + if (IS_ERR(page)) + return PTR_ERR(page); kaddr = (unsigned char *)page_address(page); /* Step 1: restore nat cache */ @@ -3129,11 +3135,14 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) page = NULL; page = f2fs_get_meta_page(sbi, start++); + if (IS_ERR(page)) + return PTR_ERR(page); kaddr = (unsigned char *)page_address(page); offset = 0; } } f2fs_put_page(page, 1); + return 0; } static int read_normal_summaries(struct 
f2fs_sb_info *sbi, int type) @@ -3145,6 +3154,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) unsigned short blk_off; unsigned int segno = 0; block_t blk_addr = 0; + int err = 0; /* get segment number and block addr */ if (IS_DATASEG(type)) { @@ -3168,6 +3178,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) } new = f2fs_get_meta_page(sbi, blk_addr); + if (IS_ERR(new)) + return PTR_ERR(new); sum = (struct f2fs_summary_block *)page_address(new); if (IS_NODESEG(type)) { @@ -3179,7 +3191,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - f2fs_restore_node_summary(sbi, segno, sum); + err = f2fs_restore_node_summary(sbi, segno, sum); + if (err) + goto out; } } @@ -3199,8 +3213,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) curseg->alloc_type = ckpt->alloc_type[type]; curseg->next_blkoff = blk_off; mutex_unlock(&curseg->curseg_mutex); +out: f2fs_put_page(new, 1); - return 0; + return err; } static int restore_curseg_summaries(struct f2fs_sb_info *sbi) @@ -3218,7 +3233,9 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) META_CP, true); /* restore for compacted data summary */ - read_compacted_summaries(sbi); + err = read_compacted_summaries(sbi); + if (err) + return err; type = CURSEG_HOT_NODE; } @@ -3349,7 +3366,7 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); + return f2fs_get_meta_page_nofail(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, -- cgit v1.2.3-59-g8ed1b From 5d3ce4f70172160625e9c18600fa3b929781c4fd Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Thu, 19 Jul 2018 09:23:57 +0900 Subject: f2fs: avoid duplicated permission check for "trusted." 
xattrs Because xattr_permission already checks CAP_SYS_ADMIN capability, we don't need to check it. Signed-off-by: Hyunchul Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 708271871f94..4b34244dcc69 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -37,9 +37,6 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; case F2FS_XATTR_INDEX_SECURITY: break; default: @@ -62,9 +59,6 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; case F2FS_XATTR_INDEX_SECURITY: break; default: -- cgit v1.2.3-59-g8ed1b From 6122003a1a612a315c719c6418a527442684903d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Jul 2018 20:41:45 +0800 Subject: f2fs: kill EXT_TREE_VEC_SIZE Since commit 201ef5e080c9 ("f2fs: improve shrink performance of extent nodes"), there is no user of EXT_TREE_VEC_SIZE, just kill it for cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1f692f1445d7..5ed85e2b36cd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -519,9 +519,6 @@ enum { #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ -/* vector size for gang look-up from extent cache that consists of radix tree */ -#define EXT_TREE_VEC_SIZE 64 - /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ -- cgit v1.2.3-59-g8ed1b From 80551d1773801d5ebe71803cd8c469b8337992e0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Jul 2018 20:41:46 +0800 Subject: f2fs: clean up with get_current_nat_page Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 69d0ac1b6cac..9d9f4c9750c4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -112,25 +112,22 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { - pgoff_t index = current_nat_addr(sbi, nid); - return f2fs_get_meta_page_nofail(sbi, index); + return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid)); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { struct page *src_page; struct page *dst_page; - pgoff_t src_off; pgoff_t dst_off; void *src_addr; void *dst_addr; struct f2fs_nm_info *nm_i = NM_I(sbi); - src_off = current_nat_addr(sbi, nid); - dst_off = next_nat_addr(sbi, src_off); + dst_off = next_nat_addr(sbi, current_nat_addr(sbi, nid)); /* get current nat block page with lock */ - src_page = f2fs_get_meta_page(sbi, src_off); + src_page = get_current_nat_page(sbi, nid); dst_page = f2fs_grab_meta_page(sbi, dst_off); f2fs_bug_on(sbi, PageDirty(src_page)); -- cgit v1.2.3-59-g8ed1b 
From 5b72d5e0df787c38cbd9c3e31c5ed197928ef612 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Jul 2018 20:41:47 +0800 Subject: f2fs: clean up with f2fs_encrypted_inode() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d41f2138b2a9..22c1e5a72855 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1615,7 +1615,7 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags = fi->i_flags; - if (file_is_encrypt(inode)) + if (f2fs_encrypted_inode(inode)) flags |= F2FS_ENCRYPT_FL; if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) flags |= F2FS_INLINE_DATA_FL; -- cgit v1.2.3-59-g8ed1b From 2079f115e7b6172eb65d2338136117c37a0e284f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Jul 2018 20:41:48 +0800 Subject: f2fs: clean up with f2fs_is_{atomic,volatile}_file() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 65dcde1d4fb8..0b6827dba25e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2720,8 +2720,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - is_inode_flag_set(inode, FI_ATOMIC_FILE) || - is_inode_flag_set(inode, FI_VOLATILE_FILE)) + f2fs_is_atomic_file(inode) || + f2fs_is_volatile_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { -- cgit v1.2.3-59-g8ed1b From 059c0648c6aeb254f5258eb6b058949ef49cb993 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Jul 2018 20:41:49 +0800 Subject: f2fs: clean up ioctl interface naming Remove the redundant prefix 'f2fs_' in the middle of f2fs_ioc_f2fs_write_checkpoint().
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 22c1e5a72855..ff2cb8fb6934 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2129,7 +2129,7 @@ out: return ret; } -static int f2fs_ioc_f2fs_write_checkpoint(struct file *filp, unsigned long arg) +static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2910,7 +2910,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT_RANGE: return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: - return f2fs_ioc_f2fs_write_checkpoint(filp, arg); + return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: -- cgit v1.2.3-59-g8ed1b From e6b0b159cf2f62de69561f585fe8515d3d9189d4 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 19 Jul 2018 14:57:14 +0800 Subject: f2fs: fix wrong kernel message when recover fsync data on ro fs This patch fixes the wrong message printed when recovering fsync data on a readonly fs.
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 956f34c87082..64e5a59a270a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -639,7 +639,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) #endif if (s_flags & SB_RDONLY) { - f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + f2fs_msg(sbi->sb, KERN_INFO, + "recover fsync data on readonly fs"); sbi->sb->s_flags &= ~SB_RDONLY; } -- cgit v1.2.3-59-g8ed1b From 797c1cb56ba58bf42742e9446226345a6216d832 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 19 Jul 2018 23:57:54 +0800 Subject: f2fs: restrict setting up inode.i_advise To help f2fs recognize hot/cold files, specific bits in inode.i_advise can be set through setxattr(). However, several bits are used internally, such as encrypt_bit and keep_size_bit, and they should never be changed through setxattr(). So this patch 1) adds FADVISE_MODIFIABLE_BITS to filter the modifiable bits given by the user, and 2) supports clearing the {hot,cold}_file bits.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/xattr.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5ed85e2b36cd..ff8f8d93acac 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -604,6 +604,8 @@ enum { #define FADVISE_HOT_BIT 0x20 #define FADVISE_VERITY_BIT 0x40 /* reserved */ +#define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) + #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) #define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 4b34244dcc69..77a010e625f5 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -94,12 +94,22 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { + unsigned char old_advise = F2FS_I(inode)->i_advise; + unsigned char new_advise; + if (!inode_owner_or_capable(inode)) return -EPERM; if (value == NULL) return -EINVAL; - F2FS_I(inode)->i_advise |= *(char *)value; + new_advise = *(char *)value; + if (new_advise & ~FADVISE_MODIFIABLE_BITS) + return -EINVAL; + + new_advise = new_advise & FADVISE_MODIFIABLE_BITS; + new_advise |= old_advise & ~FADVISE_MODIFIABLE_BITS; + + F2FS_I(inode)->i_advise = new_advise; f2fs_mark_inode_dirty_sync(inode, true); return 0; } -- cgit v1.2.3-59-g8ed1b From 455e3a5887ee7ebec5c885a8f398c2c3c0a33165 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 27 Jul 2018 18:15:11 +0900 Subject: f2fs: don't allow any writes on aborted atomic writes In order to prevent abusing atomic writes by abnormal users, we've added a threshold, 20% over memory footprint, which disallows further atomic writes. Previously, however, SQLite doesn't know the files became normal, so that it could write stale data and commit on revoked normal database file. 
Once f2fs detects such the abnormal behavior, this patch tries to avoid further writes in write_begin(). Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 +++-- fs/f2fs/file.c | 7 ++++++- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7f860405cd6e..5d152de30449 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2291,8 +2291,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); - if (f2fs_is_atomic_file(inode) && - !f2fs_available_free_memory(sbi, INMEM_PAGES)) { + if ((f2fs_is_atomic_file(inode) && + !f2fs_available_free_memory(sbi, INMEM_PAGES)) || + is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { err = -ENOMEM; drop_atomic = true; goto fail; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ff2cb8fb6934..c2c47f3248c4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1708,8 +1708,11 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (f2fs_is_atomic_file(inode)) + if (f2fs_is_atomic_file(inode)) { + if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) + ret = -EINVAL; goto out; + } ret = f2fs_convert_inline_inode(inode); if (ret) @@ -1871,6 +1874,8 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + inode_unlock(inode); mnt_drop_write_file(filp); -- cgit v1.2.3-59-g8ed1b From ad6672bbc527727dc8968e8d92687f55ae928ce5 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 19 Jul 2018 20:58:15 +0800 Subject: f2fs: issue discard align to section in LFS mode For the case when sbi->segs_per_sec > 1 with lfs mode, take section:segment = 5 for example, if the section prefree_map is ...previous section | current section (1 1 0 1 1) | next section..., then the start = x, end = x + 1, after start = start_segno 
+ sbi->segs_per_sec, start = x + 5, then it will skip x + 3 and x + 4, but their bitmap is still set, which will cause duplicated f2fs_issue_discard of this same section in the next write_checkpoint: round 1: section bitmap : 1 1 1 1 1, all valid, prefree_map: 0 0 0 0 0 then rm data block NO.2, block NO.2 becomes invalid, prefree_map: 0 0 1 0 0 write_checkpoint: section bitmap: 1 1 0 1 1, prefree_map: 0 0 0 0 0, prefree of NO.2 is cleared, and no discard issued round 2: rm data block NO.0, NO.1, NO.3, NO.4 all invalid, but prefree bit of NO.2 is set and cleared in round 1, then prefree_map: 1 1 0 1 1 write_checkpoint: section bitmap: 0 0 0 0 0, prefree_map: 0 0 0 1 1, no valid blocks of this section, so discard issued, but this time prefree bit of NO.3 and NO.4 is skipped due to start = start_segno + sbi->segs_per_sec; round 3: write_checkpoint: section bitmap: 0 0 0 0 0, prefree_map: 0 0 0 1 1 -> 0 0 0 0 0, no valid blocks of this section, so discard issued, this time prefree bit of NO.3 and NO.4 is cleared, but the discard of this section is sent again... To fix this problem, we can align the start and end value to section boundary for fstrim and real-time discard operation, and decide to issue discard only when the whole section is invalid, which can issue discard aligned to section size as much as possible and avoid redundant discard. 
Signed-off-by: Yunlong Song Signed-off-by: Chao Yu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0b6827dba25e..631e15345752 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1715,21 +1715,30 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); + bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; + + if (need_align && end != -1) + end--; start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); if (start >= MAIN_SEGS(sbi)) break; end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), start + 1); - for (i = start; i < end; i++) - clear_bit(i, prefree_map); + if (need_align) { + start = rounddown(start, sbi->segs_per_sec); + end = roundup(end, sbi->segs_per_sec); + } - dirty_i->nr_dirty[PRE] -= end - start; + for (i = start; i < end; i++) { + if (test_and_clear_bit(i, prefree_map)) + dirty_i->nr_dirty[PRE]--; + } if (!test_opt(sbi, DISCARD)) continue; @@ -2516,6 +2525,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; + bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -2533,6 +2543,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? 
MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); + if (need_align) { + start_segno = rounddown(start_segno, sbi->segs_per_sec); + end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1; + } cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); -- cgit v1.2.3-59-g8ed1b From fd8c8caf7e7c8261a92ce0f7f2cd0adb8afd9e0d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Jul 2018 19:16:21 +0800 Subject: f2fs: let checkpoint flush dnode page of regular Fsyncer will wait on all dnode pages of regular writeback before flushing, if there are async dnode pages blocked by IO scheduler, it may decrease fsync's performance. In this patch, we choose to let f2fs_balance_fs_bg() to trigger checkpoint to flush these dnode pages of regular, so async IO of dnode page can be elimitnated, making fsyncer only need to wait for sync IO. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 8 +++++++- fs/f2fs/node.h | 5 +++++ fs/f2fs/segment.c | 4 +++- 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9d9f4c9750c4..6055d2d12640 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1410,6 +1410,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; + if (wbc->sync_mode == WB_SYNC_NONE && + IS_DNODE(page) && is_cold_node(page)) + goto redirty_out; + /* get old block addr of this node page */ nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); @@ -1727,10 +1731,12 @@ continue_unlock: } if (step < 2) { + if (wbc->sync_mode == WB_SYNC_NONE && step == 1) + goto out; step++; goto next_step; } - +out: if (nwritten) f2fs_submit_merged_write(sbi, NODE); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 8f34bdffde93..0f4db7a61254 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -135,6 +135,11 @@ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) return 
NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; } +static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) +{ + return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8; +} + enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 631e15345752..3662e1f429b4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -509,7 +509,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else f2fs_build_free_nids(sbi, false, false); - if (!is_idle(sbi) && !excess_dirty_nats(sbi)) + if (!is_idle(sbi) && + (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi))) return; /* checkpoint is the only way to shrink partial cached entries */ @@ -517,6 +518,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) !f2fs_available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || + excess_dirty_nodes(sbi) || f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; -- cgit v1.2.3-59-g8ed1b From 970e348d9829e9fc3091da2fe189ab34a07201e6 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 23 Jul 2018 22:10:22 +0800 Subject: f2fs: add proc entry to show victim_secmap bitmap This patch adds a new proc entry to show victim_secmap information in more detail, which is very helpful to know the get_victim candidate status clearly, and helpful to debug problems (e.g., some sections can not gc all of its blocks, since some blocks belong to atomic file, leaving victim_secmap with section bit setting, in extrem case, this will lead all bytes of victim_secmap setting with 0xff). 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index bca1236fd6fa..f22782a0defe 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -615,6 +615,28 @@ static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + int i; + + seq_puts(seq, "format: victim_secmap bitmaps\n"); + + for (i = 0; i < MAIN_SECS(sbi); i++) { + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d", test_bit(i, dirty_i->victim_secmap) ? 1 : 0); + if ((i % 10) == 9 || i == (MAIN_SECS(sbi) - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + return 0; +} + int __init f2fs_init_sysfs(void) { int ret; @@ -664,6 +686,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) segment_bits_seq_show, sb); proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc, iostat_info_seq_show, sb); + proc_create_single_data("victim_bits", S_IRUGO, sbi->s_proc, + victim_bits_seq_show, sb); } return 0; } @@ -674,6 +698,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("iostat_info", sbi->s_proc); remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry("victim_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); -- cgit v1.2.3-59-g8ed1b From 955ac6e523444493298016523788bb7ec1a2f06e Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 24 Jul 2018 20:17:53 +0800 Subject: f2fs: quota: decrease the lock granularity of statfs_project According to fs/quota/dquot.c, `dq_data_lock' protects mem_dqinfo structures and modifications of dquot 
pointers in the inode, and `dquot->dq_dqb_lock' protects data from dq_dqb. We should use dquot->dq_dqb_lock in statfs_project instead of dq_data_lock. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 609ea8736dbf..bdea57067bc0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1117,7 +1117,7 @@ static int f2fs_statfs_project(struct super_block *sb, dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); limit = (dquot->dq_dqb.dqb_bsoftlimit ? dquot->dq_dqb.dqb_bsoftlimit : @@ -1140,7 +1140,7 @@ static int f2fs_statfs_project(struct super_block *sb, (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; } - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } -- cgit v1.2.3-59-g8ed1b From 76cf05d79c9ef02d5afa77abaf9b109afde794c4 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 26 Jul 2018 19:24:25 +0800 Subject: f2fs: quota: fix incorrect comments Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++++- fs/f2fs/super.c | 5 +---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c5fd318c06d2..3f11b3abd766 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -661,7 +661,10 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) /* Needed for iput() to work correctly and not trash data */ sbi->sb->s_flags |= SB_ACTIVE; - /* Turn on quotas so that they are updated correctly */ + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly.
+ */ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bdea57067bc0..26a767355104 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2955,10 +2955,7 @@ try_onemore: goto free_root_inode; #ifdef CONFIG_QUOTA - /* - * Turn on quotas which were not enabled for read-only mounts if - * filesystem has quota feature, so that they are updated correctly. - */ + /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); if (err) { -- cgit v1.2.3-59-g8ed1b From 00960c2cd8f169e38700956d3e7ff07bfa4d7b3b Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 24 Jul 2018 20:17:52 +0800 Subject: f2fs: quota: do not mount as RDWR without QUOTA if quota feature enabled If quota feature is enabled, quota is on by default. However, if CONFIG_QUOTA is not built in kernel, dquot entries will not get updated, which leads to quota inconsistency. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 26a767355104..d10c9a57a15d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -346,12 +346,6 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) "QUOTA feature is enabled, so ignore jquota_fmt"); F2FS_OPTION(sbi).s_jquota_fmt = 0; } - if (f2fs_sb_has_quota_ino(sbi->sb) && f2fs_readonly(sbi->sb)) { - f2fs_msg(sbi->sb, KERN_INFO, - "Filesystem with quota feature cannot be mounted RDWR " - "without CONFIG_QUOTA"); - return -1; - } return 0; } #endif @@ -774,6 +768,13 @@ static int parse_options(struct super_block *sb, char *options) #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; +#else + if (f2fs_sb_has_quota_ino(sbi->sb) && !f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Filesystem with quota feature cannot be mounted RDWR " 
+ "without CONFIG_QUOTA"); + return -EINVAL; + } #endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { -- cgit v1.2.3-59-g8ed1b From 4ddc1b28aac57a90c6426d55e0dea3c1b5eb4782 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Jul 2018 07:19:48 +0800 Subject: f2fs: fix to restrict mount condition when without CONFIG_QUOTA Like quota_ino feature, we need to reject mounting RDWR with image which enables project_quota feature when there is no CONFIG_QUOTA be set in kernel. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d10c9a57a15d..449b09498ddb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -775,6 +775,12 @@ static int parse_options(struct super_block *sb, char *options) "without CONFIG_QUOTA"); return -EINVAL; } + if (f2fs_sb_has_project_quota(sbi->sb) && !f2fs_readonly(sbi->sb)) { + f2fs_msg(sb, KERN_ERR, + "Filesystem with project quota feature cannot be " + "mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } #endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { -- cgit v1.2.3-59-g8ed1b From 18767e62639622554c3642a6e3f3b0ca19bc1d9d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 27 Jul 2018 18:15:13 +0800 Subject: f2fs: don't keep meta pages used for block migration For migration of encrypted inode's block, we load data of encrypted block into meta inode's page cache, after checkpoint, those all intermediate pages should be clean, and no one will read them again, so let's just release them for more memory. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3f11b3abd766..1425aced91ec 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1408,6 +1408,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) commit_checkpoint(sbi, ckpt, start_blk); wait_on_all_pages_writeback(sbi); + /* + * invalidate intermediate page cache borrowed from meta inode + * which are used for migration of encrypted inode's blocks. + */ + if (f2fs_sb_has_encrypt(sbi->sb)) + invalidate_mapping_pages(META_MAPPING(sbi), + MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); + f2fs_release_ino_entry(sbi, false); clear_sbi_flag(sbi, SBI_IS_DIRTY); -- cgit v1.2.3-59-g8ed1b From 82cf4f132e6d16dca6fc3bd955019246141bc645 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 27 Jul 2018 18:15:14 +0800 Subject: f2fs: fix to active page in lru list for read path If config CONFIG_F2FS_FAULT_INJECTION is on, for both read or write path we will call find_lock_page() to get the page, but for read path, it missed to passing FGP_ACCESSED to allocator to active the page in LRU list, result in being reclaimed in advance incorrectly, fix it. 
Reported-by: Xianrong Zhou Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ff8f8d93acac..a9447c7d6570 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1964,8 +1964,13 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { #ifdef CONFIG_F2FS_FAULT_INJECTION - struct page *page = find_lock_page(mapping, index); + struct page *page; + if (!for_write) + page = find_get_page_flags(mapping, index, + FGP_LOCK | FGP_ACCESSED); + else + page = find_lock_page(mapping, index); if (page) return page; -- cgit v1.2.3-59-g8ed1b From 66110abc4c931f879d70e83e1281f891699364bf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 29 Jul 2018 12:16:59 +0800 Subject: f2fs: fix to clear PG_checked flag in set_page_dirty() PG_checked flag will be set on data page during GC, later, we can recognize such page by the flag and migrate page to cold segment. But previously, we don't clear this flag when invalidating data page, after page redirtying, we will write it into wrong log. Let's clear PG_checked flag in set_page_dirty() to avoid this. 
Signed-off-by: Weichao Guo Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5d152de30449..b7986b2e5d1d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2542,6 +2542,10 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); + /* don't remain PG_checked flag which was set during GC */ + if (is_cold_data(page)) + clear_cold_data(page); + if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { f2fs_register_inmem_page(inode, page); -- cgit v1.2.3-59-g8ed1b From 8d714f8aa34cf6a49b8d8d291ab01a59699604bd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 31 Jul 2018 09:09:01 -0700 Subject: f2fs: avoid f2fs_bug_on() in cp_error case There is a subtle race condition to invoke f2fs_bug_on() in shutdown tests. I've confirmed that the last checkpoint is preserved in consistent state, so it'd be fine to just return error at this moment. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 6055d2d12640..21ffb784764c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1075,6 +1075,10 @@ int f2fs_remove_inode_page(struct inode *inode) f2fs_truncate_data_blocks_range(&dn, 1); /* 0 is possible, after f2fs_new_inode() has failed */ + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + f2fs_put_dnode(&dn); + return -EIO; + } f2fs_bug_on(F2FS_I_SB(inode), inode->i_blocks != 0 && inode->i_blocks != 8); -- cgit v1.2.3-59-g8ed1b From e494c2f995d6181d6e29c4927d68e0f295ecf75b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 1 Aug 2018 19:16:11 +0800 Subject: f2fs: fix to do sanity check with cp_pack_start_sum After fuzzing, cp_pack_start_sum could be corrupted, so current log's summary info should be wrong due to loading incorrect summary block. Then, if segment's type in current log is exceeded NR_CURSEG_TYPE, it can lead accessing invalid dirty_i->dirty_segmap bitmap finally. Add sanity check for cp_pack_start_sum to fix this issue. 
https://bugzilla.kernel.org/show_bug.cgi?id=200419 - Reproduce - Kernel message (f2fs-dev w/ KASAN) [ 3117.578432] F2FS-fs (loop0): Invalid log blocks per segment (8) [ 3117.578445] F2FS-fs (loop0): Can't find valid F2FS filesystem in 2th superblock [ 3117.581364] F2FS-fs (loop0): invalid crc_offset: 30716 [ 3117.583564] WARNING: CPU: 1 PID: 1225 at fs/f2fs/checkpoint.c:90 __get_meta_page+0x448/0x4b0 [ 3117.583570] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer joydev input_leds serio_raw snd soundcore mac_hid i2c_piix4 ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi btrfs zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear 8139too qxl ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel psmouse aes_x86_64 8139cp crypto_simd cryptd mii glue_helper pata_acpi floppy [ 3117.584014] CPU: 1 PID: 1225 Comm: mount Not tainted 4.17.0+ #1 [ 3117.584017] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 3117.584022] RIP: 0010:__get_meta_page+0x448/0x4b0 [ 3117.584023] Code: 00 49 8d bc 24 84 00 00 00 e8 74 54 da ff 41 83 8c 24 84 00 00 00 08 4c 89 f6 4c 89 ef e8 c0 d9 95 00 48 89 ef e8 18 e3 00 00 <0f> 0b f0 80 4d 48 04 e9 0f fe ff ff 0f 0b 48 89 c7 48 89 04 24 e8 [ 3117.584072] RSP: 0018:ffff88018eb678c0 EFLAGS: 00010286 [ 3117.584082] RAX: ffff88018f0a6a78 RBX: ffffea0007a46600 RCX: ffffffff9314d1b2 [ 3117.584085] RDX: ffffffff00000001 RSI: 0000000000000000 RDI: ffff88018f0a6a98 [ 3117.584087] RBP: ffff88018ebe9980 R08: 0000000000000002 R09: 0000000000000001 [ 3117.584090] R10: 0000000000000001 R11: ffffed00326e4450 R12: ffff880193722200 [ 3117.584092] R13: ffff88018ebe9afc R14: 0000000000000206 R15: ffff88018eb67900 [ 
3117.584096] FS: 00007f5694636840(0000) GS:ffff8801f3b00000(0000) knlGS:0000000000000000 [ 3117.584098] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3117.584101] CR2: 00000000016f21b8 CR3: 0000000191c22000 CR4: 00000000000006e0 [ 3117.584112] Call Trace: [ 3117.584121] ? f2fs_set_meta_page_dirty+0x150/0x150 [ 3117.584127] ? f2fs_build_segment_manager+0xbf9/0x3190 [ 3117.584133] ? f2fs_npages_for_summary_flush+0x75/0x120 [ 3117.584145] f2fs_build_segment_manager+0xda8/0x3190 [ 3117.584151] ? f2fs_get_valid_checkpoint+0x298/0xa00 [ 3117.584156] ? f2fs_flush_sit_entries+0x10e0/0x10e0 [ 3117.584184] ? map_id_range_down+0x17c/0x1b0 [ 3117.584188] ? __put_user_ns+0x30/0x30 [ 3117.584206] ? find_next_bit+0x53/0x90 [ 3117.584237] ? cpumask_next+0x16/0x20 [ 3117.584249] f2fs_fill_super+0x1948/0x2b40 [ 3117.584258] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.584279] ? sget_userns+0x65e/0x690 [ 3117.584296] ? set_blocksize+0x88/0x130 [ 3117.584302] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.584305] mount_bdev+0x1c0/0x200 [ 3117.584310] mount_fs+0x5c/0x190 [ 3117.584320] vfs_kern_mount+0x64/0x190 [ 3117.584330] do_mount+0x2e4/0x1450 [ 3117.584343] ? lockref_put_return+0x130/0x130 [ 3117.584347] ? copy_mount_string+0x20/0x20 [ 3117.584357] ? kasan_unpoison_shadow+0x31/0x40 [ 3117.584362] ? kasan_kmalloc+0xa6/0xd0 [ 3117.584373] ? memcg_kmem_put_cache+0x16/0x90 [ 3117.584377] ? __kmalloc_track_caller+0x196/0x210 [ 3117.584383] ? _copy_from_user+0x61/0x90 [ 3117.584396] ? 
memdup_user+0x3e/0x60 [ 3117.584401] ksys_mount+0x7e/0xd0 [ 3117.584405] __x64_sys_mount+0x62/0x70 [ 3117.584427] do_syscall_64+0x73/0x160 [ 3117.584440] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 3117.584455] RIP: 0033:0x7f5693f14b9a [ 3117.584456] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 3117.584505] RSP: 002b:00007fff27346488 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 3117.584510] RAX: ffffffffffffffda RBX: 00000000016e2030 RCX: 00007f5693f14b9a [ 3117.584512] RDX: 00000000016e2210 RSI: 00000000016e3f30 RDI: 00000000016ee040 [ 3117.584514] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 3117.584516] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 00000000016ee040 [ 3117.584519] R13: 00000000016e2210 R14: 0000000000000000 R15: 0000000000000003 [ 3117.584523] ---[ end trace a8e0d899985faf31 ]--- [ 3117.685663] F2FS-fs (loop0): f2fs_check_nid_range: out-of-range nid=2, run fsck to fix. [ 3117.685673] F2FS-fs (loop0): recover_data: ino = 2 (i_size: recover) recovered = 1, err = 0 [ 3117.685707] ================================================================== [ 3117.685955] BUG: KASAN: slab-out-of-bounds in __remove_dirty_segment+0xdd/0x1e0 [ 3117.686175] Read of size 8 at addr ffff88018f0a63d0 by task mount/1225 [ 3117.686477] CPU: 0 PID: 1225 Comm: mount Tainted: G W 4.17.0+ #1 [ 3117.686481] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 3117.686483] Call Trace: [ 3117.686494] dump_stack+0x71/0xab [ 3117.686512] print_address_description+0x6b/0x290 [ 3117.686517] kasan_report+0x28e/0x390 [ 3117.686522] ? 
__remove_dirty_segment+0xdd/0x1e0 [ 3117.686527] __remove_dirty_segment+0xdd/0x1e0 [ 3117.686532] locate_dirty_segment+0x189/0x190 [ 3117.686538] f2fs_allocate_new_segments+0xa9/0xe0 [ 3117.686543] recover_data+0x703/0x2c20 [ 3117.686547] ? f2fs_recover_fsync_data+0x48f/0xd50 [ 3117.686553] ? ksys_mount+0x7e/0xd0 [ 3117.686564] ? policy_nodemask+0x1a/0x90 [ 3117.686567] ? policy_node+0x56/0x70 [ 3117.686571] ? add_fsync_inode+0xf0/0xf0 [ 3117.686592] ? blk_finish_plug+0x44/0x60 [ 3117.686597] ? f2fs_ra_meta_pages+0x38b/0x5e0 [ 3117.686602] ? find_inode_fast+0xac/0xc0 [ 3117.686606] ? f2fs_is_valid_blkaddr+0x320/0x320 [ 3117.686618] ? __radix_tree_lookup+0x150/0x150 [ 3117.686633] ? dqget+0x670/0x670 [ 3117.686648] ? pagecache_get_page+0x29/0x410 [ 3117.686656] ? kmem_cache_alloc+0x176/0x1e0 [ 3117.686660] ? f2fs_is_valid_blkaddr+0x11d/0x320 [ 3117.686664] f2fs_recover_fsync_data+0xc23/0xd50 [ 3117.686670] ? f2fs_space_for_roll_forward+0x60/0x60 [ 3117.686674] ? rb_insert_color+0x323/0x3d0 [ 3117.686678] ? f2fs_recover_orphan_inodes+0xa5/0x700 [ 3117.686683] ? proc_register+0x153/0x1d0 [ 3117.686686] ? f2fs_remove_orphan_inode+0x10/0x10 [ 3117.686695] ? f2fs_attr_store+0x50/0x50 [ 3117.686700] ? proc_create_single_data+0x52/0x60 [ 3117.686707] f2fs_fill_super+0x1d06/0x2b40 [ 3117.686728] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.686735] ? sget_userns+0x65e/0x690 [ 3117.686740] ? set_blocksize+0x88/0x130 [ 3117.686745] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.686748] mount_bdev+0x1c0/0x200 [ 3117.686753] mount_fs+0x5c/0x190 [ 3117.686758] vfs_kern_mount+0x64/0x190 [ 3117.686762] do_mount+0x2e4/0x1450 [ 3117.686769] ? lockref_put_return+0x130/0x130 [ 3117.686773] ? copy_mount_string+0x20/0x20 [ 3117.686777] ? kasan_unpoison_shadow+0x31/0x40 [ 3117.686780] ? kasan_kmalloc+0xa6/0xd0 [ 3117.686786] ? memcg_kmem_put_cache+0x16/0x90 [ 3117.686790] ? __kmalloc_track_caller+0x196/0x210 [ 3117.686795] ? _copy_from_user+0x61/0x90 [ 3117.686801] ? 
memdup_user+0x3e/0x60 [ 3117.686804] ksys_mount+0x7e/0xd0 [ 3117.686809] __x64_sys_mount+0x62/0x70 [ 3117.686816] do_syscall_64+0x73/0x160 [ 3117.686824] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 3117.686829] RIP: 0033:0x7f5693f14b9a [ 3117.686830] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 3117.686887] RSP: 002b:00007fff27346488 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 3117.686892] RAX: ffffffffffffffda RBX: 00000000016e2030 RCX: 00007f5693f14b9a [ 3117.686894] RDX: 00000000016e2210 RSI: 00000000016e3f30 RDI: 00000000016ee040 [ 3117.686896] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 3117.686899] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 00000000016ee040 [ 3117.686901] R13: 00000000016e2210 R14: 0000000000000000 R15: 0000000000000003 [ 3117.687005] Allocated by task 1225: [ 3117.687152] kasan_kmalloc+0xa6/0xd0 [ 3117.687157] kmem_cache_alloc_trace+0xfd/0x200 [ 3117.687161] f2fs_build_segment_manager+0x2d09/0x3190 [ 3117.687165] f2fs_fill_super+0x1948/0x2b40 [ 3117.687168] mount_bdev+0x1c0/0x200 [ 3117.687171] mount_fs+0x5c/0x190 [ 3117.687174] vfs_kern_mount+0x64/0x190 [ 3117.687177] do_mount+0x2e4/0x1450 [ 3117.687180] ksys_mount+0x7e/0xd0 [ 3117.687182] __x64_sys_mount+0x62/0x70 [ 3117.687186] do_syscall_64+0x73/0x160 [ 3117.687190] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 3117.687285] Freed by task 19: [ 3117.687412] __kasan_slab_free+0x137/0x190 [ 3117.687416] kfree+0x8b/0x1b0 [ 3117.687460] ttm_bo_man_put_node+0x61/0x80 [ttm] [ 3117.687476] ttm_bo_cleanup_refs+0x15f/0x250 [ttm] [ 3117.687492] ttm_bo_delayed_delete+0x2f0/0x300 [ttm] [ 3117.687507] ttm_bo_delayed_workqueue+0x17/0x50 [ttm] [ 3117.687528] process_one_work+0x2f9/0x740 [ 3117.687531] worker_thread+0x78/0x6b0 [ 3117.687541] kthread+0x177/0x1c0 [ 3117.687545] ret_from_fork+0x35/0x40 [ 3117.687638] The 
buggy address belongs to the object at ffff88018f0a6300 which belongs to the cache kmalloc-192 of size 192 [ 3117.688014] The buggy address is located 16 bytes to the right of 192-byte region [ffff88018f0a6300, ffff88018f0a63c0) [ 3117.688382] The buggy address belongs to the page: [ 3117.688554] page:ffffea00063c2980 count:1 mapcount:0 mapping:ffff8801f3403180 index:0x0 [ 3117.688788] flags: 0x17fff8000000100(slab) [ 3117.688944] raw: 017fff8000000100 ffffea00063c2840 0000000e0000000e ffff8801f3403180 [ 3117.689166] raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000 [ 3117.689386] page dumped because: kasan: bad access detected [ 3117.689653] Memory state around the buggy address: [ 3117.689816] ffff88018f0a6280: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3117.690027] ffff88018f0a6300: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 3117.690239] >ffff88018f0a6380: 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 3117.690448] ^ [ 3117.690644] ffff88018f0a6400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 3117.690868] ffff88018f0a6480: 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 3117.691077] ================================================================== [ 3117.691290] Disabling lock debugging due to kernel taint [ 3117.693893] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [ 3117.694120] PGD 80000001f01bc067 P4D 80000001f01bc067 PUD 1d9638067 PMD 0 [ 3117.694338] Oops: 0002 [#1] SMP KASAN PTI [ 3117.694490] CPU: 1 PID: 1225 Comm: mount Tainted: G B W 4.17.0+ #1 [ 3117.694703] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 3117.695073] RIP: 0010:__remove_dirty_segment+0xe2/0x1e0 [ 3117.695246] Code: c4 48 89 c7 e8 cf bb d7 ff 45 0f b6 24 24 41 83 e4 3f 44 88 64 24 07 41 83 e4 3f 4a 8d 7c e3 08 e8 b3 bc d7 ff 4a 8b 4c e3 08 4c 0f b3 29 0f 82 94 00 00 00 48 8d bd 20 04 00 00 e8 97 bb d7 [ 3117.695793] RSP: 0018:ffff88018eb67638 EFLAGS: 00010292 [ 
3117.695969] RAX: 0000000000000000 RBX: ffff88018f0a6300 RCX: 0000000000000000 [ 3117.696182] RDX: 0000000000000000 RSI: 0000000000000297 RDI: 0000000000000297 [ 3117.696391] RBP: ffff88018ebe9980 R08: ffffed003e743ebb R09: ffffed003e743ebb [ 3117.696604] R10: 0000000000000001 R11: ffffed003e743eba R12: 0000000000000019 [ 3117.696813] R13: 0000000000000014 R14: 0000000000000320 R15: ffff88018ebe99e0 [ 3117.697032] FS: 00007f5694636840(0000) GS:ffff8801f3b00000(0000) knlGS:0000000000000000 [ 3117.697280] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3117.702357] CR2: 00007fe89bb1a000 CR3: 0000000191c22000 CR4: 00000000000006e0 [ 3117.707235] Call Trace: [ 3117.712077] locate_dirty_segment+0x189/0x190 [ 3117.716891] f2fs_allocate_new_segments+0xa9/0xe0 [ 3117.721617] recover_data+0x703/0x2c20 [ 3117.726316] ? f2fs_recover_fsync_data+0x48f/0xd50 [ 3117.730957] ? ksys_mount+0x7e/0xd0 [ 3117.735573] ? policy_nodemask+0x1a/0x90 [ 3117.740198] ? policy_node+0x56/0x70 [ 3117.744829] ? add_fsync_inode+0xf0/0xf0 [ 3117.749487] ? blk_finish_plug+0x44/0x60 [ 3117.754152] ? f2fs_ra_meta_pages+0x38b/0x5e0 [ 3117.758831] ? find_inode_fast+0xac/0xc0 [ 3117.763448] ? f2fs_is_valid_blkaddr+0x320/0x320 [ 3117.768046] ? __radix_tree_lookup+0x150/0x150 [ 3117.772603] ? dqget+0x670/0x670 [ 3117.777159] ? pagecache_get_page+0x29/0x410 [ 3117.781648] ? kmem_cache_alloc+0x176/0x1e0 [ 3117.786067] ? f2fs_is_valid_blkaddr+0x11d/0x320 [ 3117.790476] f2fs_recover_fsync_data+0xc23/0xd50 [ 3117.794790] ? f2fs_space_for_roll_forward+0x60/0x60 [ 3117.799086] ? rb_insert_color+0x323/0x3d0 [ 3117.803304] ? f2fs_recover_orphan_inodes+0xa5/0x700 [ 3117.807563] ? proc_register+0x153/0x1d0 [ 3117.811766] ? f2fs_remove_orphan_inode+0x10/0x10 [ 3117.815947] ? f2fs_attr_store+0x50/0x50 [ 3117.820087] ? proc_create_single_data+0x52/0x60 [ 3117.824262] f2fs_fill_super+0x1d06/0x2b40 [ 3117.828367] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.832432] ? sget_userns+0x65e/0x690 [ 3117.836500] ? 
set_blocksize+0x88/0x130 [ 3117.840501] ? f2fs_commit_super+0x1a0/0x1a0 [ 3117.844420] mount_bdev+0x1c0/0x200 [ 3117.848275] mount_fs+0x5c/0x190 [ 3117.852053] vfs_kern_mount+0x64/0x190 [ 3117.855810] do_mount+0x2e4/0x1450 [ 3117.859441] ? lockref_put_return+0x130/0x130 [ 3117.862996] ? copy_mount_string+0x20/0x20 [ 3117.866417] ? kasan_unpoison_shadow+0x31/0x40 [ 3117.869719] ? kasan_kmalloc+0xa6/0xd0 [ 3117.872948] ? memcg_kmem_put_cache+0x16/0x90 [ 3117.876121] ? __kmalloc_track_caller+0x196/0x210 [ 3117.879333] ? _copy_from_user+0x61/0x90 [ 3117.882467] ? memdup_user+0x3e/0x60 [ 3117.885604] ksys_mount+0x7e/0xd0 [ 3117.888700] __x64_sys_mount+0x62/0x70 [ 3117.891742] do_syscall_64+0x73/0x160 [ 3117.894692] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 3117.897669] RIP: 0033:0x7f5693f14b9a [ 3117.900563] Code: 48 8b 0d 01 c3 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ce c2 2b 00 f7 d8 64 89 01 48 [ 3117.906922] RSP: 002b:00007fff27346488 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 [ 3117.910159] RAX: ffffffffffffffda RBX: 00000000016e2030 RCX: 00007f5693f14b9a [ 3117.913469] RDX: 00000000016e2210 RSI: 00000000016e3f30 RDI: 00000000016ee040 [ 3117.916764] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000013 [ 3117.920071] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 00000000016ee040 [ 3117.923393] R13: 00000000016e2210 R14: 0000000000000000 R15: 0000000000000003 [ 3117.926680] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer joydev input_leds serio_raw snd soundcore mac_hid i2c_piix4 ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi btrfs zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear 8139too qxl ttm drm_kms_helper 
syscopyarea sysfillrect sysimgblt fb_sys_fops drm crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel psmouse aes_x86_64 8139cp crypto_simd cryptd mii glue_helper pata_acpi floppy [ 3117.949979] CR2: 0000000000000000 [ 3117.954283] ---[ end trace a8e0d899985faf32 ]--- [ 3117.958575] RIP: 0010:__remove_dirty_segment+0xe2/0x1e0 [ 3117.962810] Code: c4 48 89 c7 e8 cf bb d7 ff 45 0f b6 24 24 41 83 e4 3f 44 88 64 24 07 41 83 e4 3f 4a 8d 7c e3 08 e8 b3 bc d7 ff 4a 8b 4c e3 08 4c 0f b3 29 0f 82 94 00 00 00 48 8d bd 20 04 00 00 e8 97 bb d7 [ 3117.971789] RSP: 0018:ffff88018eb67638 EFLAGS: 00010292 [ 3117.976333] RAX: 0000000000000000 RBX: ffff88018f0a6300 RCX: 0000000000000000 [ 3117.980926] RDX: 0000000000000000 RSI: 0000000000000297 RDI: 0000000000000297 [ 3117.985497] RBP: ffff88018ebe9980 R08: ffffed003e743ebb R09: ffffed003e743ebb [ 3117.990098] R10: 0000000000000001 R11: ffffed003e743eba R12: 0000000000000019 [ 3117.994761] R13: 0000000000000014 R14: 0000000000000320 R15: ffff88018ebe99e0 [ 3117.999392] FS: 00007f5694636840(0000) GS:ffff8801f3b00000(0000) knlGS:0000000000000000 [ 3118.004096] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3118.008816] CR2: 00007fe89bb1a000 CR3: 0000000191c22000 CR4: 00000000000006e0 - Location https://elixir.bootlin.com/linux/v4.18-rc3/source/fs/f2fs/segment.c#L775 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; Here dirty_i->dirty_segmap[t] can be NULL which leads to crash in test_and_clear_bit() Reported-by Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 ++++---- fs/f2fs/super.c | 12 ++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1425aced91ec..334c6f90eaf1 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -880,15 +880,15 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint 
*)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); - /* Sanity checking of checkpoint */ - if (f2fs_sanity_check_ckpt(sbi)) - goto free_fail_no_cp; - if (cur_page == cp1) sbi->cur_cp_pack = 1; else sbi->cur_cp_pack = 2; + /* Sanity checking of checkpoint */ + if (f2fs_sanity_check_ckpt(sbi)) + goto free_fail_no_cp; + if (cp_blks <= 1) goto done; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 449b09498ddb..879ff1b22357 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2293,6 +2293,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int sit_bitmap_size, nat_bitmap_size; unsigned int log_blocks_per_seg; unsigned int segment_count_main; + unsigned int cp_pack_start_sum, cp_payload; block_t user_block_count; int i; @@ -2353,6 +2354,17 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + cp_pack_start_sum = __start_sum_addr(sbi); + cp_payload = __cp_payload(sbi); + if (cp_pack_start_sum < cp_payload + 1 || + cp_pack_start_sum > blocks_per_seg - 1 - + NR_CURSEG_TYPE) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong cp_pack_start_sum: %u", + cp_pack_start_sum); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; -- cgit v1.2.3-59-g8ed1b From 6e45f2a59ffb440f28719ab3a68bee5b8e9df16b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 1 Aug 2018 19:51:38 -0500 Subject: f2fs: use true and false for boolean values Return statements in functions returning bool should use true or false instead of an integer value. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a9447c7d6570..75e81b1af2e8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1334,7 +1334,7 @@ static inline bool is_idle(struct f2fs_sb_info *sbi) struct request_list *rl = &q->root_rl; if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) - return 0; + return false; return f2fs_time_over(sbi, REQ_TIME); } @@ -3400,7 +3400,7 @@ static inline bool f2fs_may_encrypt(struct inode *inode) return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #else - return 0; + return false; #endif } -- cgit v1.2.3-59-g8ed1b From 50fa53eccf9f911a5b435248a2b0bd484fd82e5e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 2 Aug 2018 23:03:19 +0800 Subject: f2fs: fix to avoid broken of dnode block list The f2fs recovery flow relies on the dnode block linked list, which means that fsynced file recovery depends on the persistence of the previous dnodes in the list, so during fsync() we should wait for all of a regular inode's dnodes to be written back before issuing a flush. This way, we can avoid the dnode block list being broken by out-of-order IO submission due to the IO scheduler or driver.
Sheng Yong helped to test this patch: Target:/data (f2fs, -) 64MB / 32768KB / 4KB / 8 1 / PERSIST / Index Base: SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS) 1 867.82 204.15 41440.03 41370.54 680.8 1025.94 1031.08 2 871.87 205.87 41370.3 40275.2 791.14 1065.84 1101.7 3 866.52 205.69 41795.67 40596.16 694.69 1037.16 1031.48 Avg 868.7366667 205.2366667 41535.33333 40747.3 722.21 1042.98 1054.753333 After: SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS) 1 798.81 202.5 41143 40613.87 602.71 838.08 913.83 2 805.79 206.47 40297.2 41291.46 604.44 840.75 924.27 3 814.83 206.17 41209.57 40453.62 602.85 834.66 927.91 Avg 806.4766667 205.0466667 40883.25667 40786.31667 603.3333333 837.83 922.0033333 Patched/Original: 0.928332713 0.999074239 0.984300676 1.000957528 0.835398753 0.803303994 0.874141189 It looks like atomic write will suffer a performance regression. I suspect that the culprit is that we are forcing a wait for all dnodes to be in the storage cache before we issue PREFLUSH+FUA. BTW, will commit ("f2fs: don't need to wait for node writes for atomic write") cause the problem: we will lose the data of the last transaction after SPO, even if atomic write returns no error: - atomic_open(); - write() P1, P2, P3; - atomic_commit(); - writeback data: P1, P2, P3; - writeback node: N1, N2, N3; <--- If N1 and N2 are not written back but N3 with fsync_mark is written back, then in SPOR we won't find N3 since the node chain is broken, so we end up losing the last transaction.
- preflush + fua; - power-cut If we don't wait for dnode writeback for atomic_write: SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS) 1 779.91 206.03 41621.5 40333.16 716.9 1038.21 1034.85 2 848.51 204.35 40082.44 39486.17 791.83 1119.96 1083.77 3 772.12 206.27 41335.25 41599.65 723.29 1055.07 971.92 Avg 800.18 205.55 41013.06333 40472.99333 744.0066667 1071.08 1030.18 Patched/Original: 0.92108464 1.001526693 0.987425886 0.993268102 1.030180511 1.026942031 0.976702294 SQLite's performance recovers. Jaegeuk: "Practically, I don't see db corruption because of this. We can excuse to lose the last transaction." Finally, we decided to keep the original implementation of the atomic write interface semantics, in which we don't wait for all dnode writeback before preflush+fua submission. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +-- fs/f2fs/data.c | 2 + fs/f2fs/f2fs.h | 22 +++++++- fs/f2fs/file.c | 5 +- fs/f2fs/node.c | 144 ++++++++++++++++++++++++++++++++++++++++++--------- fs/f2fs/super.c | 6 +++ 6 files changed, 156 insertions(+), 31 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 334c6f90eaf1..551c1d1984ec 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1161,7 +1161,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) f2fs_unlock_all(sbi); } -static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) +void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) { DEFINE_WAIT(wait); @@ -1397,7 +1397,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ - wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages_writeback(sbi); /* flush all device cache */ err = f2fs_flush_device_cache(sbi); @@ -1406,7 +1406,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* barrier
and flush checkpoint cp pack 2 page if it can */ commit_checkpoint(sbi, ckpt, start_blk); - wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages_writeback(sbi); /* * invalidate intermediate page cache borrowed from meta inode @@ -1418,6 +1418,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_release_ino_entry(sbi, false); + f2fs_reset_fsync_node_info(sbi); + clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); __set_cp_next_pack(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b7986b2e5d1d..363520ee099a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -177,6 +177,8 @@ static void f2fs_write_end_io(struct bio *bio) page->index != nid_of_node(page)); dec_page_count(sbi, type); + if (f2fs_in_warm_node_list(sbi, page)) + f2fs_del_fsync_node_entry(sbi, page); clear_cold_data(page); end_page_writeback(page); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 75e81b1af2e8..1647a13be7f9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -228,6 +228,12 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; +struct fsync_node_entry { + struct list_head list; /* list head */ + struct page *page; /* warm node page pointer */ + unsigned int seq_id; /* sequence id */ +}; + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -1156,6 +1162,11 @@ struct f2fs_sb_info { struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ + spinlock_t fsync_node_lock; /* for node entry lock */ + struct list_head fsync_node_list; /* node list head */ + unsigned int fsync_seg_id; /* sequence id */ + unsigned int fsync_node_num; /* number of node entries */ + /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ @@ -2827,6 +2838,10 @@ struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); +bool 
f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); @@ -2836,7 +2851,8 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); @@ -2845,7 +2861,8 @@ struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); void f2fs_move_node_page(struct page *node_page, int gc_type); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, - struct writeback_control *wbc, bool atomic); + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id); int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type); @@ -2962,6 +2979,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_page(struct inode *inode, struct page *page); void f2fs_remove_dirty_inode(struct inode *inode); int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi); int 
f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c2c47f3248c4..a90b4f24aa28 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -213,6 +213,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; + unsigned int seq_id = 0; if (unlikely(f2fs_readonly(inode->i_sb))) return 0; @@ -275,7 +276,7 @@ go_write: } sync_nodes: atomic_inc(&sbi->wb_sync_req[NODE]); - ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic); + ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id); atomic_dec(&sbi->wb_sync_req[NODE]); if (ret) goto out; @@ -301,7 +302,7 @@ sync_nodes: * given fsync mark. */ if (!atomic) { - ret = f2fs_wait_on_node_pages_writeback(sbi, ino); + ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id); if (ret) goto out; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 21ffb784764c..81fb2f3edb52 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -28,6 +28,7 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; static struct kmem_cache *nat_entry_set_slab; +static struct kmem_cache *fsync_node_entry_slab; /* * Check whether the given nid is within node id range. 
@@ -264,6 +265,72 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page) +{ + return NODE_MAPPING(sbi) == page->mapping && + IS_DNODE(page) && is_cold_node(page); +} + +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) +{ + spin_lock_init(&sbi->fsync_node_lock); + INIT_LIST_HEAD(&sbi->fsync_node_list); + sbi->fsync_seg_id = 0; + sbi->fsync_node_num = 0; +} + +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, + struct page *page) +{ + struct fsync_node_entry *fn; + unsigned long flags; + unsigned int seq_id; + + fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS); + + get_page(page); + fn->page = page; + INIT_LIST_HEAD(&fn->list); + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_add_tail(&fn->list, &sbi->fsync_node_list); + fn->seq_id = sbi->fsync_seg_id++; + seq_id = fn->seq_id; + sbi->fsync_node_num++; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + + return seq_id; +} + +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page) +{ + struct fsync_node_entry *fn; + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_for_each_entry(fn, &sbi->fsync_node_list, list) { + if (fn->page == page) { + list_del(&fn->list); + sbi->fsync_node_num--; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + kmem_cache_free(fsync_node_entry_slab, fn); + put_page(page); + return; + } + } + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + f2fs_bug_on(sbi, 1); +} + +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + sbi->fsync_seg_id = 0; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); +} + int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1388,7 +1455,7 @@ continue_unlock: static int __write_node_page(struct page 
*page, bool atomic, bool *submitted, struct writeback_control *wbc, bool do_balance, - enum iostat_type io_type) + enum iostat_type io_type, unsigned int *seq_id) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; @@ -1405,6 +1472,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .io_type = io_type, .io_wbc = wbc, }; + unsigned int seq; trace_f2fs_writepage(page, NODE); @@ -1450,6 +1518,13 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, set_page_writeback(page); ClearPageError(page); + + if (f2fs_in_warm_node_list(sbi, page)) { + seq = f2fs_add_fsync_node_entry(sbi, page); + if (seq_id) + *seq_id = seq; + } + fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); @@ -1497,7 +1572,7 @@ void f2fs_move_node_page(struct page *node_page, int gc_type) goto out_page; if (__write_node_page(node_page, false, NULL, - &wbc, false, FS_GC_NODE_IO)) + &wbc, false, FS_GC_NODE_IO, NULL)) unlock_page(node_page); goto release_page; } else { @@ -1514,11 +1589,13 @@ release_page: static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); + return __write_node_page(page, false, NULL, wbc, false, + FS_NODE_IO, NULL); } int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, - struct writeback_control *wbc, bool atomic) + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id) { pgoff_t index; pgoff_t last_idx = ULONG_MAX; @@ -1599,7 +1676,7 @@ continue_unlock: ret = __write_node_page(page, atomic && page == last_page, &submitted, wbc, true, - FS_NODE_IO); + FS_NODE_IO, seq_id); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1716,7 +1793,7 @@ continue_unlock: set_dentry_mark(page, 0); ret = __write_node_page(page, false, &submitted, - wbc, do_balance, io_type); + wbc, do_balance, io_type, NULL); if (ret) 
unlock_page(page); else if (submitted) @@ -1749,35 +1826,46 @@ out: return ret; } -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id) { - pgoff_t index = 0; - struct pagevec pvec; + struct fsync_node_entry *fn; + struct page *page; + struct list_head *head = &sbi->fsync_node_list; + unsigned long flags; + unsigned int cur_seq_id = 0; int ret2, ret = 0; - int nr_pages; - pagevec_init(&pvec); + while (seq_id && cur_seq_id < seq_id) { + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; + } + fn = list_first_entry(head, struct fsync_node_entry, list); + if (fn->seq_id > seq_id) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; + } + cur_seq_id = fn->seq_id; + page = fn->page; + get_page(page); + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK))) { - int i; + f2fs_wait_on_page_writeback(page, NODE, true); + if (TestClearPageError(page)) + ret = -EIO; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + put_page(page); - if (ino && ino_of_node(page) == ino) { - f2fs_wait_on_page_writeback(page, NODE, true); - if (TestClearPageError(page)) - ret = -EIO; - } - } - pagevec_release(&pvec); - cond_resched(); + if (ret) + break; } ret2 = filemap_check_errors(NODE_MAPPING(sbi)); if (!ret) ret = ret2; + return ret; } @@ -2992,8 +3080,15 @@ int __init f2fs_create_node_manager_caches(void) sizeof(struct nat_entry_set)); if (!nat_entry_set_slab) goto destroy_free_nid; + + fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry", + sizeof(struct fsync_node_entry)); + if (!fsync_node_entry_slab) + goto destroy_nat_entry_set; return 0; +destroy_nat_entry_set: + kmem_cache_destroy(nat_entry_set_slab); destroy_free_nid: 
kmem_cache_destroy(free_nid_slab); destroy_nat_entry: @@ -3004,6 +3099,7 @@ fail: void f2fs_destroy_node_manager_caches(void) { + kmem_cache_destroy(fsync_node_entry_slab); kmem_cache_destroy(nat_entry_set_slab); kmem_cache_destroy(free_nid_slab); kmem_cache_destroy(nat_entry_slab); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 879ff1b22357..b0e2b017f390 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1036,6 +1036,10 @@ static void f2fs_put_super(struct super_block *sb) /* our cp_error case, we can wait for any writeback page */ f2fs_flush_merged_writes(sbi); + f2fs_wait_on_all_pages_writeback(sbi); + + f2fs_bug_on(sbi, sbi->fsync_node_num); + iput(sbi->node_inode); iput(sbi->meta_inode); @@ -2911,6 +2915,8 @@ try_onemore: f2fs_init_ino_entry_info(sbi); + f2fs_init_fsync_node_info(sbi); + /* setup f2fs internal modules */ err = f2fs_build_segment_manager(sbi); if (err) { -- cgit v1.2.3-59-g8ed1b From d3f07c049dab1a3f1740f476afd3d5e5b738c21c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 2 Aug 2018 22:59:12 +0800 Subject: f2fs: fix invalid memory access syzbot found the following crash on: HEAD commit: d9bd94c0bcaa Add linux-next specific files for 20180801 git tree: linux-next console output: https://syzkaller.appspot.com/x/log.txt?x=1001189c400000 kernel config: https://syzkaller.appspot.com/x/.config?x=cc8964ea4d04518c dashboard link: https://syzkaller.appspot.com/bug?extid=c966a82db0b14aa37e81 compiler: gcc (GCC) 8.0.1 20180413 (experimental) Unfortunately, I don't have any reproducer for this crash yet. IMPORTANT: if you fix the bug, please add the following tag to the commit: Reported-by: syzbot+c966a82db0b14aa37e81@syzkaller.appspotmail.com loop7: rw=12288, want=8200, limit=20 netlink: 65342 bytes leftover after parsing attributes in process `syz-executor4'. openvswitch: netlink: Message has 8 unknown bytes. 
kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN CPU: 1 PID: 7615 Comm: syz-executor7 Not tainted 4.18.0-rc7-next-20180801+ #29 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__read_once_size include/linux/compiler.h:188 [inline] RIP: 0010:compound_head include/linux/page-flags.h:142 [inline] RIP: 0010:PageLocked include/linux/page-flags.h:272 [inline] RIP: 0010:f2fs_put_page fs/f2fs/f2fs.h:2011 [inline] RIP: 0010:validate_checkpoint+0x66d/0xec0 fs/f2fs/checkpoint.c:835 Code: e8 58 05 7f fe 4c 8d 6b 80 4d 8d 74 24 08 48 b8 00 00 00 00 00 fc ff df 4c 89 ea 48 c1 ea 03 c6 04 02 00 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 f4 06 00 00 4c 89 ea 4d 8b 7c 24 08 48 b8 00 00 RSP: 0018:ffff8801937cebe8 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff8801937cef30 RCX: ffffc90006035000 RDX: 0000000000000000 RSI: ffffffff82fd9658 RDI: 0000000000000005 RBP: ffff8801937cef58 R08: ffff8801ab254700 R09: fffff94000d9e026 R10: fffff94000d9e026 R11: ffffea0006cf0137 R12: fffffffffffffffb R13: ffff8801937ceeb0 R14: 0000000000000003 R15: ffff880193419b40 FS: 00007f36a61d5700(0000) GS:ffff8801db100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc04ff93000 CR3: 00000001d0562000 CR4: 00000000001426e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: f2fs_get_valid_checkpoint+0x436/0x1ec0 fs/f2fs/checkpoint.c:860 f2fs_fill_super+0x2d42/0x8110 fs/f2fs/super.c:2883 mount_bdev+0x314/0x3e0 fs/super.c:1344 f2fs_mount+0x3c/0x50 fs/f2fs/super.c:3133 legacy_get_tree+0x131/0x460 fs/fs_context.c:729 vfs_get_tree+0x1cb/0x5c0 fs/super.c:1743 do_new_mount fs/namespace.c:2603 [inline] do_mount+0x6f2/0x1e20 fs/namespace.c:2927 ksys_mount+0x12d/0x140 fs/namespace.c:3143 __do_sys_mount fs/namespace.c:3157 [inline] 
__se_sys_mount fs/namespace.c:3154 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3154 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x45943a Code: b8 a6 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 bd 8a fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 0f 83 9a 8a fb ff c3 66 0f 1f 84 00 00 00 00 00 RSP: 002b:00007f36a61d4a88 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 00007f36a61d4b30 RCX: 000000000045943a RDX: 00007f36a61d4ad0 RSI: 0000000020000100 RDI: 00007f36a61d4af0 RBP: 0000000020000100 R08: 00007f36a61d4b30 R09: 00007f36a61d4ad0 R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000013 R13: 0000000000000000 R14: 00000000004c8ea0 R15: 0000000000000000 Modules linked in: Dumping ftrace buffer: (ftrace buffer empty) ---[ end trace bd8550c129352286 ]--- RIP: 0010:__read_once_size include/linux/compiler.h:188 [inline] RIP: 0010:compound_head include/linux/page-flags.h:142 [inline] RIP: 0010:PageLocked include/linux/page-flags.h:272 [inline] RIP: 0010:f2fs_put_page fs/f2fs/f2fs.h:2011 [inline] RIP: 0010:validate_checkpoint+0x66d/0xec0 fs/f2fs/checkpoint.c:835 Code: e8 58 05 7f fe 4c 8d 6b 80 4d 8d 74 24 08 48 b8 00 00 00 00 00 fc ff df 4c 89 ea 48 c1 ea 03 c6 04 02 00 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 f4 06 00 00 4c 89 ea 4d 8b 7c 24 08 48 b8 00 00 RSP: 0018:ffff8801937cebe8 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff8801937cef30 RCX: ffffc90006035000 RDX: 0000000000000000 RSI: ffffffff82fd9658 RDI: 0000000000000005 netlink: 65342 bytes leftover after parsing attributes in process `syz-executor4'. RBP: ffff8801937cef58 R08: ffff8801ab254700 R09: fffff94000d9e026 openvswitch: netlink: Message has 8 unknown bytes. 
R10: fffff94000d9e026 R11: ffffea0006cf0137 R12: fffffffffffffffb R13: ffff8801937ceeb0 R14: 0000000000000003 R15: ffff880193419b40 FS: 00007f36a61d5700(0000) GS:ffff8801db100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc04ff93000 CR3: 00000001d0562000 CR4: 00000000001426e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 In validate_checkpoint(), if the call to get_checkpoint_version() fails, we pass the invalid page pointer it returned into f2fs_put_page(), causing an invalid memory access. This patch handles the error path correctly to fix this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 551c1d1984ec..31561026ac9a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -780,6 +780,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, crc_offset = le32_to_cpu((*cp_block)->checksum_offset); if (crc_offset > (blk_size - sizeof(__le32))) { + f2fs_put_page(*cp_page, 1); f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc_offset: %zu", crc_offset); return -EINVAL; @@ -787,6 +788,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, crc = cur_cp_crc(*cp_block); if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { + f2fs_put_page(*cp_page, 1); f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); return -EINVAL; } @@ -806,14 +808,14 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_1, version); if (err) - goto invalid_cp1; + return NULL; if (le32_to_cpu(cp_block->cp_pack_total_block_count) > sbi->blocks_per_seg) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid cp_pack_total_block_count:%u",
le32_to_cpu(cp_block->cp_pack_total_block_count)); - goto invalid_cp1; + goto invalid_cp; } pre_version = *version; @@ -821,7 +823,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_2, version); if (err) - goto invalid_cp2; + goto invalid_cp; cur_version = *version; if (cur_version == pre_version) { @@ -829,9 +831,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, f2fs_put_page(cp_page_2, 1); return cp_page_1; } -invalid_cp2: f2fs_put_page(cp_page_2, 1); -invalid_cp1: +invalid_cp: f2fs_put_page(cp_page_1, 1); return NULL; } -- cgit v1.2.3-59-g8ed1b From 3093336481565248ce47557445977eeac21bfca8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Jul 2018 18:37:58 +0800 Subject: f2fs: fix to reset i_gc_failures correctly Let's reset i_gc_failures to zero when we unset pinned state for file. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a90b4f24aa28..67c9c2d4e2d9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2810,7 +2810,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); - F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = 1; + f2fs_i_gc_failures_write(inode, 0); goto done; } -- cgit v1.2.3-59-g8ed1b From bcbfbd604dcba45246dd307c8317f216ada5567d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Jun 2018 00:19:25 +0800 Subject: f2fs: fix to do sanity check with inline flags https://bugzilla.kernel.org/show_bug.cgi?id=200221 - Overview BUG() in clear_inode() when mounting and un-mounting a corrupted f2fs image - Reproduce - Kernel message [ 538.601448] F2FS-fs (loop0): Invalid segment/section count (31, 24 x 1376257) [ 538.601458] F2FS-fs (loop0): Can't find valid F2FS filesystem in 2th superblock [ 538.724091] F2FS-fs (loop0): Try to 
recover 2th superblock, ret: 0 [ 538.724102] F2FS-fs (loop0): Mounted with checkpoint version = 2 [ 540.970834] ------------[ cut here ]------------ [ 540.970838] kernel BUG at fs/inode.c:512! [ 540.971750] invalid opcode: 0000 [#1] SMP KASAN PTI [ 540.972755] CPU: 1 PID: 1305 Comm: umount Not tainted 4.18.0-rc1+ #4 [ 540.974034] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 540.982913] RIP: 0010:clear_inode+0xc0/0xd0 [ 540.983774] Code: 8d a3 30 01 00 00 4c 89 e7 e8 1c ec f8 ff 48 8b 83 30 01 00 00 49 39 c4 75 1a 48 c7 83 a0 00 00 00 60 00 00 00 5b 41 5c 5d c3 <0f> 0b 0f 0b 0f 0b 0f 0b 0f 0b 0f 0b 0f 1f 40 00 66 66 66 66 90 55 [ 540.987570] RSP: 0018:ffff8801e34a7b70 EFLAGS: 00010002 [ 540.988636] RAX: 0000000000000000 RBX: ffff8801e9b744e8 RCX: ffffffffb840eb3a [ 540.990063] RDX: dffffc0000000000 RSI: 0000000000000004 RDI: ffff8801e9b746b8 [ 540.991499] RBP: ffff8801e34a7b80 R08: ffffed003d36e8ce R09: ffffed003d36e8ce [ 540.992923] R10: 0000000000000001 R11: ffffed003d36e8cd R12: ffff8801e9b74668 [ 540.994360] R13: ffff8801e9b74760 R14: ffff8801e9b74528 R15: ffff8801e9b74530 [ 540.995786] FS: 00007f4662bdf840(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 540.997403] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 540.998571] CR2: 000000000175c568 CR3: 00000001dcfe6000 CR4: 00000000000006e0 [ 541.000015] Call Trace: [ 541.000554] f2fs_evict_inode+0x253/0x630 [ 541.001381] evict+0x16f/0x290 [ 541.002015] iput+0x280/0x300 [ 541.002654] dentry_unlink_inode+0x165/0x1e0 [ 541.003528] __dentry_kill+0x16a/0x260 [ 541.004300] dentry_kill+0x70/0x250 [ 541.005018] dput+0x154/0x1d0 [ 541.005635] do_one_tree+0x34/0x40 [ 541.006354] shrink_dcache_for_umount+0x3f/0xa0 [ 541.007285] generic_shutdown_super+0x43/0x1c0 [ 541.008192] kill_block_super+0x52/0x80 [ 541.008978] kill_f2fs_super+0x62/0x70 [ 541.009750] deactivate_locked_super+0x6f/0xa0 [ 541.010664] deactivate_super+0x5e/0x80 [ 541.011450] 
cleanup_mnt+0x61/0xa0 [ 541.012151] __cleanup_mnt+0x12/0x20 [ 541.012893] task_work_run+0xc8/0xf0 [ 541.013635] exit_to_usermode_loop+0x125/0x130 [ 541.014555] do_syscall_64+0x138/0x170 [ 541.015340] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 541.016375] RIP: 0033:0x7f46624bf487 [ 541.017104] Code: 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 31 f6 e9 09 00 00 00 66 0f 1f 84 00 00 00 00 00 b8 a6 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e1 c9 2b 00 f7 d8 64 89 01 48 [ 541.020923] RSP: 002b:00007fff5e12e9a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 541.022452] RAX: 0000000000000000 RBX: 0000000001753030 RCX: 00007f46624bf487 [ 541.023885] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000000000175a1e0 [ 541.025318] RBP: 000000000175a1e0 R08: 0000000000000000 R09: 0000000000000014 [ 541.026755] R10: 00000000000006b2 R11: 0000000000000246 R12: 00007f46629c883c [ 541.028186] R13: 0000000000000000 R14: 0000000001753210 R15: 00007fff5e12ec30 [ 541.029626] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy [ 541.039445] ---[ end trace 4ce02f25ff7d3df5 ]--- [ 541.040392] RIP: 0010:clear_inode+0xc0/0xd0 [ 541.041240] Code: 8d a3 30 01 00 00 4c 89 e7 e8 1c ec f8 ff 48 8b 83 30 01 00 00 49 39 c4 75 1a 48 c7 83 a0 00 00 00 60 00 00 00 5b 41 5c 5d c3 <0f> 0b 0f 0b 0f 0b 0f 0b 0f 0b 0f 0b 0f 1f 40 00 66 66 66 66 90 55 [ 541.045042] RSP: 0018:ffff8801e34a7b70 EFLAGS: 00010002 [ 541.046099] RAX: 0000000000000000 RBX: ffff8801e9b744e8 RCX: ffffffffb840eb3a [ 541.047537] RDX: 
dffffc0000000000 RSI: 0000000000000004 RDI: ffff8801e9b746b8 [ 541.048965] RBP: ffff8801e34a7b80 R08: ffffed003d36e8ce R09: ffffed003d36e8ce [ 541.050402] R10: 0000000000000001 R11: ffffed003d36e8cd R12: ffff8801e9b74668 [ 541.051832] R13: ffff8801e9b74760 R14: ffff8801e9b74528 R15: ffff8801e9b74530 [ 541.053263] FS: 00007f4662bdf840(0000) GS:ffff8801f6f00000(0000) knlGS:0000000000000000 [ 541.054891] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 541.056039] CR2: 000000000175c568 CR3: 00000001dcfe6000 CR4: 00000000000006e0 [ 541.058506] ================================================================== [ 541.059991] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0 [ 541.061513] Read of size 8 at addr ffff8801e34a7970 by task umount/1305 [ 541.063302] CPU: 1 PID: 1305 Comm: umount Tainted: G D 4.18.0-rc1+ #4 [ 541.064838] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 541.066778] Call Trace: [ 541.067294] dump_stack+0x7b/0xb5 [ 541.067986] print_address_description+0x70/0x290 [ 541.068941] kasan_report+0x291/0x390 [ 541.069692] ? update_stack_state+0x38c/0x3e0 [ 541.070598] __asan_load8+0x54/0x90 [ 541.071315] update_stack_state+0x38c/0x3e0 [ 541.072172] ? __read_once_size_nocheck.constprop.7+0x20/0x20 [ 541.073340] ? vprintk_func+0x27/0x60 [ 541.074096] ? printk+0xa3/0xd3 [ 541.074762] ? __save_stack_trace+0x5e/0x100 [ 541.075634] unwind_next_frame.part.5+0x18e/0x490 [ 541.076594] ? unwind_dump+0x290/0x290 [ 541.077368] ? __show_regs+0x2c4/0x330 [ 541.078142] __unwind_start+0x106/0x190 [ 541.085422] __save_stack_trace+0x5e/0x100 [ 541.086268] ? __save_stack_trace+0x5e/0x100 [ 541.087161] ? unlink_anon_vmas+0xba/0x2c0 [ 541.087997] save_stack_trace+0x1f/0x30 [ 541.088782] save_stack+0x46/0xd0 [ 541.089475] ? __alloc_pages_slowpath+0x1420/0x1420 [ 541.090477] ? flush_tlb_mm_range+0x15e/0x220 [ 541.091364] ? __dec_node_state+0x24/0xb0 [ 541.092180] ? lock_page_memcg+0x85/0xf0 [ 541.092979] ? 
unlock_page_memcg+0x16/0x80 [ 541.093812] ? page_remove_rmap+0x198/0x520 [ 541.094674] ? mark_page_accessed+0x133/0x200 [ 541.095559] ? _cond_resched+0x1a/0x50 [ 541.096326] ? unmap_page_range+0xcd4/0xe50 [ 541.097179] ? rb_next+0x58/0x80 [ 541.097845] ? rb_next+0x58/0x80 [ 541.098518] __kasan_slab_free+0x13c/0x1a0 [ 541.099352] ? unlink_anon_vmas+0xba/0x2c0 [ 541.100184] kasan_slab_free+0xe/0x10 [ 541.100934] kmem_cache_free+0x89/0x1e0 [ 541.101724] unlink_anon_vmas+0xba/0x2c0 [ 541.102534] free_pgtables+0x101/0x1b0 [ 541.103299] exit_mmap+0x146/0x2a0 [ 541.103996] ? __ia32_sys_munmap+0x50/0x50 [ 541.104829] ? kasan_check_read+0x11/0x20 [ 541.105649] ? mm_update_next_owner+0x322/0x380 [ 541.106578] mmput+0x8b/0x1d0 [ 541.107191] do_exit+0x43a/0x1390 [ 541.107876] ? mm_update_next_owner+0x380/0x380 [ 541.108791] ? deactivate_super+0x5e/0x80 [ 541.109610] ? cleanup_mnt+0x61/0xa0 [ 541.110351] ? __cleanup_mnt+0x12/0x20 [ 541.111115] ? task_work_run+0xc8/0xf0 [ 541.111879] ? exit_to_usermode_loop+0x125/0x130 [ 541.112817] rewind_stack_do_exit+0x17/0x20 [ 541.113666] RIP: 0033:0x7f46624bf487 [ 541.114404] Code: Bad RIP value. 
[ 541.115094] RSP: 002b:00007fff5e12e9a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 541.116605] RAX: 0000000000000000 RBX: 0000000001753030 RCX: 00007f46624bf487 [ 541.118034] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000000000175a1e0 [ 541.119472] RBP: 000000000175a1e0 R08: 0000000000000000 R09: 0000000000000014 [ 541.120890] R10: 00000000000006b2 R11: 0000000000000246 R12: 00007f46629c883c [ 541.122321] R13: 0000000000000000 R14: 0000000001753210 R15: 00007fff5e12ec30 [ 541.124061] The buggy address belongs to the page: [ 541.125042] page:ffffea00078d29c0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 541.126651] flags: 0x2ffff0000000000() [ 541.127418] raw: 02ffff0000000000 dead000000000100 dead000000000200 0000000000000000 [ 541.128963] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 541.130516] page dumped because: kasan: bad access detected [ 541.131954] Memory state around the buggy address: [ 541.132924] ffff8801e34a7800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00 [ 541.134378] ffff8801e34a7880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 541.135814] >ffff8801e34a7900: 00 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 [ 541.137253] ^ [ 541.138637] ffff8801e34a7980: f1 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 541.140075] ffff8801e34a7a00: 00 00 00 00 00 00 00 00 f3 00 00 00 00 00 00 00 [ 541.141509] ================================================================== - Location https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/inode.c#L512 BUG_ON(inode->i_data.nrpages); The root cause is that the root directory inode is corrupted: it has both the inline_data and inline_dentry flags set, and its nlink is zero. So in ->evict(), after dropping all page cache, it grabs page #0 for inline data truncation, which results in a panic later in clear_inode(), where we check the inode->i_data.nrpages value. This patch adds an inline flags check in sanity_check_inode(); in addition, it does a sanity check on the root inode's nlink.
Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 20 ++++++++++++++++++++ fs/f2fs/super.c | 3 ++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 35d49528b2c1..aa343a5cab44 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -265,6 +265,26 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } } + + if (f2fs_has_inline_data(inode) && + (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx, mode=%u) should not have " + "inline_data, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + + if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx, mode=%u) should not have " + "inline_dentry, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + return true; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b0e2b017f390..bd57be470e23 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2963,7 +2963,8 @@ try_onemore: err = PTR_ERR(root); goto free_stats; } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + if (!S_ISDIR(root->i_mode) || !root->i_blocks || + !root->i_size || !root->i_nlink) { iput(root); err = -EINVAL; goto free_stats; -- cgit v1.2.3-59-g8ed1b From 91291e9998d208370eb8156c760691b873bd7522 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 10 Jul 2018 23:01:45 +0800 Subject: f2fs: fix to do sanity check with block address in main area v2 This patch adds f2fs_is_valid_blkaddr() in the functions below to do sanity check with block address to avoid potential panic: - f2fs_grab_read_bio() - __written_first_block() https://bugzilla.kernel.org/show_bug.cgi?id=200465 - Reproduce - POC (poc.c) #define _GNU_SOURCE #include #include #include
#include #include #include #include #include #include #include #include #include #include #include #include static void activity(char *mpoint) { char *xattr; int err; err = asprintf(&xattr, "%s/foo/bar/xattr", mpoint); char buf2[113]; memset(buf2, 0, sizeof(buf2)); listxattr(xattr, buf2, sizeof(buf2)); } int main(int argc, char *argv[]) { activity(argv[1]); return 0; } - kernel message [ 844.718738] F2FS-fs (loop0): Mounted with checkpoint version = 2 [ 846.430929] F2FS-fs (loop0): access invalid blkaddr:1024 [ 846.431058] WARNING: CPU: 1 PID: 1249 at fs/f2fs/checkpoint.c:154 f2fs_is_valid_blkaddr+0x10f/0x160 [ 846.431059] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer snd input_leds joydev soundcore serio_raw i2c_piix4 mac_hid ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 multipath linear qxl ttm crct10dif_pclmul crc32_pclmul drm_kms_helper ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops pcbc drm 8139too aesni_intel 8139cp floppy psmouse mii aes_x86_64 crypto_simd pata_acpi cryptd glue_helper [ 846.431310] CPU: 1 PID: 1249 Comm: a.out Not tainted 4.18.0-rc3+ #1 [ 846.431312] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 846.431315] RIP: 0010:f2fs_is_valid_blkaddr+0x10f/0x160 [ 846.431316] Code: 00 eb ed 31 c0 83 fa 05 75 ae 48 83 ec 08 48 8b 3f 89 f1 48 c7 c2 fc 0b 0f 8b 48 c7 c6 8b d7 09 8b 88 44 24 07 e8 61 8b ff ff <0f> 0b 0f b6 44 24 07 48 83 c4 08 eb 81 4c 8b 47 10 8b 8f 38 04 00 [ 846.431347] RSP: 0018:ffff961c414a7bc0 EFLAGS: 00010282 [ 846.431349] RAX: 0000000000000000 RBX: ffffc5f787b8ea80 RCX: 0000000000000000 [ 846.431350] RDX: 0000000000000000 RSI: ffff89dfffd165d8 RDI: ffff89dfffd165d8 [ 846.431351] RBP: ffff961c414a7c20 R08: 0000000000000001 R09: 
0000000000000248 [ 846.431353] R10: 0000000000000000 R11: 0000000000000248 R12: 0000000000000007 [ 846.431369] R13: ffff89dff5492800 R14: ffff89dfae3aa000 R15: ffff89dff4ff88d0 [ 846.431372] FS: 00007f882e2fb700(0000) GS:ffff89dfffd00000(0000) knlGS:0000000000000000 [ 846.431373] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.431374] CR2: 0000000001a88008 CR3: 00000001eb572000 CR4: 00000000000006e0 [ 846.431384] Call Trace: [ 846.431426] f2fs_iget+0x6f4/0xe70 [ 846.431430] ? f2fs_find_entry+0x71/0x90 [ 846.431432] f2fs_lookup+0x1aa/0x390 [ 846.431452] __lookup_slow+0x97/0x150 [ 846.431459] lookup_slow+0x35/0x50 [ 846.431462] walk_component+0x1c6/0x470 [ 846.431479] ? memcg_kmem_charge_memcg+0x70/0x90 [ 846.431488] ? page_add_file_rmap+0x13/0x200 [ 846.431491] path_lookupat+0x76/0x230 [ 846.431501] ? __alloc_pages_nodemask+0xfc/0x280 [ 846.431504] filename_lookup+0xb8/0x1a0 [ 846.431534] ? _cond_resched+0x16/0x40 [ 846.431541] ? kmem_cache_alloc+0x160/0x1d0 [ 846.431549] ? path_listxattr+0x41/0xa0 [ 846.431551] path_listxattr+0x41/0xa0 [ 846.431570] do_syscall_64+0x55/0x100 [ 846.431583] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 846.431607] RIP: 0033:0x7f882de1c0d7 [ 846.431607] Code: f0 ff ff 73 01 c3 48 8b 0d be dd 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 c2 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 91 dd 2b 00 f7 d8 64 89 01 48 [ 846.431639] RSP: 002b:00007ffe8e66c238 EFLAGS: 00000202 ORIG_RAX: 00000000000000c2 [ 846.431641] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f882de1c0d7 [ 846.431642] RDX: 0000000000000071 RSI: 00007ffe8e66c280 RDI: 0000000001a880c0 [ 846.431643] RBP: 00007ffe8e66c300 R08: 0000000001a88010 R09: 0000000000000000 [ 846.431645] R10: 00000000000001ab R11: 0000000000000202 R12: 0000000000400550 [ 846.431646] R13: 00007ffe8e66c400 R14: 0000000000000000 R15: 0000000000000000 [ 846.431648] ---[ end trace abca54df39d14f5c ]--- [ 846.431651] F2FS-fs (loop0): invalid blkaddr: 1024, 
type: 5, run fsck to fix. [ 846.431762] WARNING: CPU: 1 PID: 1249 at fs/f2fs/f2fs.h:2697 f2fs_iget+0xd17/0xe70 [ 846.431763] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer snd input_leds joydev soundcore serio_raw i2c_piix4 mac_hid ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 multipath linear qxl ttm crct10dif_pclmul crc32_pclmul drm_kms_helper ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops pcbc drm 8139too aesni_intel 8139cp floppy psmouse mii aes_x86_64 crypto_simd pata_acpi cryptd glue_helper [ 846.431797] CPU: 1 PID: 1249 Comm: a.out Tainted: G W 4.18.0-rc3+ #1 [ 846.431798] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 846.431800] RIP: 0010:f2fs_iget+0xd17/0xe70 [ 846.431801] Code: ff ff 48 63 d8 e9 e1 f6 ff ff 48 8b 45 c8 41 b8 05 00 00 00 48 c7 c2 d8 e8 0e 8b 48 c7 c6 1d b0 0a 8b 48 8b 38 e8 f9 b4 00 00 <0f> 0b 48 8b 45 c8 f0 80 48 48 04 e9 d8 f9 ff ff 0f 0b 48 8b 43 18 [ 846.431832] RSP: 0018:ffff961c414a7bd0 EFLAGS: 00010282 [ 846.431834] RAX: 0000000000000000 RBX: ffffc5f787b8ea80 RCX: 0000000000000006 [ 846.431835] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffff89dfffd165d0 [ 846.431836] RBP: ffff961c414a7c20 R08: 0000000000000000 R09: 0000000000000273 [ 846.431837] R10: 0000000000000000 R11: ffff89dfad50ca60 R12: 0000000000000007 [ 846.431838] R13: ffff89dff5492800 R14: ffff89dfae3aa000 R15: ffff89dff4ff88d0 [ 846.431840] FS: 00007f882e2fb700(0000) GS:ffff89dfffd00000(0000) knlGS:0000000000000000 [ 846.431841] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.431842] CR2: 0000000001a88008 CR3: 00000001eb572000 CR4: 00000000000006e0 [ 846.431846] Call Trace: [ 846.431850] ? 
f2fs_find_entry+0x71/0x90 [ 846.431853] f2fs_lookup+0x1aa/0x390 [ 846.431856] __lookup_slow+0x97/0x150 [ 846.431858] lookup_slow+0x35/0x50 [ 846.431874] walk_component+0x1c6/0x470 [ 846.431878] ? memcg_kmem_charge_memcg+0x70/0x90 [ 846.431880] ? page_add_file_rmap+0x13/0x200 [ 846.431882] path_lookupat+0x76/0x230 [ 846.431884] ? __alloc_pages_nodemask+0xfc/0x280 [ 846.431886] filename_lookup+0xb8/0x1a0 [ 846.431890] ? _cond_resched+0x16/0x40 [ 846.431891] ? kmem_cache_alloc+0x160/0x1d0 [ 846.431894] ? path_listxattr+0x41/0xa0 [ 846.431896] path_listxattr+0x41/0xa0 [ 846.431898] do_syscall_64+0x55/0x100 [ 846.431901] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 846.431902] RIP: 0033:0x7f882de1c0d7 [ 846.431903] Code: f0 ff ff 73 01 c3 48 8b 0d be dd 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 c2 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 91 dd 2b 00 f7 d8 64 89 01 48 [ 846.431934] RSP: 002b:00007ffe8e66c238 EFLAGS: 00000202 ORIG_RAX: 00000000000000c2 [ 846.431936] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f882de1c0d7 [ 846.431937] RDX: 0000000000000071 RSI: 00007ffe8e66c280 RDI: 0000000001a880c0 [ 846.431939] RBP: 00007ffe8e66c300 R08: 0000000001a88010 R09: 0000000000000000 [ 846.431940] R10: 00000000000001ab R11: 0000000000000202 R12: 0000000000400550 [ 846.431941] R13: 00007ffe8e66c400 R14: 0000000000000000 R15: 0000000000000000 [ 846.431943] ---[ end trace abca54df39d14f5d ]--- [ 846.432033] F2FS-fs (loop0): access invalid blkaddr:1024 [ 846.432051] WARNING: CPU: 1 PID: 1249 at fs/f2fs/checkpoint.c:154 f2fs_is_valid_blkaddr+0x10f/0x160 [ 846.432051] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer snd input_leds joydev soundcore serio_raw i2c_piix4 mac_hid ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx 
raid6_pq raid1 raid0 multipath linear qxl ttm crct10dif_pclmul crc32_pclmul drm_kms_helper ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops pcbc drm 8139too aesni_intel 8139cp floppy psmouse mii aes_x86_64 crypto_simd pata_acpi cryptd glue_helper [ 846.432085] CPU: 1 PID: 1249 Comm: a.out Tainted: G W 4.18.0-rc3+ #1 [ 846.432086] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 846.432089] RIP: 0010:f2fs_is_valid_blkaddr+0x10f/0x160 [ 846.432089] Code: 00 eb ed 31 c0 83 fa 05 75 ae 48 83 ec 08 48 8b 3f 89 f1 48 c7 c2 fc 0b 0f 8b 48 c7 c6 8b d7 09 8b 88 44 24 07 e8 61 8b ff ff <0f> 0b 0f b6 44 24 07 48 83 c4 08 eb 81 4c 8b 47 10 8b 8f 38 04 00 [ 846.432120] RSP: 0018:ffff961c414a7900 EFLAGS: 00010286 [ 846.432122] RAX: 0000000000000000 RBX: 0000000000000400 RCX: 0000000000000006 [ 846.432123] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffff89dfffd165d0 [ 846.432124] RBP: ffff89dff5492800 R08: 0000000000000001 R09: 000000000000029d [ 846.432125] R10: ffff961c414a7820 R11: 000000000000029d R12: 0000000000000400 [ 846.432126] R13: 0000000000000000 R14: ffff89dff4ff88d0 R15: 0000000000000000 [ 846.432128] FS: 00007f882e2fb700(0000) GS:ffff89dfffd00000(0000) knlGS:0000000000000000 [ 846.432130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.432131] CR2: 0000000001a88008 CR3: 00000001eb572000 CR4: 00000000000006e0 [ 846.432135] Call Trace: [ 846.432151] f2fs_wait_on_block_writeback+0x20/0x110 [ 846.432158] f2fs_grab_read_bio+0xbc/0xe0 [ 846.432161] f2fs_submit_page_read+0x21/0x280 [ 846.432163] f2fs_get_read_data_page+0xb7/0x3c0 [ 846.432165] f2fs_get_lock_data_page+0x29/0x1e0 [ 846.432167] f2fs_get_new_data_page+0x148/0x550 [ 846.432170] f2fs_add_regular_entry+0x1d2/0x550 [ 846.432178] ? 
__switch_to+0x12f/0x460 [ 846.432181] f2fs_add_dentry+0x6a/0xd0 [ 846.432184] f2fs_do_add_link+0xe9/0x140 [ 846.432186] __recover_dot_dentries+0x260/0x280 [ 846.432189] f2fs_lookup+0x343/0x390 [ 846.432193] __lookup_slow+0x97/0x150 [ 846.432195] lookup_slow+0x35/0x50 [ 846.432208] walk_component+0x1c6/0x470 [ 846.432212] ? memcg_kmem_charge_memcg+0x70/0x90 [ 846.432215] ? page_add_file_rmap+0x13/0x200 [ 846.432217] path_lookupat+0x76/0x230 [ 846.432219] ? __alloc_pages_nodemask+0xfc/0x280 [ 846.432221] filename_lookup+0xb8/0x1a0 [ 846.432224] ? _cond_resched+0x16/0x40 [ 846.432226] ? kmem_cache_alloc+0x160/0x1d0 [ 846.432228] ? path_listxattr+0x41/0xa0 [ 846.432230] path_listxattr+0x41/0xa0 [ 846.432233] do_syscall_64+0x55/0x100 [ 846.432235] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 846.432237] RIP: 0033:0x7f882de1c0d7 [ 846.432237] Code: f0 ff ff 73 01 c3 48 8b 0d be dd 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 c2 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 91 dd 2b 00 f7 d8 64 89 01 48 [ 846.432269] RSP: 002b:00007ffe8e66c238 EFLAGS: 00000202 ORIG_RAX: 00000000000000c2 [ 846.432271] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f882de1c0d7 [ 846.432272] RDX: 0000000000000071 RSI: 00007ffe8e66c280 RDI: 0000000001a880c0 [ 846.432273] RBP: 00007ffe8e66c300 R08: 0000000001a88010 R09: 0000000000000000 [ 846.432274] R10: 00000000000001ab R11: 0000000000000202 R12: 0000000000400550 [ 846.432275] R13: 00007ffe8e66c400 R14: 0000000000000000 R15: 0000000000000000 [ 846.432277] ---[ end trace abca54df39d14f5e ]--- [ 846.432279] F2FS-fs (loop0): invalid blkaddr: 1024, type: 5, run fsck to fix. 
[ 846.432376] WARNING: CPU: 1 PID: 1249 at fs/f2fs/f2fs.h:2697 f2fs_wait_on_block_writeback+0xb1/0x110 [ 846.432376] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer snd input_leds joydev soundcore serio_raw i2c_piix4 mac_hid ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 multipath linear qxl ttm crct10dif_pclmul crc32_pclmul drm_kms_helper ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops pcbc drm 8139too aesni_intel 8139cp floppy psmouse mii aes_x86_64 crypto_simd pata_acpi cryptd glue_helper [ 846.432410] CPU: 1 PID: 1249 Comm: a.out Tainted: G W 4.18.0-rc3+ #1 [ 846.432411] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 846.432413] RIP: 0010:f2fs_wait_on_block_writeback+0xb1/0x110 [ 846.432414] Code: 66 90 f0 ff 4b 34 74 59 5b 5d c3 48 8b 7d 00 41 b8 05 00 00 00 89 d9 48 c7 c2 d8 e8 0e 8b 48 c7 c6 1d b0 0a 8b e8 df bc fd ff <0f> 0b f0 80 4d 48 04 e9 67 ff ff ff 48 8b 03 48 c1 e8 37 83 e0 07 [ 846.432445] RSP: 0018:ffff961c414a7910 EFLAGS: 00010286 [ 846.432447] RAX: 0000000000000000 RBX: 0000000000000400 RCX: 0000000000000006 [ 846.432448] RDX: 0000000000000000 RSI: 0000000000000092 RDI: ffff89dfffd165d0 [ 846.432449] RBP: ffff89dff5492800 R08: 0000000000000000 R09: 00000000000002d1 [ 846.432450] R10: ffff961c414a7820 R11: ffff89dfad50cf80 R12: 0000000000000400 [ 846.432451] R13: 0000000000000000 R14: ffff89dff4ff88d0 R15: 0000000000000000 [ 846.432453] FS: 00007f882e2fb700(0000) GS:ffff89dfffd00000(0000) knlGS:0000000000000000 [ 846.432454] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.432455] CR2: 0000000001a88008 CR3: 00000001eb572000 CR4: 00000000000006e0 [ 846.432459] Call Trace: [ 846.432463] f2fs_grab_read_bio+0xbc/0xe0 [ 846.432464] 
f2fs_submit_page_read+0x21/0x280 [ 846.432466] f2fs_get_read_data_page+0xb7/0x3c0 [ 846.432468] f2fs_get_lock_data_page+0x29/0x1e0 [ 846.432470] f2fs_get_new_data_page+0x148/0x550 [ 846.432473] f2fs_add_regular_entry+0x1d2/0x550 [ 846.432475] ? __switch_to+0x12f/0x460 [ 846.432477] f2fs_add_dentry+0x6a/0xd0 [ 846.432480] f2fs_do_add_link+0xe9/0x140 [ 846.432483] __recover_dot_dentries+0x260/0x280 [ 846.432485] f2fs_lookup+0x343/0x390 [ 846.432488] __lookup_slow+0x97/0x150 [ 846.432490] lookup_slow+0x35/0x50 [ 846.432505] walk_component+0x1c6/0x470 [ 846.432509] ? memcg_kmem_charge_memcg+0x70/0x90 [ 846.432511] ? page_add_file_rmap+0x13/0x200 [ 846.432513] path_lookupat+0x76/0x230 [ 846.432515] ? __alloc_pages_nodemask+0xfc/0x280 [ 846.432517] filename_lookup+0xb8/0x1a0 [ 846.432520] ? _cond_resched+0x16/0x40 [ 846.432522] ? kmem_cache_alloc+0x160/0x1d0 [ 846.432525] ? path_listxattr+0x41/0xa0 [ 846.432526] path_listxattr+0x41/0xa0 [ 846.432529] do_syscall_64+0x55/0x100 [ 846.432531] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 846.432533] RIP: 0033:0x7f882de1c0d7 [ 846.432533] Code: f0 ff ff 73 01 c3 48 8b 0d be dd 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 c2 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 91 dd 2b 00 f7 d8 64 89 01 48 [ 846.432565] RSP: 002b:00007ffe8e66c238 EFLAGS: 00000202 ORIG_RAX: 00000000000000c2 [ 846.432567] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f882de1c0d7 [ 846.432568] RDX: 0000000000000071 RSI: 00007ffe8e66c280 RDI: 0000000001a880c0 [ 846.432569] RBP: 00007ffe8e66c300 R08: 0000000001a88010 R09: 0000000000000000 [ 846.432570] R10: 00000000000001ab R11: 0000000000000202 R12: 0000000000400550 [ 846.432571] R13: 00007ffe8e66c400 R14: 0000000000000000 R15: 0000000000000000 [ 846.432573] ---[ end trace abca54df39d14f5f ]--- [ 846.434280] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 [ 846.434424] PGD 80000001ebd3a067 P4D 80000001ebd3a067 PUD 1eb1ae067 PMD 0 [ 
846.434551] Oops: 0000 [#1] SMP PTI [ 846.434697] CPU: 0 PID: 44 Comm: kworker/u5:0 Tainted: G W 4.18.0-rc3+ #1 [ 846.434805] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 846.435000] Workqueue: fscrypt_read_queue decrypt_work [ 846.435174] RIP: 0010:fscrypt_do_page_crypto+0x6e/0x2d0 [ 846.435351] Code: 00 65 48 8b 04 25 28 00 00 00 48 89 84 24 88 00 00 00 31 c0 e8 43 c2 e0 ff 49 8b 86 48 02 00 00 85 ed c7 44 24 70 00 00 00 00 <48> 8b 58 08 0f 84 14 02 00 00 48 8b 78 10 48 8b 0c 24 48 c7 84 24 [ 846.435696] RSP: 0018:ffff961c40f9bd60 EFLAGS: 00010206 [ 846.435870] RAX: 0000000000000000 RBX: ffffc5f787719b80 RCX: ffffc5f787719b80 [ 846.436051] RDX: ffffffff8b9f4b88 RSI: ffffffff8b0ae622 RDI: ffff961c40f9bdb8 [ 846.436261] RBP: 0000000000001000 R08: ffffc5f787719b80 R09: 0000000000001000 [ 846.436433] R10: 0000000000000018 R11: fefefefefefefeff R12: ffffc5f787719b80 [ 846.436562] R13: ffffc5f787719b80 R14: ffff89dff4ff88d0 R15: 0ffff89dfaddee60 [ 846.436658] FS: 0000000000000000(0000) GS:ffff89dfffc00000(0000) knlGS:0000000000000000 [ 846.436758] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.436898] CR2: 0000000000000008 CR3: 00000001eddd0000 CR4: 00000000000006f0 [ 846.437001] Call Trace: [ 846.437181] ? check_preempt_wakeup+0xf2/0x230 [ 846.437276] ? check_preempt_curr+0x7c/0x90 [ 846.437370] fscrypt_decrypt_page+0x48/0x4d [ 846.437466] __fscrypt_decrypt_bio+0x5b/0x90 [ 846.437542] decrypt_work+0x12/0x20 [ 846.437651] process_one_work+0x15e/0x3d0 [ 846.437740] worker_thread+0x4c/0x440 [ 846.437848] kthread+0xf8/0x130 [ 846.437938] ? rescuer_thread+0x350/0x350 [ 846.438022] ? 
kthread_associate_blkcg+0x90/0x90 [ 846.438117] ret_from_fork+0x35/0x40 [ 846.438201] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_timer snd input_leds joydev soundcore serio_raw i2c_piix4 mac_hid ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 multipath linear qxl ttm crct10dif_pclmul crc32_pclmul drm_kms_helper ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops pcbc drm 8139too aesni_intel 8139cp floppy psmouse mii aes_x86_64 crypto_simd pata_acpi cryptd glue_helper [ 846.438653] CR2: 0000000000000008 [ 846.438713] ---[ end trace abca54df39d14f60 ]--- [ 846.438796] RIP: 0010:fscrypt_do_page_crypto+0x6e/0x2d0 [ 846.438844] Code: 00 65 48 8b 04 25 28 00 00 00 48 89 84 24 88 00 00 00 31 c0 e8 43 c2 e0 ff 49 8b 86 48 02 00 00 85 ed c7 44 24 70 00 00 00 00 <48> 8b 58 08 0f 84 14 02 00 00 48 8b 78 10 48 8b 0c 24 48 c7 84 24 [ 846.439084] RSP: 0018:ffff961c40f9bd60 EFLAGS: 00010206 [ 846.439176] RAX: 0000000000000000 RBX: ffffc5f787719b80 RCX: ffffc5f787719b80 [ 846.440927] RDX: ffffffff8b9f4b88 RSI: ffffffff8b0ae622 RDI: ffff961c40f9bdb8 [ 846.442083] RBP: 0000000000001000 R08: ffffc5f787719b80 R09: 0000000000001000 [ 846.443284] R10: 0000000000000018 R11: fefefefefefefeff R12: ffffc5f787719b80 [ 846.444448] R13: ffffc5f787719b80 R14: ffff89dff4ff88d0 R15: 0ffff89dfaddee60 [ 846.445558] FS: 0000000000000000(0000) GS:ffff89dfffc00000(0000) knlGS:0000000000000000 [ 846.446687] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 846.447796] CR2: 0000000000000008 CR3: 00000001eddd0000 CR4: 00000000000006f0 - Location https://elixir.bootlin.com/linux/v4.18-rc4/source/fs/crypto/crypto.c#L149 struct crypto_skcipher *tfm = ci->ci_ctfm; Here ci can be NULL Note that this issue maybe require CONFIG_F2FS_FS_ENCRYPTION=y to reproduce. 
Reported-by: Wen Xu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ fs/f2fs/inode.c | 18 +++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 363520ee099a..1e6cc68fb7c4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -546,6 +546,9 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio_post_read_ctx *ctx; unsigned int post_read_steps = 0; + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) + return ERR_PTR(-EFAULT); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); if (!bio) return ERR_PTR(-ENOMEM); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index aa343a5cab44..fc2c98b9e255 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -68,14 +68,16 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) } } -static bool __written_first_block(struct f2fs_sb_info *sbi, +static int __written_first_block(struct f2fs_sb_info *sbi, struct f2fs_inode *ri) { block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); - if (is_valid_data_blkaddr(sbi, addr)) - return true; - return false; + if (!__is_valid_data_blkaddr(addr)) + return 1; + if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC)) + return -EFAULT; + return 0; } static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -295,6 +297,7 @@ static int do_read_inode(struct inode *inode) struct page *node_page; struct f2fs_inode *ri; projid_t i_projid; + int err; /* Check if ino is within scope */ if (f2fs_check_nid_range(sbi, inode->i_ino)) @@ -368,7 +371,12 @@ static int do_read_inode(struct inode *inode) /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); - if (__written_first_block(sbi, ri)) + err = __written_first_block(sbi, ri); + if (err < 0) { + f2fs_put_page(node_page, 1); + return err; + } + if (!err) set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); if 
(!f2fs_need_inode_block_update(sbi, inode->i_ino)) -- cgit v1.2.3-59-g8ed1b From c7079853c859c910b9d047a37891b4aafb8f8dd7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 5 Aug 2018 23:02:22 +0800 Subject: f2fs: avoid race between zero_range and background GC Thread A Background GC - f2fs_zero_range - truncate_pagecache_range - gc_data_segment - get_read_data_page - move_data_page - set_page_dirty - set_cold_data - f2fs_do_zero_range - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr Actually, we don't need to set dirty & checked flag on the page, since all valid data in the page should be zeroed by zero_range(). Use i_gc_rwsem[WRITE] to avoid such race condition. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 67c9c2d4e2d9..4cb02027827e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1314,8 +1314,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) goto out_sem; - truncate_pagecache_range(inode, offset, offset + len - 1); - pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1345,12 +1343,19 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, unsigned int end_offset; pgoff_t end; + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + truncate_pagecache_range(inode, + (loff_t)index << PAGE_SHIFT, + ((loff_t)pg_end << PAGE_SHIFT) - 1); + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1359,7 +1364,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); 
f2fs_balance_fs(sbi, dn.node_changed); -- cgit v1.2.3-59-g8ed1b From a33c150237a20d97a174243bc658c86502f9d370 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 5 Aug 2018 23:04:25 +0800 Subject: f2fs: fix avoid race between truncate and background GC Thread A Background GC - f2fs_setattr isize to 0 - truncate_setsize - gc_data_segment - f2fs_get_read_data_page page #0 - set_page_dirty - set_cold_data - f2fs_truncate - f2fs_setattr isize to 4k - read 4k <--- hit data in cached page #0 Above race condition can cause read out invalid data in a truncated page, fix it by i_gc_rwsem[WRITE] lock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++++ fs/f2fs/file.c | 37 +++++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 14 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1e6cc68fb7c4..45f043ee48bd 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2199,8 +2199,12 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) if (to > i_size) { down_write(&F2FS_I(inode)->i_mmap_sem); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); } } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4cb02027827e..8c4694b9af27 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -796,22 +796,26 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size <= i_size_read(inode)) { - down_write(&F2FS_I(inode)->i_mmap_sem); - truncate_setsize(inode, attr->ia_size); + bool to_smaller = (attr->ia_size <= i_size_read(inode)); + + down_write(&F2FS_I(inode)->i_mmap_sem); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + truncate_setsize(inode, attr->ia_size); + + if (to_smaller) err = f2fs_truncate(inode); - up_write(&F2FS_I(inode)->i_mmap_sem); - if (err) - return err; - } else 
{ - /* - * do not trim all blocks after i_size if target size is - * larger than i_size. - */ - down_write(&F2FS_I(inode)->i_mmap_sem); - truncate_setsize(inode, attr->ia_size); - up_write(&F2FS_I(inode)->i_mmap_sem); + /* + * do not trim all blocks after i_size if target size is + * larger than i_size. + */ + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_mmap_sem); + if (err) + return err; + + if (!to_smaller) { /* should convert inline inode here */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -958,13 +962,18 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_inode_pages_range(mapping, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); } } -- cgit v1.2.3-59-g8ed1b From 22969158083c9e5c92f66718dde1a372baa1a49d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 5 Aug 2018 23:08:59 +0800 Subject: f2fs: refresh recent accessed nat entry in lru list Introduce nat_list_lock to protect nm_i->nat_entries list, and manage it as a LRU list, refresh location for therein recent accessed entries in the list. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 46 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 4 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1647a13be7f9..d9df58163f29 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -780,6 +780,7 @@ struct f2fs_nm_info { struct radix_tree_root nat_set_root;/* root of the nat set cache */ struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ + spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ unsigned int nat_blocks; /* # of nat blocks */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 81fb2f3edb52..472dd643b074 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -174,14 +174,30 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, if (raw_ne) node_info_from_raw_nat(&ne->ni, raw_ne); + + spin_lock(&nm_i->nat_list_lock); list_add_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + nm_i->nat_cnt++; return ne; } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { - return radix_tree_lookup(&nm_i->nat_root, n); + struct nat_entry *ne; + + ne = radix_tree_lookup(&nm_i->nat_root, n); + + /* for recent accessed nat entry, move it to tail of lru list */ + if (ne && !get_nat_flag(ne, IS_DIRTY)) { + spin_lock(&nm_i->nat_list_lock); + if (!list_empty(&ne->list)) + list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + } + + return ne; } static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, @@ -192,7 +208,6 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) { - list_del(&e->list); 
radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); nm_i->nat_cnt--; __free_nat_entry(e); @@ -243,16 +258,21 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, nm_i->dirty_nat_cnt++; set_nat_flag(ne, IS_DIRTY, true); refresh_list: + spin_lock(&nm_i->nat_list_lock); if (new_ne) list_del_init(&ne->list); else list_move_tail(&ne->list, &head->entry_list); + spin_unlock(&nm_i->nat_list_lock); } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, struct nat_entry_set *set, struct nat_entry *ne) { + spin_lock(&nm_i->nat_list_lock); list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + set_nat_flag(ne, IS_DIRTY, false); set->entry_cnt--; nm_i->dirty_nat_cnt--; @@ -469,13 +489,25 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) if (!down_write_trylock(&nm_i->nat_tree_lock)) return 0; - while (nr_shrink && !list_empty(&nm_i->nat_entries)) { + spin_lock(&nm_i->nat_list_lock); + while (nr_shrink) { struct nat_entry *ne; + + if (list_empty(&nm_i->nat_entries)) + break; + ne = list_first_entry(&nm_i->nat_entries, struct nat_entry, list); + list_del(&ne->list); + spin_unlock(&nm_i->nat_list_lock); + __del_from_nat_cache(nm_i, ne); nr_shrink--; + + spin_lock(&nm_i->nat_list_lock); } + spin_unlock(&nm_i->nat_list_lock); + up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } @@ -2906,6 +2938,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); + spin_lock_init(&nm_i->nat_list_lock); mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->nid_list_lock); @@ -3024,8 +3057,13 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) unsigned idx; nid = nat_get_nid(natvec[found - 1]) + 1; - for (idx = 0; idx < found; idx++) + for (idx = 0; idx < found; idx++) { + spin_lock(&nm_i->nat_list_lock); + list_del(&natvec[idx]->list); + spin_unlock(&nm_i->nat_list_lock); + 
__del_from_nat_cache(nm_i, natvec[idx]); + } } f2fs_bug_on(sbi, nm_i->nat_cnt); -- cgit v1.2.3-59-g8ed1b From 6eae269461b84406a1240189e6f81e5234b6393c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 5 Aug 2018 23:09:00 +0800 Subject: f2fs: fix incorrect range->len in f2fs_trim_fs() generic/260 reported below error: [+] Default length with start set (should succeed) [+] Length beyond the end of fs (should succeed) [+] Length beyond the end of fs with start set (should succeed) +./tests/generic/260: line 94: [: 18446744073709551615: integer expression expected +./tests/generic/260: line 104: [: 18446744073709551615: integer expression expected Test done ... In f2fs_trim_fs(), if there is no discard being trimmed, we need to correct range->len before return. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3662e1f429b4..eada91dae08a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2582,8 +2582,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) trimmed += __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - range->len = F2FS_BLK_TO_BYTES(trimmed); out: + if (!err) + range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } -- cgit v1.2.3-59-g8ed1b From a690efffd16302d23b0bbd00f84dcedc11935a8b Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Sun, 5 Aug 2018 12:45:35 +0800 Subject: f2fs: wake up gc thread immediately when gc_urgent is set Fixes: 5b0e95398e2b ("f2fs: introduce sbi->gc_mode to determine the policy") Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f22782a0defe..cd2e030e47b8 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -253,6 +253,7 @@ out: if (t >= 1) { sbi->gc_mode = GC_URGENT; if 
(sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; wake_up_interruptible_all( &sbi->gc_thread->gc_wait_queue_head); wake_up_discard_thread(sbi, true); -- cgit v1.2.3-59-g8ed1b From 35ec7d5748849762008e8ae9f8ad2766229d5794 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Aug 2018 22:43:50 +0800 Subject: f2fs: split discard command in prior to block layer Some devices have a small max_{hw,}discard_sectors, so in __blkdev_issue_discard() one large discard bio can be split into multiple small discard bios, resulting in heavy load on the IO scheduler and device, which can stall other sync IO for a long time. f2fs is now trying to control discard commands more carefully, in order to reduce conflicts between discard IO and user IO and improve application performance. So in this patch we split the discard bio in f2fs, before it reaches the block layer, to avoid issuing multiple discard bios in a short time. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 23 +++++---- fs/f2fs/segment.c | 148 +++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 127 insertions(+), 44 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d9df58163f29..9fb780317b4e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -178,7 +178,6 @@ enum { #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ -#define DEF_MAX_DISCARD_LEN 512 /* Max. 
2MB per discard */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ @@ -250,9 +249,10 @@ struct discard_entry { (MAX_PLIST_NUM - 1) : (blk_num - 1)) enum { - D_PREP, - D_SUBMIT, - D_DONE, + D_PREP, /* initial */ + D_PARTIAL, /* partially submitted */ + D_SUBMIT, /* all submitted */ + D_DONE, /* finished */ }; struct discard_info { @@ -277,7 +277,10 @@ struct discard_cmd { struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ unsigned char state; /* state */ + unsigned char issuing; /* issuing discard */ int error; /* bio error */ + spinlock_t lock; /* for state/bio_ref updating */ + unsigned short bio_ref; /* bio reference count */ }; enum { @@ -710,22 +713,22 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, } static inline bool __is_discard_mergeable(struct discard_info *back, - struct discard_info *front) + struct discard_info *front, unsigned int max_len) { return (back->lstart + back->len == front->lstart) && - (back->len + front->len < DEF_MAX_DISCARD_LEN); + (back->len + front->len <= max_len); } static inline bool __is_discard_back_mergeable(struct discard_info *cur, - struct discard_info *back) + struct discard_info *back, unsigned int max_len) { - return __is_discard_mergeable(back, cur); + return __is_discard_mergeable(back, cur, max_len); } static inline bool __is_discard_front_mergeable(struct discard_info *cur, - struct discard_info *front) + struct discard_info *front, unsigned int max_len) { - return __is_discard_mergeable(cur, front); + return __is_discard_mergeable(cur, front, max_len); } static inline bool __is_extent_mergeable(struct extent_info *back, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index eada91dae08a..540d7d6161ba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -839,9 +839,12 @@ static struct discard_cmd 
*__create_discard_cmd(struct f2fs_sb_info *sbi, dc->len = len; dc->ref = 0; dc->state = D_PREP; + dc->issuing = 0; dc->error = 0; init_completion(&dc->wait); list_add_tail(&dc->list, pend_list); + spin_lock_init(&dc->lock); + dc->bio_ref = 0; atomic_inc(&dcc->discard_cmd_cnt); dcc->undiscard_blks += len; @@ -868,7 +871,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { if (dc->state == D_DONE) - atomic_dec(&dcc->issing_discard); + atomic_sub(dc->issuing, &dcc->issing_discard); list_del(&dc->list); rb_erase(&dc->rb_node, &dcc->root); @@ -883,9 +886,17 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned long flags; trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + spin_lock_irqsave(&dc->lock, flags); + if (dc->bio_ref) { + spin_unlock_irqrestore(&dc->lock, flags); + return; + } + spin_unlock_irqrestore(&dc->lock, flags); + f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) @@ -901,10 +912,17 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; + unsigned long flags; dc->error = blk_status_to_errno(bio->bi_status); - dc->state = D_DONE; - complete_all(&dc->wait); + + spin_lock_irqsave(&dc->lock, flags); + dc->bio_ref--; + if (!dc->bio_ref && dc->state == D_SUBMIT) { + dc->state = D_DONE; + complete_all(&dc->wait); + } + spin_unlock_irqrestore(&dc->lock, flags); bio_put(bio); } @@ -972,17 +990,25 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } } - +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len); /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, - struct 
discard_cmd *dc) + struct discard_cmd *dc, + unsigned int *issued) { + struct block_device *bdev = dc->bdev; + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(q->limits.max_discard_sectors); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - struct bio *bio = NULL; int flag = dpolicy->sync ? REQ_SYNC : 0; + block_t lstart, start, len, total_len; + int err = 0; if (dc->state != D_PREP) return; @@ -990,30 +1016,81 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return; - trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); - - dc->error = __blkdev_issue_discard(dc->bdev, - SECTOR_FROM_BLOCK(dc->start), - SECTOR_FROM_BLOCK(dc->len), - GFP_NOFS, 0, &bio); - if (!dc->error) { - /* should keep before submission to avoid D_DONE right away */ - dc->state = D_SUBMIT; - atomic_inc(&dcc->issued_discard); - atomic_inc(&dcc->issing_discard); - if (bio) { + trace_f2fs_issue_discard(bdev, dc->start, dc->len); + + lstart = dc->lstart; + start = dc->start; + len = dc->len; + total_len = len; + + dc->len = 0; + + while (total_len && *issued < dpolicy->max_requests && !err) { + struct bio *bio = NULL; + unsigned long flags; + bool last = true; + + if (len > max_discard_blocks) { + len = max_discard_blocks; + last = false; + } + + (*issued)++; + if (*issued == dpolicy->max_requests) + last = true; + + dc->len += len; + + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(start), + SECTOR_FROM_BLOCK(len), + GFP_NOFS, 0, &bio); + if (!err && bio) { + /* + * should keep before submission to avoid D_DONE + * right away + */ + spin_lock_irqsave(&dc->lock, flags); + if (last) + dc->state = D_SUBMIT; + else + dc->state = D_PARTIAL; + dc->bio_ref++; + spin_unlock_irqrestore(&dc->lock, flags); + + atomic_inc(&dcc->issing_discard); + dc->issuing++; + 
list_move_tail(&dc->list, wait_list); + + /* sanity check on discard range */ + __check_sit_bitmap(sbi, start, start + len); + bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; bio->bi_opf |= flag; submit_bio(bio); - list_move_tail(&dc->list, wait_list); - __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + + atomic_inc(&dcc->issued_discard); f2fs_update_iostat(sbi, FS_DISCARD, 1); + } else { + spin_lock_irqsave(&dc->lock, flags); + if (dc->state == D_PARTIAL) + dc->state = D_SUBMIT; + spin_unlock_irqrestore(&dc->lock, flags); + + __remove_discard_cmd(sbi, dc); + err = -EIO; } - } else { - __remove_discard_cmd(sbi, dc); + + lstart += len; + start += len; + total_len -= len; + len = total_len; } + + if (len) + __update_discard_tree_range(sbi, bdev, lstart, start, len); } static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, @@ -1094,10 +1171,11 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct discard_info di = {0}; struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(q->limits.max_discard_sectors); block_t end = lstart + len; - mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, lstart, (struct rb_entry **)&prev_dc, @@ -1137,7 +1215,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, if (prev_dc && prev_dc->state == D_PREP && prev_dc->bdev == bdev && - __is_discard_back_mergeable(&di, &prev_dc->di)) { + __is_discard_back_mergeable(&di, &prev_dc->di, + max_discard_blocks)) { prev_dc->di.len += di.len; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); @@ -1148,7 +1227,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, if (next_dc && next_dc->state == D_PREP && next_dc->bdev == bdev && - __is_discard_front_mergeable(&di, &next_dc->di)) { + 
__is_discard_front_mergeable(&di, &next_dc->di, + max_discard_blocks)) { next_dc->di.lstart = di.lstart; next_dc->di.len += di.len; next_dc->di.start = di.start; @@ -1171,8 +1251,6 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, node = rb_next(&prev_dc->rb_node); next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); } - - mutex_unlock(&dcc->cmd_lock); } static int __queue_discard_cmd(struct f2fs_sb_info *sbi, @@ -1187,7 +1265,9 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, blkstart -= FDEV(devi).start_blk; } + mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); return 0; } @@ -1226,9 +1306,9 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, } dcc->next_pos = dc->lstart + dc->len; - __submit_discard_cmd(sbi, dpolicy, dc); + __submit_discard_cmd(sbi, dpolicy, dc, &issued); - if (++issued >= dpolicy->max_requests) + if (issued >= dpolicy->max_requests) break; next: node = rb_next(&dc->rb_node); @@ -1283,9 +1363,9 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, break; } - __submit_discard_cmd(sbi, dpolicy, dc); + __submit_discard_cmd(sbi, dpolicy, dc, &issued); - if (++issued >= dpolicy->max_requests) + if (issued >= dpolicy->max_requests) break; } blk_finish_plug(&plug); @@ -2492,9 +2572,9 @@ next: goto skip; } - __submit_discard_cmd(sbi, dpolicy, dc); + __submit_discard_cmd(sbi, dpolicy, dc, &issued); - if (++issued >= dpolicy->max_requests) { + if (issued >= dpolicy->max_requests) { start = dc->lstart + dc->len; blk_finish_plug(&plug); -- cgit v1.2.3-59-g8ed1b From b83dcfe67142ea9f4bfaa1e9e21504be9e3c1bf7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Aug 2018 20:30:18 +0800 Subject: f2fs: support discard submission error injection This patch adds to support discard submission error injection for testing error handling of __submit_discard_cmd(). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 8 ++++++++ fs/f2fs/super.c | 1 + 3 files changed, 10 insertions(+) (limited to 'fs/f2fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9fb780317b4e..95244e75dfc4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -56,6 +56,7 @@ enum { FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, + FAULT_DISCARD, FAULT_MAX, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 540d7d6161ba..f24e659463e9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1041,10 +1041,18 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->len += len; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_DISCARD)) { + f2fs_show_injection_info(FAULT_DISCARD); + err = -EIO; + goto submit; + } +#endif err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, 0, &bio); +submit: if (!err && bio) { /* * should keep before submission to avoid D_DONE diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bd57be470e23..30bd9138f39d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -55,6 +55,7 @@ char *f2fs_fault_name[FAULT_MAX] = { [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", + [FAULT_DISCARD] = "discard error", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate) -- cgit v1.2.3-59-g8ed1b From 6b9cb1242cb082044c8c3f8b9f35d9ada101dc41 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Aug 2018 10:14:55 +0800 Subject: f2fs: fix use-after-free of dicard command entry As Dan Carpenter reported: The patch 20ee4382322c: "f2fs: issue small discard by LBA order" from Jul 8, 2018, leads to the following Smatch warning: fs/f2fs/segment.c:1277 __issue_discard_cmd_orderly() warn: 'dc' was already freed. See also: fs/f2fs/segment.c:2550 __issue_discard_cmd_range() warn: 'dc' was already freed. 
In order to fix this issue, let's get error from __submit_discard_cmd(), and release current discard command after we referenced next one. Reported-by: Dan Carpenter Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 79 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 34 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f24e659463e9..6b932e669c57 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -994,7 +994,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len); /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static void __submit_discard_cmd(struct f2fs_sb_info *sbi, +static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, struct discard_cmd *dc, unsigned int *issued) @@ -1011,10 +1011,10 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, int err = 0; if (dc->state != D_PREP) - return; + return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - return; + return 0; trace_f2fs_issue_discard(bdev, dc->start, dc->len); @@ -1053,43 +1053,44 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(len), GFP_NOFS, 0, &bio); submit: - if (!err && bio) { - /* - * should keep before submission to avoid D_DONE - * right away - */ + if (err) { spin_lock_irqsave(&dc->lock, flags); - if (last) + if (dc->state == D_PARTIAL) dc->state = D_SUBMIT; - else - dc->state = D_PARTIAL; - dc->bio_ref++; spin_unlock_irqrestore(&dc->lock, flags); - atomic_inc(&dcc->issing_discard); - dc->issuing++; - list_move_tail(&dc->list, wait_list); + break; + } - /* sanity check on discard range */ - __check_sit_bitmap(sbi, start, start + len); + f2fs_bug_on(sbi, !bio); - bio->bi_private = dc; - bio->bi_end_io = f2fs_submit_discard_endio; - bio->bi_opf |= flag; - submit_bio(bio); + /* + * should keep before 
submission to avoid D_DONE + * right away + */ + spin_lock_irqsave(&dc->lock, flags); + if (last) + dc->state = D_SUBMIT; + else + dc->state = D_PARTIAL; + dc->bio_ref++; + spin_unlock_irqrestore(&dc->lock, flags); - atomic_inc(&dcc->issued_discard); + atomic_inc(&dcc->issing_discard); + dc->issuing++; + list_move_tail(&dc->list, wait_list); - f2fs_update_iostat(sbi, FS_DISCARD, 1); - } else { - spin_lock_irqsave(&dc->lock, flags); - if (dc->state == D_PARTIAL) - dc->state = D_SUBMIT; - spin_unlock_irqrestore(&dc->lock, flags); + /* sanity check on discard range */ + __check_sit_bitmap(sbi, start, start + len); - __remove_discard_cmd(sbi, dc); - err = -EIO; - } + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + bio->bi_opf |= flag; + submit_bio(bio); + + atomic_inc(&dcc->issued_discard); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); lstart += len; start += len; @@ -1097,8 +1098,9 @@ submit: len = total_len; } - if (len) + if (!err && len) __update_discard_tree_range(sbi, bdev, lstart, start, len); + return err; } static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, @@ -1304,6 +1306,7 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, while (dc) { struct rb_node *node; + int err = 0; if (dc->state != D_PREP) goto next; @@ -1314,12 +1317,14 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, } dcc->next_pos = dc->lstart + dc->len; - __submit_discard_cmd(sbi, dpolicy, dc, &issued); + err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) break; next: node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); } @@ -2571,6 +2576,7 @@ next: while (dc && dc->lstart <= end) { struct rb_node *node; + int err = 0; if (dc->len < dpolicy->granularity) goto skip; @@ -2580,11 +2586,14 @@ next: goto skip; } - __submit_discard_cmd(sbi, dpolicy, dc, &issued); + err = 
__submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) { start = dc->lstart + dc->len; + if (err) + __remove_discard_cmd(sbi, dc); + blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); @@ -2593,6 +2602,8 @@ next: } skip: node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); if (fatal_signal_pending(current)) -- cgit v1.2.3-59-g8ed1b From 3f16ecd950e56d60a574f73af9538f6e24030f9a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Aug 2018 17:36:29 +0800 Subject: f2fs: fix to return success when trimming meta area generic/251 --- tests/generic/251.out 2016-05-03 20:20:11.381899000 +0800 QA output created by 251 Running the test: done. +fstrim: /mnt/scratch_f2fs: FITRIM ioctl failed: Invalid argument +fstrim: /mnt/scratch_f2fs: FITRIM ioctl failed: Invalid argument +fstrim: /mnt/scratch_f2fs: FITRIM ioctl failed: Invalid argument +fstrim: /mnt/scratch_f2fs: FITRIM ioctl failed: Invalid argument +fstrim: /mnt/scratch_f2fs: FITRIM ioctl failed: Invalid argument ... Ran: generic/251 Failures: generic/251 The reason is that the range covered by fstrim is located in the meta area. Previously we just returned -EINVAL in that case, which made generic/251 fail. To fix this problem, let's relax the restriction and return success with no blocks discarded. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6b932e669c57..63fc647f9ac2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2631,8 +2631,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; - if (end <= MAIN_BLKADDR(sbi)) - return -EINVAL; + if (end < MAIN_BLKADDR(sbi)) + goto out; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { f2fs_msg(sbi->sb, KERN_WARNING, -- cgit v1.2.3-59-g8ed1b From d494500a70434223bc35f862fab0679b13bea23d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Aug 2018 17:36:41 +0800 Subject: f2fs: support fault_type mount option Previously, once fault injection was on, all kinds of faults were injected into f2fs by default. To trigger a single type or a specific combination of types during a test, we had to configure a sysfs entry, which is a little inconvenient to integrate into a test suite such as xfstests. So this patch introduces a new mount option 'fault_type' to complement the existing option 'fault_injection'; with these two mount options, we can specify any fault rate/type at mount time. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 18 ++++++++++++++++++ fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 7 +++++-- fs/f2fs/super.c | 34 +++++++++++++++++++++++++++------- 4 files changed, 51 insertions(+), 10 deletions(-) (limited to 'fs/f2fs') diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 69f8de995739..e5edd29687b5 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -157,6 +157,24 @@ data_flush Enable data flushing before checkpoint in order to persist data of regular and symlink. 
fault_injection=%d Enable fault injection in all supported types with specified injection rate. +fault_type=%d Support configuring fault injection type, should be + enabled with fault_injection option, fault type value + is shown below, it supports single or combined type. + Type_Name Type_Value + FAULT_KMALLOC 0x000000001 + FAULT_KVMALLOC 0x000000002 + FAULT_PAGE_ALLOC 0x000000004 + FAULT_PAGE_GET 0x000000008 + FAULT_ALLOC_BIO 0x000000010 + FAULT_ALLOC_NID 0x000000020 + FAULT_ORPHAN 0x000000040 + FAULT_BLOCK 0x000000080 + FAULT_DIR_DEPTH 0x000000100 + FAULT_EVICT_INODE 0x000000200 + FAULT_TRUNCATE 0x000000400 + FAULT_IO 0x000000800 + FAULT_CHECKPOINT 0x000001000 + FAULT_DISCARD 0x000002000 mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 31561026ac9a..3ab7a00c0641 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -28,7 +28,7 @@ struct kmem_cache *f2fs_inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { - f2fs_build_fault_attr(sbi, 0); + f2fs_build_fault_attr(sbi, 0, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); if (!end_io) f2fs_flush_merged_writes(sbi); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 95244e75dfc4..375aa9f30cfa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -60,6 +60,8 @@ enum { FAULT_MAX, }; +#define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1) + struct f2fs_fault_info { atomic_t inject_ops; unsigned int inject_rate; @@ -3435,9 +3437,10 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) } #ifdef CONFIG_F2FS_FAULT_INJECTION -extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate); +extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, + unsigned int type); #else -#define f2fs_build_fault_attr(sbi, rate) do { } while (0) +#define f2fs_build_fault_attr(sbi, rate, type) do { } 
while (0) #endif #endif diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 30bd9138f39d..be41dbd7b261 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -58,17 +58,21 @@ char *f2fs_fault_name[FAULT_MAX] = { [FAULT_DISCARD] = "discard error", }; -void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate) +void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, + unsigned int type) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (rate) { atomic_set(&ffi->inject_ops, 0); ffi->inject_rate = rate; - ffi->inject_type = (1 << FAULT_MAX) - 1; - } else { - memset(ffi, 0, sizeof(struct f2fs_fault_info)); } + + if (type) + ffi->inject_type = type; + + if (!rate && !type) + memset(ffi, 0, sizeof(struct f2fs_fault_info)); } #endif @@ -113,6 +117,7 @@ enum { Opt_mode, Opt_io_size_bits, Opt_fault_injection, + Opt_fault_type, Opt_lazytime, Opt_nolazytime, Opt_quota, @@ -170,6 +175,7 @@ static match_table_t f2fs_tokens = { {Opt_mode, "mode=%s"}, {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, + {Opt_fault_type, "fault_type=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, {Opt_quota, "quota"}, @@ -600,7 +606,18 @@ static int parse_options(struct super_block *sb, char *options) if (args->from && match_int(args, &arg)) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_build_fault_attr(sbi, arg); + f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE); + set_opt(sbi, FAULT_INJECTION); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; + case Opt_fault_type: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, 0, arg); set_opt(sbi, FAULT_INJECTION); #else f2fs_msg(sb, KERN_INFO, @@ -1321,9 +1338,12 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef 
CONFIG_F2FS_FAULT_INJECTION - if (test_opt(sbi, FAULT_INJECTION)) + if (test_opt(sbi, FAULT_INJECTION)) { seq_printf(seq, ",fault_injection=%u", F2FS_OPTION(sbi).fault_info.inject_rate); + seq_printf(seq, ",fault_type=%u", + F2FS_OPTION(sbi).fault_info.inject_type); + } #endif #ifdef CONFIG_QUOTA if (test_opt(sbi, QUOTA)) @@ -1393,7 +1413,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, POSIX_ACL); #endif - f2fs_build_fault_attr(sbi, 0); + f2fs_build_fault_attr(sbi, 0, 0); } #ifdef CONFIG_QUOTA -- cgit v1.2.3-59-g8ed1b From 7fa750a163089cf96866de402314d853a96cb342 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Aug 2018 23:38:06 +0200 Subject: f2fs: rework fault injection handling to avoid a warning When CONFIG_F2FS_FAULT_INJECTION is disabled, we get a warning about an unused label: fs/f2fs/segment.c: In function '__submit_discard_cmd': fs/f2fs/segment.c:1059:1: error: label 'submit' defined but not used [-Werror=unused-label] This could be fixed by adding another #ifdef around it, but the more reliable way of doing this seems to be to remove the other #ifdefs where that is easily possible. By defining time_to_inject() as a trivial stub, most of the checks for CONFIG_F2FS_FAULT_INJECTION can go away. This also leads to nicer formatting of the code. 
Signed-off-by: Arnd Bergmann Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +-- fs/f2fs/data.c | 2 -- fs/f2fs/dir.c | 3 +-- fs/f2fs/f2fs.h | 50 +++++++++++++++++++++++++------------------------- fs/f2fs/file.c | 3 +-- fs/f2fs/gc.c | 2 -- fs/f2fs/inode.c | 3 +-- fs/f2fs/node.c | 3 +-- fs/f2fs/recovery.c | 5 ++--- fs/f2fs/segment.c | 4 ---- 10 files changed, 32 insertions(+), 46 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3ab7a00c0641..e8b6b89bddb8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -555,13 +555,12 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) spin_lock(&im->ino_lock); -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); f2fs_show_injection_info(FAULT_ORPHAN); return -ENOSPC; } -#endif + if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 45f043ee48bd..43d3723dc886 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -126,12 +126,10 @@ static bool f2fs_bio_post_read_required(struct bio *bio) static void f2fs_read_end_io(struct bio *bio) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); bio->bi_status = BLK_STS_IOERR; } -#endif if (f2fs_bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7f955c4e86a4..ecc3a4e2be96 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -517,12 +517,11 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, } start: -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { f2fs_show_injection_info(FAULT_DIR_DEPTH); return -ENOSPC; } -#endif + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 
375aa9f30cfa..9a6ba4a8d338 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -41,7 +41,6 @@ } while (0) #endif -#ifdef CONFIG_F2FS_FAULT_INJECTION enum { FAULT_KMALLOC, FAULT_KVMALLOC, @@ -60,6 +59,7 @@ enum { FAULT_MAX, }; +#ifdef CONFIG_F2FS_FAULT_INJECTION #define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1) struct f2fs_fault_info { @@ -1324,6 +1324,12 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) } return false; } +#else +#define f2fs_show_injection_info(type) do { } while (0) +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + return false; +} #endif /* For write statistics. Suppose sector size is 512 bytes, @@ -1676,13 +1682,12 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (ret) return ret; -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_BLOCK)) { f2fs_show_injection_info(FAULT_BLOCK); release = *count; goto enospc; } -#endif + /* * let's increase this in prior to actual block count change in order * for f2fs_sync_file to avoid data races when deciding checkpoint. 
@@ -1891,12 +1896,10 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return ret; } -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_BLOCK)) { f2fs_show_injection_info(FAULT_BLOCK); goto enospc; } -#endif spin_lock(&sbi->stat_lock); @@ -1981,22 +1984,23 @@ static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { -#ifdef CONFIG_F2FS_FAULT_INJECTION struct page *page; - if (!for_write) - page = find_get_page_flags(mapping, index, - FGP_LOCK | FGP_ACCESSED); - else - page = find_lock_page(mapping, index); - if (page) - return page; + if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { + if (!for_write) + page = find_get_page_flags(mapping, index, + FGP_LOCK | FGP_ACCESSED); + else + page = find_lock_page(mapping, index); + if (page) + return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { - f2fs_show_injection_info(FAULT_PAGE_ALLOC); - return NULL; + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); + return NULL; + } } -#endif + if (!for_write) return grab_cache_page(mapping, index); return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -2006,12 +2010,11 @@ static inline struct page *f2fs_pagecache_get_page( struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp_mask) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { f2fs_show_injection_info(FAULT_PAGE_GET); return NULL; } -#endif + return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); } @@ -2076,12 +2079,11 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); return bio; } -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { f2fs_show_injection_info(FAULT_ALLOC_BIO); return NULL; } -#endif + return 
bio_alloc(GFP_KERNEL, npages); } @@ -2616,12 +2618,11 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_KMALLOC)) { f2fs_show_injection_info(FAULT_KMALLOC); return NULL; } -#endif + return kmalloc(size, flags); } @@ -2634,12 +2635,11 @@ static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_KVMALLOC)) { f2fs_show_injection_info(FAULT_KVMALLOC); return NULL; } -#endif + return kvmalloc(size, flags); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8c4694b9af27..1f76cc3fc46b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -668,12 +668,11 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { f2fs_show_injection_info(FAULT_TRUNCATE); return -EIO; } -#endif + /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e352fbd33848..76a22b3773bc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -53,12 +53,10 @@ static int gc_thread_func(void *data) continue; } -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } -#endif if (!sb_start_write_trylock(sbi->sb)) continue; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index fc2c98b9e255..6908896a1950 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -659,12 +659,11 @@ retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_EVICT_INODE)) { f2fs_show_injection_info(FAULT_EVICT_INODE); err = -EIO; } -#endif + if 
(!err) { f2fs_lock_op(sbi); err = f2fs_remove_inode_page(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 472dd643b074..dd2e45a661aa 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2323,12 +2323,11 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ALLOC_NID)) { f2fs_show_injection_info(FAULT_ALLOC_NID); return false; } -#endif + spin_lock(&nm_i->nid_list_lock); if (unlikely(nm_i->available_nids == 0)) { diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 64e5a59a270a..95511ed11a22 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -518,10 +518,9 @@ retry_dn: if (src == NULL_ADDR) { err = f2fs_reserve_new_block(&dn); -#ifdef CONFIG_F2FS_FAULT_INJECTION - while (err) + while (err && + IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) err = f2fs_reserve_new_block(&dn); -#endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); if (err) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 63fc647f9ac2..b136e39e1e9e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -470,12 +470,10 @@ int f2fs_commit_inmem_pages(struct inode *inode) */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } -#endif /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) @@ -1041,13 +1039,11 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->len += len; -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_DISCARD)) { f2fs_show_injection_info(FAULT_DISCARD); err = -EIO; goto submit; } -#endif err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), -- cgit v1.2.3-59-g8ed1b From dda9f4b9cac6bdd2a96253b4444d7a6ce5132edb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 
11 Aug 2018 23:42:09 +0800 Subject: f2fs: fix to skip verifying block address for non-regular inode generic/184 1s ... [failed, exit status 1]- output mismatch --- tests/generic/184.out 2015-01-11 16:52:27.643681072 +0800 QA output created by 184 - silence is golden +rm: cannot remove '/mnt/f2fs/null': Bad address +mknod: '/mnt/f2fs/null': Bad address +chmod: cannot access '/mnt/f2fs/null': Bad address +./tests/generic/184: line 36: /mnt/f2fs/null: Bad address ... F2FS-fs (zram0): access invalid blkaddr:259 EIP: f2fs_is_valid_blkaddr+0x14b/0x1b0 [f2fs] f2fs_iget+0x927/0x1010 [f2fs] f2fs_lookup+0x26e/0x630 [f2fs] __lookup_slow+0xb3/0x140 lookup_slow+0x31/0x50 walk_component+0x185/0x1f0 path_lookupat+0x51/0x190 filename_lookup+0x7f/0x140 user_path_at_empty+0x36/0x40 vfs_statx+0x61/0xc0 __do_sys_stat64+0x29/0x40 sys_stat64+0x13/0x20 do_fast_syscall_32+0xaa/0x22c entry_SYSENTER_32+0x53/0x86 In f2fs_iget(), we check the inode's first block address and, if it is valid, set the FI_FIRST_BLOCK_WRITTEN flag in the inode. But we should only do this for regular inodes: for other inodes, such as special inodes, i_addr[0] is used for storing device info instead of a block address, so the check will obviously fail. So for non-regular inodes, let's skip verifying the address and setting the flag.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6908896a1950..959df2249875 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -371,13 +371,15 @@ static int do_read_inode(struct inode *inode) /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); - err = __written_first_block(sbi, ri); - if (err < 0) { - f2fs_put_page(node_page, 1); - return err; + if (S_ISREG(inode->i_mode)) { + err = __written_first_block(sbi, ri); + if (err < 0) { + f2fs_put_page(node_page, 1); + return err; + } + if (!err) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); } - if (!err) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; -- cgit v1.2.3-59-g8ed1b From 853137cef46ccc490e6fd4b160a1c252d6459842 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 9 Aug 2018 17:53:34 -0700 Subject: f2fs: fix performance issue observed with multi-thread sequential read This reverts the commit - "b93f771 - f2fs: remove writepages lock" to fix the drop in sequential read throughput. Test: ./tiotest -t 32 -d /data/tio_tmp -f 32 -b 524288 -k 1 -k 3 -L device: UFS Before - read throughput: 185 MB/s total read requests: 85177 (of these ~80000 are 4KB size requests). total write requests: 2546 (of these ~2208 requests are written in 512KB). After - read throughput: 758 MB/s total read requests: 2417 (of these ~2042 are 512KB reads). total write requests: 2701 (of these ~2034 requests are written in 512KB). 
Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 8 ++++++++ fs/f2fs/data.c | 21 +++++++++++++++++++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 1 + fs/f2fs/super.c | 1 + fs/f2fs/sysfs.c | 2 ++ 6 files changed, 35 insertions(+) (limited to 'fs/f2fs') diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 9b0123388f18..94a24aedcdb2 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -51,6 +51,14 @@ Description: Controls the dirty page count condition for the in-place-update policies. +What: /sys/fs/f2fs//min_seq_blocks +Date: August 2018 +Contact: "Jaegeuk Kim" +Description: + Controls the dirty page count condition for batched sequential + writes in ->writepages. + + What: /sys/fs/f2fs//min_hot_blocks Date: March 2017 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 43d3723dc886..bdcb023506a7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2122,6 +2122,18 @@ continue_unlock: return ret; } +static inline bool __should_serialize_io(struct inode *inode, + struct writeback_control *wbc) +{ + if (!S_ISREG(inode->i_mode)) + return false; + if (wbc->sync_mode != WB_SYNC_ALL) + return true; + if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) + return true; + return false; +} + static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) @@ -2130,6 +2142,7 @@ static int __f2fs_write_data_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct blk_plug plug; int ret; + bool locked = false; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -2160,10 +2173,18 @@ static int __f2fs_write_data_pages(struct address_space *mapping, else if (atomic_read(&sbi->wb_sync_req[DATA])) goto skip_write; + if (__should_serialize_io(inode, wbc)) { + 
mutex_lock(&sbi->writepages); + locked = true; + } + blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); + if (locked) + mutex_unlock(&sbi->writepages); + if (wbc->sync_mode == WB_SYNC_ALL) atomic_dec(&sbi->wb_sync_req[DATA]); /* diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a6ba4a8d338..170573f8a04a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -913,6 +913,7 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_seq_blocks; /* threshold for sequential blocks */ unsigned int min_hot_blocks; /* threshold for hot block allocation */ unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ @@ -1133,6 +1134,7 @@ struct f2fs_sb_info { struct rw_semaphore sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ + struct mutex writepages; /* mutex for writepages() */ #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b136e39e1e9e..20650e25117b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4127,6 +4127,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->min_seq_blocks = sbi->blocks_per_seg * sbi->segs_per_sec; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index be41dbd7b261..53d70b64fea1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2842,6 +2842,7 @@ try_onemore: /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); + mutex_init(&sbi->writepages); 
mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); init_rwsem(&sbi->node_change); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index cd2e030e47b8..81c0e5337443 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -397,6 +397,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_seq_blocks, min_seq_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); @@ -449,6 +450,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_seq_blocks), ATTR_LIST(min_hot_blocks), ATTR_LIST(min_ssr_sections), ATTR_LIST(max_victim_search), -- cgit v1.2.3-59-g8ed1b From 6f8d4455060dfb0e32dfb8e685b97caf4ed1be41 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 25 Jul 2018 12:11:56 +0900 Subject: f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc The f2fs_gc() called by f2fs_balance_fs() must be called outside of fi->i_gc_rwsem[WRITE], since f2fs_gc() can try to grab it in a loop. If GC hits the maximum number of retries, let's give a chance to release gc_mutex for a short time in order not to go into a livelock in the worst case.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 119 ++++++++++++++++++++++++++++-------------------------- fs/f2fs/gc.c | 26 +++++++++--- fs/f2fs/segment.c | 6 ++- fs/f2fs/segment.h | 2 +- 6 files changed, 91 insertions(+), 67 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bdcb023506a7..e73ce11de02d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2217,14 +2217,14 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) loff_t i_size = i_size_read(inode); if (to > i_size) { - down_write(&F2FS_I(inode)->i_mmap_sem); down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 170573f8a04a..96bde026636f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1243,6 +1243,7 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ /* for skip statistic */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ + unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1f76cc3fc46b..5474aaa274b9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -797,8 +797,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { bool to_smaller = (attr->ia_size <= i_size_read(inode)); - down_write(&F2FS_I(inode)->i_mmap_sem); down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); @@ -808,8 +808,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger 
than i_size. */ - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) return err; @@ -962,8 +962,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; - down_write(&F2FS_I(inode)->i_mmap_sem); down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -972,8 +972,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1188,25 +1188,33 @@ roll_back: return ret; } -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; int ret; f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); - f2fs_drop_extent_tree(inode); + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_lock_op(sbi); + f2fs_drop_extent_tree(inode); + truncate_pagecache(inode, offset); ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); + + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) { - pgoff_t pg_start, pg_end; loff_t new_size; int ret; @@ -1221,25 +1229,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if 
(ret) return ret; - pg_start = offset >> PAGE_SHIFT; - pg_end = (offset + len) >> PAGE_SHIFT; - - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - - down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - goto out_unlock; - - truncate_pagecache(inode, offset); + return ret; - ret = f2fs_do_collapse(inode, pg_start, pg_end); + ret = f2fs_do_collapse(inode, offset, len); if (ret) - goto out_unlock; + return ret; /* write out all moved pages, if possible */ + down_write(&F2FS_I(inode)->i_mmap_sem); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); @@ -1247,11 +1247,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, new_size); ret = f2fs_truncate_blocks(inode, new_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); -out_unlock: - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1317,10 +1315,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - down_write(&F2FS_I(inode)->i_mmap_sem); ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) - goto out_sem; + return ret; pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1332,7 +1329,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start, off_start, off_end - off_start); if (ret) - goto out_sem; + return ret; new_size = max_t(loff_t, new_size, offset + len); } else { @@ -1340,7 +1337,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start++, off_start, PAGE_SIZE - off_start); if (ret) - goto out_sem; + 
return ret; new_size = max_t(loff_t, new_size, (loff_t)pg_start << PAGE_SHIFT); @@ -1352,6 +1349,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, pgoff_t end; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache_range(inode, (loff_t)index << PAGE_SHIFT, @@ -1363,6 +1361,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1374,6 +1373,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1402,9 +1402,6 @@ out: else f2fs_i_size_write(inode, new_size); } -out_sem: - up_write(&F2FS_I(inode)->i_mmap_sem); - return ret; } @@ -1433,26 +1430,27 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); + up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) - goto out; + return ret; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - goto out; - - truncate_pagecache(inode, offset); + return ret; pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + truncate_pagecache(inode, offset); + while (!ret && idx > pg_start) { nr = idx - 
pg_start; if (nr > delta) @@ -1466,16 +1464,17 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ + down_write(&F2FS_I(inode)->i_mmap_sem); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); + up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); -out: - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1722,8 +1721,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) ret = -EINVAL; @@ -1734,6 +1731,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (!get_dirty_pages(inode)) goto skip_flush; @@ -1741,18 +1740,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; + } skip_flush: set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1770,9 +1771,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - 
inode_lock(inode); + f2fs_balance_fs(F2FS_I_SB(inode), true); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + inode_lock(inode); if (f2fs_is_volatile_file(inode)) { ret = -EINVAL; @@ -1798,7 +1799,6 @@ err_out: clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); ret = -EINVAL; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -2394,15 +2394,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; if (!inode_trylock(dst)) goto out; - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { - inode_unlock(dst); - goto out; - } } ret = -EINVAL; @@ -2447,6 +2442,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, goto out_unlock; f2fs_balance_fs(sbi, true); + + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (src != dst) { + ret = -EBUSY; + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + goto out_src; + } + f2fs_lock_op(sbi); ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, pos_out >> F2FS_BLKSIZE_BITS, @@ -2459,13 +2462,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_i_size_write(dst, dst_osize); } f2fs_unlock_op(sbi); -out_unlock: - if (src != dst) { + + if (src != dst) up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); +out_src: + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); +out_unlock: + if (src != dst) inode_unlock(dst); - } out: - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); inode_unlock(src); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 76a22b3773bc..c598ae5ecbfa 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -882,6 +882,7 @@ next_step: if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); + sbi->skipped_gc_rwsem++; continue; } @@ -911,6 +912,7 @@ next_step: continue; if (!down_write_trylock( &fi->i_gc_rwsem[WRITE])) { + sbi->skipped_gc_rwsem++; 
up_write(&fi->i_gc_rwsem[READ]); continue; } @@ -1048,6 +1050,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; + unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, sync, background, @@ -1060,6 +1063,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, prefree_segments(sbi)); cpc.reason = __get_cp_reason(sbi); + sbi->skipped_gc_rwsem = 0; + first_skipped = last_skipped; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1101,7 +1106,8 @@ gc_more: total_freed += seg_freed; if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || + sbi->skipped_gc_rwsem) skipped_round++; last_skipped = sbi->skipped_atomic_files[FG_GC]; round++; @@ -1110,15 +1116,23 @@ gc_more: if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && - skipped_round * 2 >= round) - f2fs_drop_inmem_pages_all(sbi, true); + if (sync) + goto stop; + + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (skipped_round <= MAX_SKIP_GC_COUNT || + skipped_round * 2 < round) { segno = NULL_SEGNO; goto gc_more; } + if (first_skipped < last_skipped && + (last_skipped - first_skipped) > + sbi->skipped_gc_rwsem) { + f2fs_drop_inmem_pages_all(sbi, true); + segno = NULL_SEGNO; + goto gc_more; + } if (gc_type == FG_GC) ret = f2fs_write_checkpoint(sbi, &cpc); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 20650e25117b..7dcfe38e70cc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -445,8 +445,10 @@ int f2fs_commit_inmem_pages(struct inode *inode) int err; f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + down_write(&fi->i_gc_rwsem[WRITE]); + + f2fs_lock_op(sbi); set_inode_flag(inode, 
FI_ATOMIC_COMMIT); mutex_lock(&fi->inmem_lock); @@ -461,6 +463,8 @@ int f2fs_commit_inmem_pages(struct inode *inode) clear_inode_flag(inode, FI_ATOMIC_COMMIT); f2fs_unlock_op(sbi); + up_write(&fi->i_gc_rwsem[WRITE]); + return err; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 50495515f0a0..b3d9e317ff0c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -215,7 +215,7 @@ struct segment_allocation { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) -#define MAX_SKIP_ATOMIC_COUNT 16 +#define MAX_SKIP_GC_COUNT 16 struct inmem_pages { struct list_head list; -- cgit v1.2.3-59-g8ed1b From 6aa58d8ad20a3323f42274c25820a6f54192422d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Aug 2018 22:37:25 +0800 Subject: f2fs: readahead encrypted block during GC During GC, for each encrypted block, we read the block synchronously into a meta page and then submit it to the current cold data log area. This block read model with 4k granularity performs poorly, so, as when migrating non-encrypted blocks, let's readahead encrypted blocks as well to improve migration performance. To implement this, we choose the meta page whose index is the old block address of the encrypted block and readahead the ciphertext into this page; later, if the readahead page is still uptodate, we load its data into the target meta page and submit the write IO. Note that for OPU, truncation and deletion, we need to invalidate the meta page after we invalidate the old block address, to make sure we won't load invalid data from the target meta page during encrypted block migration. for ((i = 0; i < 1000; i++)) do { xfs_io -f /mnt/f2fs/dir/$i -c "pwrite 0 128k" -c "fsync"; } done for ((i = 0; i < 1000; i+=2)) do { rm /mnt/f2fs/dir/$i; } done ret = ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, 0); Before: gc-6549 [001] d..1 214682.212797: block_rq_insert: 8,32 RA 32768 () 786400 + 64 [gc] gc-6549 [001] d..1 214682.212802: block_unplug: [gc] 1 gc-6549 [001] ....
214682.213892: block_bio_queue: 8,32 R 67494144 + 8 [gc] gc-6549 [001] .... 214682.213899: block_getrq: 8,32 R 67494144 + 8 [gc] gc-6549 [001] .... 214682.213902: block_plug: [gc] gc-6549 [001] d..1 214682.213905: block_rq_insert: 8,32 R 4096 () 67494144 + 8 [gc] gc-6549 [001] d..1 214682.213908: block_unplug: [gc] 1 gc-6549 [001] .... 214682.226405: block_bio_queue: 8,32 R 67494152 + 8 [gc] gc-6549 [001] .... 214682.226412: block_getrq: 8,32 R 67494152 + 8 [gc] gc-6549 [001] .... 214682.226414: block_plug: [gc] gc-6549 [001] d..1 214682.226417: block_rq_insert: 8,32 R 4096 () 67494152 + 8 [gc] gc-6549 [001] d..1 214682.226420: block_unplug: [gc] 1 gc-6549 [001] .... 214682.226904: block_bio_queue: 8,32 R 67494160 + 8 [gc] gc-6549 [001] .... 214682.226910: block_getrq: 8,32 R 67494160 + 8 [gc] gc-6549 [001] .... 214682.226911: block_plug: [gc] gc-6549 [001] d..1 214682.226914: block_rq_insert: 8,32 R 4096 () 67494160 + 8 [gc] gc-6549 [001] d..1 214682.226916: block_unplug: [gc] 1 After: gc-5678 [003] .... 214327.025906: block_bio_queue: 8,32 R 67493824 + 8 [gc] gc-5678 [003] .... 214327.025908: block_bio_backmerge: 8,32 R 67493824 + 8 [gc] gc-5678 [003] .... 214327.025915: block_bio_queue: 8,32 R 67493832 + 8 [gc] gc-5678 [003] .... 214327.025917: block_bio_backmerge: 8,32 R 67493832 + 8 [gc] gc-5678 [003] .... 214327.025923: block_bio_queue: 8,32 R 67493840 + 8 [gc] gc-5678 [003] .... 214327.025925: block_bio_backmerge: 8,32 R 67493840 + 8 [gc] gc-5678 [003] .... 214327.025932: block_bio_queue: 8,32 R 67493848 + 8 [gc] gc-5678 [003] .... 214327.025934: block_bio_backmerge: 8,32 R 67493848 + 8 [gc] gc-5678 [003] .... 214327.025941: block_bio_queue: 8,32 R 67493856 + 8 [gc] gc-5678 [003] .... 214327.025943: block_bio_backmerge: 8,32 R 67493856 + 8 [gc] gc-5678 [003] .... 214327.025953: block_bio_queue: 8,32 R 67493864 + 8 [gc] gc-5678 [003] .... 214327.025955: block_bio_backmerge: 8,32 R 67493864 + 8 [gc] gc-5678 [003] .... 
214327.025962: block_bio_queue: 8,32 R 67493872 + 8 [gc] gc-5678 [003] .... 214327.025964: block_bio_backmerge: 8,32 R 67493872 + 8 [gc] gc-5678 [003] .... 214327.025970: block_bio_queue: 8,32 R 67493880 + 8 [gc] gc-5678 [003] .... 214327.025972: block_bio_backmerge: 8,32 R 67493880 + 8 [gc] gc-5678 [003] .... 214327.026000: block_bio_queue: 8,32 WS 34123776 + 2048 [gc] gc-5678 [003] .... 214327.026019: block_getrq: 8,32 WS 34123776 + 2048 [gc] gc-5678 [003] d..1 214327.026021: block_rq_insert: 8,32 R 131072 () 67493632 + 256 [gc] gc-5678 [003] d..1 214327.026023: block_unplug: [gc] 1 gc-5678 [003] d..1 214327.026026: block_rq_issue: 8,32 R 131072 () 67493632 + 256 [gc] gc-5678 [003] .... 214327.026046: block_plug: [gc] Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 35 +++++++++++------ fs/f2fs/gc.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++----- fs/f2fs/segment.c | 10 ++++- 3 files changed, 134 insertions(+), 22 deletions(-) (limited to 'fs/f2fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e73ce11de02d..382c1ef9a9e4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -875,6 +875,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; + block_t old_blkaddr; pgoff_t fofs; blkcnt_t count = 1; int err; @@ -896,9 +897,12 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) alloc: set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - - f2fs_allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, + old_blkaddr = dn->data_blkaddr; + f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL, false); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + invalidate_mapping_pages(META_MAPPING(sbi), + old_blkaddr, old_blkaddr); f2fs_set_data_blkaddr(dn); /* update i_size */ @@ -1614,6 +1618,7 @@ static int f2fs_read_data_pages(struct file *file, 
static int encrypt_one_page(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; + struct page *mpage; gfp_t gfp_flags = GFP_NOFS; if (!f2fs_encrypted_file(inode)) @@ -1625,17 +1630,25 @@ static int encrypt_one_page(struct f2fs_io_info *fio) retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, PAGE_SIZE, 0, fio->page->index, gfp_flags); - if (!IS_ERR(fio->encrypted_page)) - return 0; + if (IS_ERR(fio->encrypted_page)) { + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_writes(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); + } - /* flush pending IOs and wait for a while in the ENOMEM case */ - if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { - f2fs_flush_merged_writes(fio->sbi); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - goto retry_encrypt; + mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr); + if (mpage) { + if (PageUptodate(mpage)) + memcpy(page_address(mpage), + page_address(fio->encrypted_page), PAGE_SIZE); + f2fs_put_page(mpage, 1); } - return PTR_ERR(fio->encrypted_page); + return 0; } static inline bool check_inplace_update_policy(struct inode *inode, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c598ae5ecbfa..5c8d00422237 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -599,6 +599,72 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } +static int ra_data_block(struct inode *inode, pgoff_t index) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct dnode_of_data dn; + struct page *page; + struct extent_info ei = {0, 0, 0}; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_READ, + .op_flags = 0, + .encrypted_page = NULL, + .in_list = 
false, + .retry = false, + }; + int err; + + page = f2fs_grab_cache_page(mapping, index, true); + if (!page) + return -ENOMEM; + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + goto got_it; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) + goto put_page; + f2fs_put_dnode(&dn); + + if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, + DATA_GENERIC))) { + err = -EFAULT; + goto put_page; + } +got_it: + /* read page */ + fio.page = page; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi), + dn.data_blkaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + goto put_page; + } + + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_encrypted_page; + f2fs_put_page(fio.encrypted_page, 0); + f2fs_put_page(page, 1); + return 0; +put_encrypted_page: + f2fs_put_page(fio.encrypted_page, 1); +put_page: + f2fs_put_page(page, 1); + return err; +} + /* * Move data block via META_MAPPING while keeping locked data page. * This can be used to move blocks, aka LBAs, directly on disk. 
@@ -620,7 +686,7 @@ static void move_data_block(struct inode *inode, block_t bidx, struct dnode_of_data dn; struct f2fs_summary sum; struct node_info ni; - struct page *page; + struct page *page, *mpage; block_t newaddr; int err; bool lfs_mode = test_opt(fio.sbi, LFS); @@ -683,6 +749,23 @@ static void move_data_block(struct inode *inode, block_t bidx, goto recover_block; } + mpage = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), + fio.old_blkaddr, FGP_LOCK, GFP_NOFS); + if (mpage) { + bool updated = false; + + if (PageUptodate(mpage)) { + memcpy(page_address(fio.encrypted_page), + page_address(mpage), PAGE_SIZE); + updated = true; + } + f2fs_put_page(mpage, 1); + invalidate_mapping_pages(META_MAPPING(fio.sbi), + fio.old_blkaddr, fio.old_blkaddr); + if (updated) + goto write_page; + } + err = f2fs_submit_page_bio(&fio); if (err) goto put_page_out; @@ -699,6 +782,7 @@ static void move_data_block(struct inode *inode, block_t bidx, goto put_page_out; } +write_page: set_page_dirty(fio.encrypted_page); f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true); if (clear_page_dirty_for_io(fio.encrypted_page)) @@ -873,12 +957,6 @@ next_step: if (IS_ERR(inode) || is_bad_inode(inode)) continue; - /* if inode uses special I/O path, let's go phase 3 */ - if (f2fs_post_read_required(inode)) { - add_gc_inode(gc_list, inode); - continue; - } - if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); @@ -886,10 +964,23 @@ next_step: continue; } - start_bidx = f2fs_start_bidx_of_node(nofs, inode); + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + + ofs_in_node; + + if (f2fs_post_read_required(inode)) { + int err = ra_data_block(inode, start_bidx); + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (err) { + iput(inode); + continue; + } + add_gc_inode(gc_list, inode); + continue; + } + data_page = f2fs_get_read_data_page(inode, - start_bidx + ofs_in_node, REQ_RAHEAD, - true); + start_bidx, REQ_RAHEAD, true); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if 
(IS_ERR(data_page)) { iput(inode); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7dcfe38e70cc..30779aaa9dba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2079,6 +2079,8 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) if (addr == NEW_ADDR) return; + invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); + /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); @@ -2978,6 +2980,9 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio, true); + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) + invalidate_mapping_pages(META_MAPPING(fio->sbi), + fio->old_blkaddr, fio->old_blkaddr); /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); @@ -3132,8 +3137,11 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!recover_curseg || recover_newaddr) update_sit_entry(sbi, new_blkaddr, 1); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { + invalidate_mapping_pages(META_MAPPING(sbi), + old_blkaddr, old_blkaddr); update_sit_entry(sbi, old_blkaddr, -1); + } locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); -- cgit v1.2.3-59-g8ed1b