diff options
Diffstat (limited to 'fs/f2fs')
-rw-r--r-- | fs/f2fs/Kconfig | 60 | ||||
-rw-r--r-- | fs/f2fs/Makefile | 2 | ||||
-rw-r--r-- | fs/f2fs/acl.c | 51 | ||||
-rw-r--r-- | fs/f2fs/acl.h | 7 | ||||
-rw-r--r-- | fs/f2fs/checkpoint.c | 518 | ||||
-rw-r--r-- | fs/f2fs/compress.c | 1457 | ||||
-rw-r--r-- | fs/f2fs/data.c | 2140 | ||||
-rw-r--r-- | fs/f2fs/debug.c | 215 | ||||
-rw-r--r-- | fs/f2fs/dir.c | 542 | ||||
-rw-r--r-- | fs/f2fs/extent_cache.c | 108 | ||||
-rw-r--r-- | fs/f2fs/f2fs.h | 1676 | ||||
-rw-r--r-- | fs/f2fs/file.c | 2627 | ||||
-rw-r--r-- | fs/f2fs/gc.c | 1009 | ||||
-rw-r--r-- | fs/f2fs/gc.h | 98 | ||||
-rw-r--r-- | fs/f2fs/hash.c | 89 | ||||
-rw-r--r-- | fs/f2fs/inline.c | 151 | ||||
-rw-r--r-- | fs/f2fs/inode.c | 183 | ||||
-rw-r--r-- | fs/f2fs/iostat.c | 320 | ||||
-rw-r--r-- | fs/f2fs/iostat.h | 84 | ||||
-rw-r--r-- | fs/f2fs/namei.c | 304 | ||||
-rw-r--r-- | fs/f2fs/node.c | 591 | ||||
-rw-r--r-- | fs/f2fs/node.h | 52 | ||||
-rw-r--r-- | fs/f2fs/recovery.c | 208 | ||||
-rw-r--r-- | fs/f2fs/segment.c | 1778 | ||||
-rw-r--r-- | fs/f2fs/segment.h | 167 | ||||
-rw-r--r-- | fs/f2fs/shrinker.c | 6 | ||||
-rw-r--r-- | fs/f2fs/super.c | 1699 | ||||
-rw-r--r-- | fs/f2fs/sysfs.c | 785 | ||||
-rw-r--r-- | fs/f2fs/trace.c | 165 | ||||
-rw-r--r-- | fs/f2fs/trace.h | 43 | ||||
-rw-r--r-- | fs/f2fs/verity.c | 142 | ||||
-rw-r--r-- | fs/f2fs/xattr.c | 168 | ||||
-rw-r--r-- | fs/f2fs/xattr.h | 17 |
33 files changed, 12063 insertions, 5399 deletions
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index f0faada30f30..03ef087537c7 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -7,6 +7,14 @@ config F2FS_FS select CRYPTO_CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION select FS_ENCRYPTION_ALGS if FS_ENCRYPTION + select FS_IOMAP + select LZ4_COMPRESS if F2FS_FS_LZ4 + select LZ4_DECOMPRESS if F2FS_FS_LZ4 + select LZ4HC_COMPRESS if F2FS_FS_LZ4HC + select LZO_COMPRESS if F2FS_FS_LZO + select LZO_DECOMPRESS if F2FS_FS_LZO + select ZSTD_COMPRESS if F2FS_FS_ZSTD + select ZSTD_DECOMPRESS if F2FS_FS_ZSTD help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. The design has been focused on @@ -76,16 +84,6 @@ config F2FS_CHECK_FS If you want to improve the performance, say N. -config F2FS_IO_TRACE - bool "F2FS IO tracer" - depends on F2FS_FS - depends on FUNCTION_TRACER - help - F2FS IO trace is based on a function trace, which gathers process - information and block IO patterns in the filesystem level. - - If unsure, say N. - config F2FS_FAULT_INJECTION bool "F2FS fault injection facility" depends on F2FS_FS @@ -104,17 +102,51 @@ config F2FS_FS_COMPRESSION config F2FS_FS_LZO bool "LZO compression support" depends on F2FS_FS_COMPRESSION - select LZO_COMPRESS - select LZO_DECOMPRESS default y help Support LZO compress algorithm, if unsure, say Y. +config F2FS_FS_LZORLE + bool "LZO-RLE compression support" + depends on F2FS_FS_LZO + default y + help + Support LZO-RLE compress algorithm, if unsure, say Y. + config F2FS_FS_LZ4 bool "LZ4 compression support" depends on F2FS_FS_COMPRESSION - select LZ4_COMPRESS - select LZ4_DECOMPRESS default y help Support LZ4 compress algorithm, if unsure, say Y. + +config F2FS_FS_LZ4HC + bool "LZ4HC compression support" + depends on F2FS_FS_LZ4 + default y + help + Support LZ4HC compress algorithm, LZ4HC has compatible on-disk + layout with LZ4, if unsure, say Y. 
+ +config F2FS_FS_ZSTD + bool "ZSTD compression support" + depends on F2FS_FS_COMPRESSION + default y + help + Support ZSTD compress algorithm, if unsure, say Y. + +config F2FS_IOSTAT + bool "F2FS IO statistics information" + depends on F2FS_FS + default y + help + Support getting IO statistics through sysfs and printing out periodic + IO statistics tracepoint events. You have to turn on "iostat_enable" + sysfs node to enable this feature. + +config F2FS_UNFAIR_RWSEM + bool "F2FS unfair rw_semaphore" + depends on F2FS_FS && BLK_CGROUP + help + Use unfair rw_semaphore, if system configured IO priority by block + cgroup. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ee7316b42f69..8a7322d229e4 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,6 +7,6 @@ f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o -f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o f2fs-$(CONFIG_FS_VERITY) += verity.o f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o +f2fs-$(CONFIG_F2FS_IOSTAT) += iostat.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 217b290ae3a5..5bbc44a5216e 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -29,6 +29,7 @@ static inline size_t f2fs_acl_size(int count) static inline int f2fs_acl_count(size_t size) { ssize_t s; + size -= sizeof(struct f2fs_acl_header); s = size - 4 * sizeof(struct f2fs_acl_entry_short); if (s < 0) { @@ -160,7 +161,7 @@ static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, return (void *)f2fs_acl; fail: - kvfree(f2fs_acl); + kfree(f2fs_acl); return ERR_PTR(-EINVAL); } @@ -190,17 +191,43 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, acl = NULL; else acl = ERR_PTR(retval); - kvfree(value); + kfree(value); return acl; } -struct posix_acl *f2fs_get_acl(struct inode *inode, int type) +struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu) { + if (rcu) + return ERR_PTR(-ECHILD); + 
return __f2fs_get_acl(inode, type, NULL); } -static int __f2fs_set_acl(struct inode *inode, int type, +static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, + struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) + mode = F2FS_I(inode)->i_acl_mode; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} + +static int __f2fs_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { int name_index; @@ -213,7 +240,8 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = f2fs_acl_update_mode(mnt_userns, inode, + &mode, &acl); if (error) return error; set_acl_inode(inode, mode); @@ -240,7 +268,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); - kvfree(value); + kfree(value); if (!error) set_cached_acl(inode, type, acl); @@ -248,12 +276,13 @@ static int __f2fs_set_acl(struct inode *inode, int type, return error; } -int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; - return __f2fs_set_acl(inode, type, acl, NULL); + return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL); } /* @@ -384,7 +413,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, struct page *dpage) { struct posix_acl *default_acl = NULL, *acl = 
NULL; - int error = 0; + int error; error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage); if (error) @@ -393,7 +422,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { - error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl, ipage); posix_acl_release(default_acl); } else { @@ -401,7 +430,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, } if (acl) { if (!error) - error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl, ipage); posix_acl_release(acl); } else { diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index b96823c59b15..a26e33cab4ff 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/acl.h * @@ -33,8 +33,9 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL -extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); +extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); +extern int f2fs_set_acl(struct user_namespace *, struct inode *, + struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 44e84ac5c941..0c82dae082aa 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -13,22 +13,29 @@ #include <linux/f2fs_fs.h> #include <linux/pagevec.h> #include <linux/swap.h> +#include <linux/kthread.h> #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" +#include "iostat.h" #include <trace/events/f2fs.h> +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; 
-void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason) { f2fs_build_fault_attr(sbi, 0, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); - if (!end_io) + if (!end_io) { f2fs_flush_merged_writes(sbi); + + f2fs_handle_stop(sbi, reason); + } } /* @@ -37,7 +44,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { struct address_space *mapping = META_MAPPING(sbi); - struct page *page = NULL; + struct page *page; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { @@ -50,9 +57,6 @@ repeat: return page; } -/* - * We guarantee no failure on the returned page. - */ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, bool is_meta) { @@ -89,6 +93,8 @@ repeat: return ERR_PTR(err); } + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); + lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); @@ -96,6 +102,7 @@ repeat: } if (unlikely(!PageUptodate(page))) { + f2fs_handle_page_eio(sbi, page->index, META); f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -108,7 +115,7 @@ struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) return __get_meta_page(sbi, index, true); } -struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index) { struct page *page; int count = 0; @@ -119,7 +126,7 @@ retry: if (PTR_ERR(page) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); } return page; } @@ -137,7 +144,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int segno, offset; bool exist; - if (type != DATA_GENERIC_ENHANCE && type != DATA_GENERIC_ENHANCE_READ) + if (type == 
DATA_GENERIC) return true; segno = GET_SEGNO(sbi, blkaddr); @@ -145,11 +152,18 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, se = get_seg_entry(sbi, segno); exist = f2fs_test_bit(offset, se->cur_valid_map); + if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) { + f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", + blkaddr, exist); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return exist; + } + if (!exist && type == DATA_GENERIC_ENHANCE) { f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); } return exist; } @@ -182,12 +196,13 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, case DATA_GENERIC: case DATA_GENERIC_ENHANCE: case DATA_GENERIC_ENHANCE_READ: + case DATA_GENERIC_ENHANCE_UPDATE: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) { f2fs_warn(sbi, "access invalid blkaddr:%u", blkaddr); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); return false; } else { return __is_bitmap_valid(sbi, blkaddr, type); @@ -206,7 +221,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, } /* - * Readahead CP/NAT/SIT/SSA pages + * Readahead CP/NAT/SIT/SSA/POR pages */ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) @@ -223,6 +238,7 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .is_por = (type == META_POR), }; struct blk_plug plug; + int err; if (unlikely(type == META_POR)) fio.op_flags &= ~REQ_META; @@ -243,6 +259,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blkno * NAT_ENTRY_PER_BLOCK); break; case META_SIT: + if (unlikely(blkno >= TOTAL_SEGS(sbi))) + goto out; /* get sit block addr */ fio.new_blkaddr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); @@ -266,26 +284,34 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; - 
f2fs_submit_page_bio(&fio); - f2fs_put_page(page, 0); + err = f2fs_submit_page_bio(&fio); + f2fs_put_page(page, err ? 1 : 0); + + if (!err) + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, + F2FS_BLKSIZE); } out: blk_finish_plug(&plug); return blkno - start; } -void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks) { struct page *page; bool readahead = false; + if (ra_blocks == RECOVERY_MIN_RA_BLOCKS) + return; + page = find_get_page(META_MAPPING(sbi), index); if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); if (readahead) - f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); + f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true); } static int __f2fs_write_meta_page(struct page *page, @@ -343,13 +369,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* if locked failed, cp will flush dirty pages instead */ - if (!mutex_trylock(&sbi->cp_mutex)) + if (!f2fs_down_write_trylock(&sbi->cp_global_sem)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); - mutex_unlock(&sbi->cp_mutex); + f2fs_up_write(&sbi->cp_global_sem); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -428,47 +454,57 @@ stop: return nwritten; } -static int f2fs_set_meta_page_dirty(struct page *page) +static bool f2fs_dirty_meta_folio(struct address_space *mapping, + struct folio *folio) { - trace_f2fs_set_page_dirty(page, META); + trace_f2fs_set_page_dirty(&folio->page, META); - if (!PageUptodate(page)) - SetPageUptodate(page); - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); - inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); - f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); - return 1; + if (!folio_test_uptodate(folio)) + 
folio_mark_uptodate(folio); + if (filemap_dirty_folio(mapping, folio)) { + inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); + set_page_private_reference(&folio->page); + return true; } - return 0; + return false; } const struct address_space_operations f2fs_meta_aops = { .writepage = f2fs_write_meta_page, .writepages = f2fs_write_meta_pages, - .set_page_dirty = f2fs_set_meta_page_dirty, - .invalidatepage = f2fs_invalidate_page, - .releasepage = f2fs_release_page, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif + .dirty_folio = f2fs_dirty_meta_folio, + .invalidate_folio = f2fs_invalidate_folio, + .release_folio = f2fs_release_folio, + .migrate_folio = filemap_migrate_folio, }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; - struct ino_entry *e, *tmp; + struct ino_entry *e = NULL, *new = NULL; + + if (type == FLUSH_INO) { + rcu_read_lock(); + e = radix_tree_lookup(&im->ino_root, ino); + rcu_read_unlock(); + } - tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); +retry: + if (!e) + new = f2fs_kmem_cache_alloc(ino_entry_slab, + GFP_NOFS, true, NULL); radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (!e) { - e = tmp; + if (!new) { + spin_unlock(&im->ino_lock); + goto retry; + } + e = new; if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) f2fs_bug_on(sbi, 1); @@ -486,8 +522,8 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, spin_unlock(&im->ino_lock); radix_tree_preload_end(); - if (e != tmp) - kmem_cache_free(ino_entry_slab, tmp); + if (new && e != new) + kmem_cache_free(ino_entry_slab, new); } static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -520,7 +556,7 @@ void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) __remove_ino_entry(sbi, ino, type); } -/* mode should be APPEND_INO or UPDATE_INO */ +/* 
mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO */ bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) { struct inode_management *im = &sbi->im[mode]; @@ -633,7 +669,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return PTR_ERR(inode); } - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) { iput(inode); goto err_out; @@ -644,7 +680,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); - err = f2fs_get_node_info(sbi, ino, &ni); + err = f2fs_get_node_info(sbi, ino, &ni, false); if (err) goto err_out; @@ -685,9 +721,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; - /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. @@ -713,6 +746,7 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); + err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); @@ -846,6 +880,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, struct page *cp_page_1 = NULL, *cp_page_2 = NULL; struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; + unsigned int cp_blocks; int err; err = get_checkpoint_version(sbi, cp_addr, &cp_block, @@ -853,15 +888,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (err) return NULL; - if (le32_to_cpu(cp_block->cp_pack_total_block_count) > - sbi->blocks_per_seg) { + cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count); + + if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) { f2fs_warn(sbi, "invalid 
cp_pack_total_block_count:%u", le32_to_cpu(cp_block->cp_pack_total_block_count)); goto invalid_cp; } pre_version = *version; - cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; + cp_addr += cp_blocks - 1; err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_2, version); if (err) @@ -892,13 +928,13 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) int i; int err; - sbi->ckpt = f2fs_kzalloc(sbi, array_size(blk_size, cp_blks), - GFP_KERNEL); + sbi->ckpt = f2fs_kvzalloc(sbi, array_size(blk_size, cp_blks), + GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* * Finding out valid cp block involves read both - * sets( cp pack1 and cp pack 2) + * sets( cp pack 1 and cp pack 2) */ cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr); cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); @@ -978,9 +1014,7 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - if (!f2fs_is_volatile_file(inode)) - list_add_tail(&F2FS_I(inode)->dirty_list, - &sbi->inode_list[type]); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } @@ -996,7 +1030,7 @@ static void __remove_dirty_inode(struct inode *inode, enum inode_type type) stat_dec_dirty_inode(F2FS_I_SB(inode), type); } -void f2fs_update_dirty_page(struct inode *inode, struct page *page) +void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? 
DIR_INODE : FILE_INODE; @@ -1011,8 +1045,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); + set_page_private_reference(&folio->page); } void f2fs_remove_dirty_inode(struct inode *inode) @@ -1032,7 +1065,8 @@ void f2fs_remove_dirty_inode(struct inode *inode) spin_unlock(&sbi->inode_lock[type]); } -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp) { struct list_head *head; struct inode *inode; @@ -1044,8 +1078,12 @@ int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) get_pages(sbi, is_dir ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return -EIO; + } spin_lock(&sbi->inode_lock[type]); @@ -1063,11 +1101,15 @@ retry: if (inode) { unsigned long cur_ino = inode->i_ino; - F2FS_I(inode)->cp_task = current; + if (from_cp) + F2FS_I(inode)->cp_task = current; + F2FS_I(inode)->wb_task = current; filemap_fdatawrite(inode->i_mapping); - F2FS_I(inode)->cp_task = NULL; + F2FS_I(inode)->wb_task = NULL; + if (from_cp) + F2FS_I(inode)->cp_task = NULL; iput(inode); /* We need to give cpu to another writers. 
*/ @@ -1138,7 +1180,8 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi) if (!is_journalled_quota(sbi)) return false; - down_write(&sbi->quota_sem); + if (!f2fs_down_write_trylock(&sbi->quota_sem)) + return true; if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) { ret = false; } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) { @@ -1149,7 +1192,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi) } else if (get_pages(sbi, F2FS_DIRTY_QDATA)) { ret = true; } - up_write(&sbi->quota_sem); + f2fs_up_write(&sbi->quota_sem); return ret; } @@ -1163,10 +1206,12 @@ static int block_operations(struct f2fs_sb_info *sbi) .nr_to_write = LONG_MAX, .for_reclaim = 0, }; - struct blk_plug plug; int err = 0, cnt = 0; - blk_start_plug(&plug); + /* + * Let's flush inline_data in dirty node pages. + */ + f2fs_flush_inline_data(sbi); retry_flush_quotas: f2fs_lock_all(sbi); @@ -1193,9 +1238,9 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - err = f2fs_sync_dirty_inodes(sbi, DIR_INODE); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true); if (err) - goto out; + return err; cond_resched(); goto retry_flush_quotas; } @@ -1204,30 +1249,30 @@ retry_flush_dents: * POR: we should ensure that there are no dirty node pages * until finishing nat/sit flush. inode->i_blocks can be updated. 
*/ - down_write(&sbi->node_change); + f2fs_down_write(&sbi->node_change); if (get_pages(sbi, F2FS_DIRTY_IMETA)) { - up_write(&sbi->node_change); + f2fs_up_write(&sbi->node_change); f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) - goto out; + return err; cond_resched(); goto retry_flush_quotas; } retry_flush_nodes: - down_write(&sbi->node_write); + f2fs_down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - up_write(&sbi->node_write); + f2fs_up_write(&sbi->node_write); atomic_inc(&sbi->wb_sync_req[NODE]); err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); atomic_dec(&sbi->wb_sync_req[NODE]); if (err) { - up_write(&sbi->node_change); + f2fs_up_write(&sbi->node_change); f2fs_unlock_all(sbi); - goto out; + return err; } cond_resched(); goto retry_flush_nodes; @@ -1238,32 +1283,35 @@ retry_flush_nodes: * dirty node blocks and some checkpoint values by block allocation. */ __prepare_cp_block(sbi); - up_write(&sbi->node_change); -out: - blk_finish_plug(&plug); + f2fs_up_write(&sbi->node_change); return err; } static void unblock_operations(struct f2fs_sb_info *sbi) { - up_write(&sbi->node_write); + f2fs_up_write(&sbi->node_write); f2fs_unlock_all(sbi); } -void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) +void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) { DEFINE_WAIT(wait); for (;;) { - prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - - if (!get_pages(sbi, F2FS_WB_CP_DATA)) + if (!get_pages(sbi, type)) break; if (unlikely(f2fs_cp_error(sbi))) break; - io_schedule_timeout(5*HZ); + if (type == F2FS_DIRTY_META) + f2fs_sync_meta_pages(sbi, META, LONG_MAX, + FS_CP_META_IO); + else if (type == F2FS_WB_CP_DATA) + f2fs_submit_merged_write(sbi, DATA); + + prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); } finish_wait(&sbi->cp_wait, &wait); } @@ -1274,12 +1322,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control 
*cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long flags; - spin_lock_irqsave(&sbi->cp_lock, flags); + if (cpc->reason & CP_UMOUNT) { + if (le32_to_cpu(ckpt->cp_pack_total_block_count) + + NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) { + clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Disable nat_bits due to no space"); + } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && + f2fs_nat_bitmap_enabled(sbi)) { + f2fs_enable_nat_bits(sbi); + set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Rebuild and enable nat_bits"); + } + } - if ((cpc->reason & CP_UMOUNT) && - le32_to_cpu(ckpt->cp_pack_total_block_count) > - sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) - disable_nat_bits(sbi, false); + spin_lock_irqsave(&sbi->cp_lock, flags); if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); @@ -1301,10 +1357,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) else __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || - is_sbi_flag_set(sbi, SBI_IS_RESIZEFS)) + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS)) + __set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); + else + __clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) __set_ckpt_flags(ckpt, CP_DISABLED_FLAG); else @@ -1367,6 +1427,26 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, f2fs_submit_merged_write(sbi, META_FLUSH); } +static inline u64 get_sectors_written(struct block_device *bdev) +{ + return (u64)part_stat_read(bdev, sectors[STAT_WRITE]); +} + +u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) +{ + if (f2fs_is_multi_device(sbi)) { + u64 sectors = 0; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + sectors += get_sectors_written(FDEV(i).bdev); + + return sectors; + } + + return get_sectors_written(sbi->sb->s_bdev); +} + static int do_checkpoint(struct 
f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1377,20 +1457,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); - struct super_block *sb = sbi->sb; struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; int err; /* Flush all the NAT/SIT pages */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); - f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) && - !f2fs_cp_error(sbi)); - /* - * modify checkpoint - * version number is already updated - */ + /* start to update checkpoint, cp ver is already updated previously */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { @@ -1410,7 +1484,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - /* 2 cp + n data seg summary + orphan inode blocks */ + /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false); spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) @@ -1424,7 +1498,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) orphan_blocks); if (__remain_node_summaries(cpc->reason)) - ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + cp_payload_blks + data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); else @@ -1447,7 +1521,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); /* write nat bits */ - if (enabled_nat_bits(sbi, cpc)) { + if ((cpc->reason & CP_UMOUNT) && + is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) { __u64 cp_ver = cur_cp_version(ckpt); block_t blk; @@ -1477,9 +1552,8 @@ static int do_checkpoint(struct 
f2fs_sb_info *sbi, struct cp_control *cpc) /* Record write statistics in the hot node summary */ kbytes_written = sbi->kbytes_written; - if (sb->s_bdev->bd_part) - kbytes_written += BD_PART_WRITTEN(sbi); - + kbytes_written += (f2fs_get_sectors_written(sbi) - + sbi->sectors_written_start) >> 1; seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); if (__remain_node_summaries(cpc->reason)) { @@ -1490,14 +1564,15 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; percpu_counter_set(&sbi->alloc_valid_block_count, 0); + percpu_counter_set(&sbi->rf_node_block_count, 0); /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); - f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) && - !f2fs_cp_error(sbi)); + /* Wait for all dirty meta pages to be submitted for IO */ + f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META); /* wait for previous submitted meta pages writeback */ - f2fs_wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); /* flush all device cache */ err = f2fs_flush_device_cache(sbi); @@ -1506,13 +1581,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* barrier and flush checkpoint cp pack 2 page if it can */ commit_checkpoint(sbi, ckpt, start_blk); - f2fs_wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); /* * invalidate intermediate page cache borrowed from meta inode which are - * used for migration of encrypted or verity inode's blocks. + * used for migration of encrypted, verity or compressed inode's blocks. 
*/ - if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi)) + if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) || + f2fs_sb_has_compression(sbi)) invalidate_mapping_pages(META_MAPPING(sbi), MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); @@ -1543,9 +1619,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0; } -/* - * We guarantee that this checkpoint procedure will not fail. - */ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1560,7 +1633,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) return 0; f2fs_warn(sbi, "Start checkpoint disabled!"); } - mutex_lock(&sbi->cp_mutex); + if (cpc->reason != CP_RESIZE) + f2fs_down_write(&sbi->cp_global_sem); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || @@ -1588,7 +1662,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) goto out; } - if (NM_I(sbi)->dirty_nat_cnt == 0 && + if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { f2fs_flush_sit_entries(sbi, cpc); @@ -1608,17 +1682,27 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write cached NAT/SIT entries to NAT/SIT area */ err = f2fs_flush_nat_entries(sbi, cpc); - if (err) + if (err) { + f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); goto stop; + } f2fs_flush_sit_entries(sbi, cpc); - /* unlock all the fs_lock[] in do_checkpoint() */ + /* save inmem log status */ + f2fs_save_inmem_curseg(sbi); + err = do_checkpoint(sbi, cpc); - if (err) + if (err) { + f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); f2fs_release_discard_addrs(sbi); - else + } else { f2fs_clear_prefree_segments(sbi, cpc); + } + + 
f2fs_restore_inmem_curseg(sbi); stop: unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1626,11 +1710,12 @@ stop: if (cpc->reason & CP_RECOVERY) f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); - /* do checkpoint periodically */ + /* update CP_TIME to trigger checkpoint periodically */ f2fs_update_time(sbi, CP_TIME); trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: - mutex_unlock(&sbi->cp_mutex); + if (cpc->reason != CP_RESIZE) + f2fs_up_write(&sbi->cp_global_sem); return err; } @@ -1648,7 +1733,7 @@ void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) } sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - - NR_CURSEG_TYPE - __cp_payload(sbi)) * + NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) * F2FS_ORPHANS_PER_BLOCK; } @@ -1672,3 +1757,190 @@ void f2fs_destroy_checkpoint_caches(void) kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(f2fs_inode_entry_slab); } + +static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) +{ + struct cp_control cpc = { .reason = CP_SYNC, }; + int err; + + f2fs_down_write(&sbi->gc_lock); + err = f2fs_write_checkpoint(sbi, &cpc); + f2fs_up_write(&sbi->gc_lock); + + return err; +} + +static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req *req, *next; + struct llist_node *dispatch_list; + u64 sum_diff = 0, diff, count = 0; + int ret; + + dispatch_list = llist_del_all(&cprc->issue_list); + if (!dispatch_list) + return; + dispatch_list = llist_reverse_order(dispatch_list); + + ret = __write_checkpoint_sync(sbi); + atomic_inc(&cprc->issued_ckpt); + + llist_for_each_entry_safe(req, next, dispatch_list, llnode) { + diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); + req->ret = ret; + complete(&req->wait); + + sum_diff += diff; + count++; + } + atomic_sub(count, &cprc->queued_ckpt); + atomic_add(count, &cprc->total_ckpt); + + spin_lock(&cprc->stat_lock); + cprc->cur_time = (unsigned 
int)div64_u64(sum_diff, count); + if (cprc->peak_time < cprc->cur_time) + cprc->peak_time = cprc->cur_time; + spin_unlock(&cprc->stat_lock); +} + +static int issue_checkpoint_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct ckpt_req_control *cprc = &sbi->cprc_info; + wait_queue_head_t *q = &cprc->ckpt_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + if (!llist_empty(&cprc->issue_list)) + __checkpoint_and_complete_reqs(sbi); + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&cprc->issue_list)); + goto repeat; +} + +static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi, + struct ckpt_req *wait_req) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (!llist_empty(&cprc->issue_list)) { + __checkpoint_and_complete_reqs(sbi); + } else { + /* already dispatched by issue_checkpoint_thread */ + if (wait_req) + wait_for_completion(&wait_req->wait); + } +} + +static void init_ckpt_req(struct ckpt_req *req) +{ + memset(req, 0, sizeof(struct ckpt_req)); + + init_completion(&req->wait); + req->queue_time = ktime_get(); +} + +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req req; + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + int ret; + + f2fs_down_write(&sbi->gc_lock); + ret = f2fs_write_checkpoint(sbi, &cpc); + f2fs_up_write(&sbi->gc_lock); + + return ret; + } + + if (!cprc->f2fs_issue_ckpt) + return __write_checkpoint_sync(sbi); + + init_ckpt_req(&req); + + llist_add(&req.llnode, &cprc->issue_list); + atomic_inc(&cprc->queued_ckpt); + + /* + * update issue_list before we wake up issue_checkpoint thread, + * this smp_mb() pairs with another barrier in ___wait_event(), + * see more details in comments of waitqueue_active(). 
+ */ + smp_mb(); + + if (waitqueue_active(&cprc->ckpt_wait_queue)) + wake_up(&cprc->ckpt_wait_queue); + + if (cprc->f2fs_issue_ckpt) + wait_for_completion(&req.wait); + else + flush_remained_ckpt_reqs(sbi, &req); + + return req.ret; +} + +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) + return 0; + + cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, + "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(cprc->f2fs_issue_ckpt)) { + cprc->f2fs_issue_ckpt = NULL; + return -ENOMEM; + } + + set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); + + return 0; +} + +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct task_struct *ckpt_task; + + if (!cprc->f2fs_issue_ckpt) + return; + + ckpt_task = cprc->f2fs_issue_ckpt; + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); + + f2fs_flush_ckpt_thread(sbi); +} + +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + flush_remained_ckpt_reqs(sbi, NULL); + + /* Let's wait for the previous dispatched checkpoint. 
*/ + while (atomic_read(&cprc->queued_ckpt)) + io_schedule_timeout(DEFAULT_IO_TIMEOUT); +} + +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + atomic_set(&cprc->issued_ckpt, 0); + atomic_set(&cprc->total_ckpt, 0); + atomic_set(&cprc->queued_ckpt, 0); + cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO; + init_waitqueue_head(&cprc->ckpt_wait_queue); + init_llist_head(&cprc->issue_list); + spin_lock_init(&cprc->stat_lock); +} diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d8a64be90a50..d315c2de136f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -7,19 +7,53 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/moduleparam.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/lzo.h> #include <linux/lz4.h> +#include <linux/zstd.h> +#include <linux/pagevec.h> #include "f2fs.h" #include "node.h" +#include "segment.h" #include <trace/events/f2fs.h> +static struct kmem_cache *cic_entry_slab; +static struct kmem_cache *dic_entry_slab; + +static void *page_array_alloc(struct inode *inode, int nr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int size = sizeof(struct page *) * nr; + + if (likely(size <= sbi->page_array_slab_size)) + return f2fs_kmem_cache_alloc(sbi->page_array_slab, + GFP_F2FS_ZERO, false, F2FS_I_SB(inode)); + return f2fs_kzalloc(sbi, size, GFP_NOFS); +} + +static void page_array_free(struct inode *inode, void *pages, int nr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int size = sizeof(struct page *) * nr; + + if (!pages) + return; + + if (likely(size <= sbi->page_array_slab_size)) + kmem_cache_free(sbi->page_array_slab, pages); + else + kfree(pages); +} + struct f2fs_compress_ops { int (*init_compress_ctx)(struct compress_ctx *cc); void (*destroy_compress_ctx)(struct compress_ctx *cc); int (*compress_pages)(struct compress_ctx *cc); + int (*init_decompress_ctx)(struct decompress_io_ctx *dic); + 
void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic); int (*decompress_pages)(struct decompress_io_ctx *dic); }; @@ -44,33 +78,22 @@ bool f2fs_is_compressed_page(struct page *page) return false; if (!page_private(page)) return false; - if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) + if (page_private_nonpointer(page)) return false; + f2fs_bug_on(F2FS_M_SB(page->mapping), *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); return true; } static void f2fs_set_compressed_page(struct page *page, - struct inode *inode, pgoff_t index, void *data, refcount_t *r) + struct inode *inode, pgoff_t index, void *data) { - SetPagePrivate(page); - set_page_private(page, (unsigned long)data); + attach_page_private(page, (void *)data); /* i_crypto_info and iv index */ page->index = index; page->mapping = inode->i_mapping; - if (r) - refcount_inc(r); -} - -static void f2fs_put_compressed_page(struct page *page) -{ - set_page_private(page, (unsigned long)NULL); - ClearPagePrivate(page); - page->mapping = NULL; - unlock_page(page); - put_page(page); } static void f2fs_drop_rpages(struct compress_ctx *cc, int len, bool unlock) @@ -97,20 +120,6 @@ static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) f2fs_drop_rpages(cc, len, true); } -static void f2fs_put_rpages_mapping(struct compress_ctx *cc, - struct address_space *mapping, - pgoff_t start, int len) -{ - int i; - - for (i = 0; i < len; i++) { - struct page *page = find_get_page(mapping, start + i); - - put_page(page); - put_page(page); - } -} - static void f2fs_put_rpages_wbc(struct compress_ctx *cc, struct writeback_control *wbc, bool redirty, int unlock) { @@ -132,23 +141,22 @@ struct page *f2fs_compress_control_page(struct page *page) int f2fs_init_compress_ctx(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); - - if (cc->nr_rpages) + if (cc->rpages) return 0; - cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - cc->log_cluster_size, GFP_NOFS); + 
cc->rpages = page_array_alloc(cc->inode, cc->cluster_size); return cc->rpages ? 0 : -ENOMEM; } -void f2fs_destroy_compress_ctx(struct compress_ctx *cc) +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) { - kfree(cc->rpages); + page_array_free(cc->inode, cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; - cc->cluster_idx = NULL_CLUSTER; + cc->valid_nr_cpages = 0; + if (!reuse) + cc->cluster_idx = NULL_CLUSTER; } void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page) @@ -230,12 +238,23 @@ static const struct f2fs_compress_ops f2fs_lzo_ops = { #ifdef CONFIG_F2FS_FS_LZ4 static int lz4_init_compress_ctx(struct compress_ctx *cc) { - cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), - LZ4_MEM_COMPRESS, GFP_NOFS); + unsigned int size = LZ4_MEM_COMPRESS; + +#ifdef CONFIG_F2FS_FS_LZ4HC + if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET) + size = LZ4HC_MEM_COMPRESS; +#endif + + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS); if (!cc->private) return -ENOMEM; - cc->clen = LZ4_compressBound(PAGE_SIZE << cc->log_cluster_size); + /* + * we do not change cc->clen to LZ4_compressBound(inputsize) to + * adapt worst compress case, because lz4 compressor can handle + * output budget properly. 
+ */ + cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; return 0; } @@ -245,17 +264,39 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) cc->private = NULL; } +#ifdef CONFIG_F2FS_FS_LZ4HC +static int lz4hc_compress_pages(struct compress_ctx *cc) +{ + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; + int len; + + if (level) + len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, level, cc->private); + else + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) + return -EAGAIN; + + cc->clen = len; + return 0; +} +#endif + static int lz4_compress_pages(struct compress_ctx *cc) { int len; +#ifdef CONFIG_F2FS_FS_LZ4HC + return lz4hc_compress_pages(cc); +#endif len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); - if (!len) { - printk_ratelimited("%sF2FS-fs (%s): lz4 compress failed\n", - KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id); - return -EIO; - } + if (!len) + return -EAGAIN; + cc->clen = len; return 0; } @@ -273,10 +314,9 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic) } if (ret != PAGE_SIZE << dic->log_cluster_size) { - printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, " + printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, " "expected:%lu\n", KERN_ERR, - F2FS_I_SB(dic->inode)->sb->s_id, - dic->rlen, + F2FS_I_SB(dic->inode)->sb->s_id, ret, PAGE_SIZE << dic->log_cluster_size); return -EIO; } @@ -291,6 +331,203 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = { }; #endif +#ifdef CONFIG_F2FS_FS_ZSTD +#define F2FS_ZSTD_DEFAULT_CLEVEL 1 + +static int zstd_init_compress_ctx(struct compress_ctx *cc) +{ + zstd_parameters params; + zstd_cstream *stream; + void *workspace; + unsigned int workspace_size; + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; + + if (!level) + level = F2FS_ZSTD_DEFAULT_CLEVEL; + + params = 
zstd_get_params(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen); + workspace_size = zstd_cstream_workspace_bound(&params.cParams); + + workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), + workspace_size, GFP_NOFS); + if (!workspace) + return -ENOMEM; + + stream = zstd_init_cstream(&params, 0, workspace, workspace_size); + if (!stream) { + printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_cstream failed\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__); + kvfree(workspace); + return -EIO; + } + + cc->private = workspace; + cc->private2 = stream; + + cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; + return 0; +} + +static void zstd_destroy_compress_ctx(struct compress_ctx *cc) +{ + kvfree(cc->private); + cc->private = NULL; + cc->private2 = NULL; +} + +static int zstd_compress_pages(struct compress_ctx *cc) +{ + zstd_cstream *stream = cc->private2; + zstd_in_buffer inbuf; + zstd_out_buffer outbuf; + int src_size = cc->rlen; + int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE; + int ret; + + inbuf.pos = 0; + inbuf.src = cc->rbuf; + inbuf.size = src_size; + + outbuf.pos = 0; + outbuf.dst = cc->cbuf->cdata; + outbuf.size = dst_size; + + ret = zstd_compress_stream(stream, &outbuf, &inbuf); + if (zstd_is_error(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s zstd_compress_stream failed, ret: %d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__, zstd_get_error_code(ret)); + return -EIO; + } + + ret = zstd_end_stream(stream, &outbuf); + if (zstd_is_error(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s zstd_end_stream returned %d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__, zstd_get_error_code(ret)); + return -EIO; + } + + /* + * there is compressed data remained in intermediate buffer due to + * no more space in cbuf.cdata + */ + if (ret) + return -EAGAIN; + + cc->clen = outbuf.pos; + return 0; +} + +static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) +{ + zstd_dstream *stream; + void *workspace; + unsigned int workspace_size; + 
unsigned int max_window_size = + MAX_COMPRESS_WINDOW_SIZE(dic->log_cluster_size); + + workspace_size = zstd_dstream_workspace_bound(max_window_size); + + workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode), + workspace_size, GFP_NOFS); + if (!workspace) + return -ENOMEM; + + stream = zstd_init_dstream(max_window_size, workspace, workspace_size); + if (!stream) { + printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_dstream failed\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, + __func__); + kvfree(workspace); + return -EIO; + } + + dic->private = workspace; + dic->private2 = stream; + + return 0; +} + +static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic) +{ + kvfree(dic->private); + dic->private = NULL; + dic->private2 = NULL; +} + +static int zstd_decompress_pages(struct decompress_io_ctx *dic) +{ + zstd_dstream *stream = dic->private2; + zstd_in_buffer inbuf; + zstd_out_buffer outbuf; + int ret; + + inbuf.pos = 0; + inbuf.src = dic->cbuf->cdata; + inbuf.size = dic->clen; + + outbuf.pos = 0; + outbuf.dst = dic->rbuf; + outbuf.size = dic->rlen; + + ret = zstd_decompress_stream(stream, &outbuf, &inbuf); + if (zstd_is_error(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s zstd_decompress_stream failed, ret: %d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, + __func__, zstd_get_error_code(ret)); + return -EIO; + } + + if (dic->rlen != outbuf.pos) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, + __func__, dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + + return 0; +} + +static const struct f2fs_compress_ops f2fs_zstd_ops = { + .init_compress_ctx = zstd_init_compress_ctx, + .destroy_compress_ctx = zstd_destroy_compress_ctx, + .compress_pages = zstd_compress_pages, + .init_decompress_ctx = zstd_init_decompress_ctx, + .destroy_decompress_ctx = zstd_destroy_decompress_ctx, + .decompress_pages = zstd_decompress_pages, +}; +#endif + +#ifdef 
CONFIG_F2FS_FS_LZO +#ifdef CONFIG_F2FS_FS_LZORLE +static int lzorle_compress_pages(struct compress_ctx *cc) +{ + int ret; + + ret = lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, + &cc->clen, cc->private); + if (ret != LZO_E_OK) { + printk_ratelimited("%sF2FS-fs (%s): lzo-rle compress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lzorle_ops = { + .init_compress_ctx = lzo_init_compress_ctx, + .destroy_compress_ctx = lzo_destroy_compress_ctx, + .compress_pages = lzorle_compress_pages, + .decompress_pages = lzo_decompress_pages, +}; +#endif +#endif + static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { #ifdef CONFIG_F2FS_FS_LZO &f2fs_lzo_ops, @@ -302,6 +539,16 @@ static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { #else NULL, #endif +#ifdef CONFIG_F2FS_FS_ZSTD + &f2fs_zstd_ops, +#else + NULL, +#endif +#if defined(CONFIG_F2FS_FS_LZO) && defined(CONFIG_F2FS_FS_LZORLE) + &f2fs_lzorle_ops, +#else + NULL, +#endif }; bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -311,58 +558,105 @@ bool f2fs_is_compress_backend_ready(struct inode *inode) return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; } -static struct page *f2fs_grab_page(void) +static mempool_t *compress_page_pool; +static int num_compress_pages = 512; +module_param(num_compress_pages, uint, 0444); +MODULE_PARM_DESC(num_compress_pages, + "Number of intermediate compress pages to preallocate"); + +int f2fs_init_compress_mempool(void) +{ + compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); + if (!compress_page_pool) + return -ENOMEM; + + return 0; +} + +void f2fs_destroy_compress_mempool(void) +{ + mempool_destroy(compress_page_pool); +} + +static struct page *f2fs_compress_alloc_page(void) { struct page *page; - page = alloc_page(GFP_NOFS); - if (!page) - return NULL; + page = mempool_alloc(compress_page_pool, GFP_NOFS); lock_page(page); + 
return page; } +static void f2fs_compress_free_page(struct page *page) +{ + if (!page) + return; + detach_page_private(page); + page->mapping = NULL; + unlock_page(page); + mempool_free(page, compress_page_pool); +} + +#define MAX_VMAP_RETRIES 3 + +static void *f2fs_vmap(struct page **pages, unsigned int count) +{ + int i; + void *buf = NULL; + + for (i = 0; i < MAX_VMAP_RETRIES; i++) { + buf = vm_map_ram(pages, count, -1); + if (buf) + break; + vm_unmap_aliases(); + } + return buf; +} + static int f2fs_compress_pages(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct f2fs_inode_info *fi = F2FS_I(cc->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; - unsigned int max_len, nr_cpages; + unsigned int max_len, new_nr_cpages; + u32 chksum = 0; int i, ret; trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, cc->cluster_size, fi->i_compress_algorithm); - ret = cops->init_compress_ctx(cc); - if (ret) - goto out; + if (cops->init_compress_ctx) { + ret = cops->init_compress_ctx(cc); + if (ret) + goto out; + } max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); + cc->valid_nr_cpages = cc->nr_cpages; - cc->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - cc->nr_cpages, GFP_NOFS); + cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); if (!cc->cpages) { ret = -ENOMEM; goto destroy_compress_ctx; } for (i = 0; i < cc->nr_cpages; i++) { - cc->cpages[i] = f2fs_grab_page(); + cc->cpages[i] = f2fs_compress_alloc_page(); if (!cc->cpages[i]) { ret = -ENOMEM; goto out_free_cpages; } } - cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL_RO); + cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size); if (!cc->rbuf) { ret = -ENOMEM; goto out_free_cpages; } - cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL); + cc->cbuf = f2fs_vmap(cc->cpages, cc->nr_cpages); if (!cc->cbuf) { ret = -ENOMEM; goto out_vunmap_rbuf; @@ -380,83 +674,87 @@ static 
int f2fs_compress_pages(struct compress_ctx *cc) } cc->cbuf->clen = cpu_to_le32(cc->clen); - cc->cbuf->chksum = cpu_to_le32(0); + + if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM) + chksum = f2fs_crc32(F2FS_I_SB(cc->inode), + cc->cbuf->cdata, cc->clen); + cc->cbuf->chksum = cpu_to_le32(chksum); for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); - vunmap(cc->cbuf); - vunmap(cc->rbuf); + new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + + /* zero out any unused part of the last page */ + memset(&cc->cbuf->cdata[cc->clen], 0, + (new_nr_cpages * PAGE_SIZE) - + (cc->clen + COMPRESS_HEADER_SIZE)); - nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + vm_unmap_ram(cc->cbuf, cc->nr_cpages); + vm_unmap_ram(cc->rbuf, cc->cluster_size); - for (i = nr_cpages; i < cc->nr_cpages; i++) { - f2fs_put_compressed_page(cc->cpages[i]); + for (i = 0; i < cc->nr_cpages; i++) { + if (i < new_nr_cpages) + continue; + f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } - cc->nr_cpages = nr_cpages; + if (cops->destroy_compress_ctx) + cops->destroy_compress_ctx(cc); + + cc->valid_nr_cpages = new_nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); return 0; out_vunmap_cbuf: - vunmap(cc->cbuf); + vm_unmap_ram(cc->cbuf, cc->nr_cpages); out_vunmap_rbuf: - vunmap(cc->rbuf); + vm_unmap_ram(cc->rbuf, cc->cluster_size); out_free_cpages: for (i = 0; i < cc->nr_cpages; i++) { if (cc->cpages[i]) - f2fs_put_compressed_page(cc->cpages[i]); + f2fs_compress_free_page(cc->cpages[i]); } - kfree(cc->cpages); + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; destroy_compress_ctx: - cops->destroy_compress_ctx(cc); + if (cops->destroy_compress_ctx) + cops->destroy_compress_ctx(cc); out: trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); return ret; } -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) +static int 
f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, + bool pre_alloc); +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool pre_alloc); + +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); - struct f2fs_inode_info *fi= F2FS_I(dic->inode); + struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; + bool bypass_callback = false; int ret; - dec_page_count(sbi, F2FS_RD_DATA); - - if (bio->bi_status || PageError(page)) - dic->failed = true; - - if (refcount_dec_not_one(&dic->ref)) - return; - trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); - /* submit partial compressed pages */ if (dic->failed) { ret = -EIO; - goto out_free_dic; - } - - dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL); - if (!dic->rbuf) { - ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } - dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO); - if (!dic->cbuf) { - ret = -ENOMEM; - goto out_vunmap_rbuf; + ret = f2fs_prepare_decomp_mem(dic, false); + if (ret) { + bypass_callback = true; + goto out_release; } dic->clen = le32_to_cpu(dic->cbuf->clen); @@ -464,24 +762,60 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; - goto out_vunmap_cbuf; + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); + goto out_release; } ret = cops->decompress_pages(dic); -out_vunmap_cbuf: - vunmap(dic->cbuf); -out_vunmap_rbuf: - vunmap(dic->rbuf); -out_free_dic: - if (!verity) - f2fs_decompress_end_io(dic->rpages, dic->cluster_size, - ret, false); + if (!ret && (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)) { + u32 provided = 
le32_to_cpu(dic->cbuf->chksum); + u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); + + if (provided != calculated) { + if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { + set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); + printk_ratelimited( + "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x", + KERN_INFO, sbi->sb->s_id, dic->inode->i_ino, + provided, calculated); + } + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } +out_release: + f2fs_release_decomp_mem(dic, bypass_callback, false); + +out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - if (!verity) - f2fs_free_dic(dic); + f2fs_decompress_end_io(dic, ret, in_task); +} + +/* + * This is called when a page of a compressed cluster has been read from disk + * (or failed to be read from disk). It checks whether this page was the last + * page being waited on in the cluster, and if so, it decompresses the cluster + * (or in the case of a failure, cleans up without actually decompressing). 
+ */ +void f2fs_end_read_compressed_page(struct page *page, bool failed, + block_t blkaddr, bool in_task) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + + dec_page_count(sbi, F2FS_RD_DATA); + + if (failed) + WRITE_ONCE(dic->failed, true); + else if (blkaddr && in_task) + f2fs_cache_compressed_page(sbi, page, + dic->inode->i_ino, blkaddr); + + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic, in_task); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -508,9 +842,34 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) return is_page_in_cluster(cc, index); } -static bool __cluster_may_compress(struct compress_ctx *cc) +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, + int index, int nr_pages, bool uptodate) +{ + unsigned long pgidx = pages[index]->index; + int i = uptodate ? 0 : 1; + + /* + * when uptodate set to true, try to check all pages in cluster is + * uptodate or not. 
+ */ + if (uptodate && (pgidx % cc->cluster_size)) + return false; + + if (nr_pages - index < cc->cluster_size) + return false; + + for (; i < cc->cluster_size; i++) { + if (pages[index + i]->index != pgidx + i) + return false; + if (uptodate && !PageUptodate(pages[index + i])) + return false; + } + + return true; +} + +static bool cluster_has_invalid_data(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); loff_t i_size = i_size_read(cc->inode); unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); int i; @@ -518,76 +877,136 @@ static bool __cluster_may_compress(struct compress_ctx *cc) for (i = 0; i < cc->cluster_size; i++) { struct page *page = cc->rpages[i]; - f2fs_bug_on(sbi, !page); - - if (unlikely(f2fs_cp_error(sbi))) - return false; - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - return false; + f2fs_bug_on(F2FS_I_SB(cc->inode), !page); /* beyond EOF */ if (page->index >= nr_pages) - return false; + return true; + } + return false; +} + +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + bool compressed = dn->data_blkaddr == COMPRESS_ADDR; + int cluster_end = 0; + int i; + char *reason = ""; + + if (!compressed) + return false; + + /* [..., COMPR_ADDR, ...] 
*/ + if (dn->ofs_in_node % cluster_size) { + reason = "[*|C|*|*]"; + goto out; + } + + for (i = 1; i < cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + /* [COMPR_ADDR, ..., COMPR_ADDR] */ + if (blkaddr == COMPRESS_ADDR) { + reason = "[C|*|C|*]"; + goto out; + } + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; + } } + return false; +out: + f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", + dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); + set_sbi_flag(sbi, SBI_NEED_FSCK); return true; } -/* return # of compressed block addresses */ -static int f2fs_compressed_blocks(struct compress_ctx *cc) +static int __f2fs_cluster_blocks(struct inode *inode, + unsigned int cluster_idx, bool compr) { struct dnode_of_data dn; + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + unsigned int start_idx = cluster_idx << + F2FS_I(inode)->i_log_cluster_size; int ret; - set_new_dnode(&dn, cc->inode, NULL, NULL, 0); - ret = f2fs_get_dnode_of_data(&dn, start_idx_of_cluster(cc), - LOOKUP_NODE); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) { if (ret == -ENOENT) ret = 0; goto fail; } + if (f2fs_sanity_check_cluster(&dn)) { + ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER); + goto fail; + } + if (dn.data_blkaddr == COMPRESS_ADDR) { int i; ret = 1; - for (i = 1; i < cc->cluster_size; i++) { + for (i = 1; i < cluster_size; i++) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, + blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i); - if (blkaddr != NULL_ADDR) - ret++; + if (compr) { + if (__is_valid_data_blkaddr(blkaddr)) + ret++; + } else { + if (blkaddr != NULL_ADDR) + ret++; + } } + + 
f2fs_bug_on(F2FS_I_SB(inode), + !compr && ret != cluster_size && + !is_inode_flag_set(inode, FI_COMPRESS_RELEASED)); } fail: f2fs_put_dnode(&dn); return ret; } -int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) +/* return # of compressed blocks in compressed cluster */ +static int f2fs_compressed_blocks(struct compress_ctx *cc) { - struct compress_ctx cc = { - .inode = inode, - .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, - .cluster_size = F2FS_I(inode)->i_cluster_size, - .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, - }; + return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true); +} - return f2fs_compressed_blocks(&cc); +/* return # of valid blocks in compressed cluster */ +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) +{ + return __f2fs_cluster_blocks(inode, + index >> F2FS_I(inode)->i_log_cluster_size, + false); } static bool cluster_may_compress(struct compress_ctx *cc) { - if (!f2fs_compressed_file(cc->inode)) + if (!f2fs_need_compress_data(cc->inode)) return false; if (f2fs_is_atomic_file(cc->inode)) return false; - if (f2fs_is_mmap_file(cc->inode)) - return false; if (!f2fs_cluster_is_full(cc)) return false; - return __cluster_may_compress(cc); + if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode)))) + return false; + return !cluster_has_invalid_data(cc); } static void set_cluster_writeback(struct compress_ctx *cc) @@ -615,21 +1034,16 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct address_space *mapping = cc->inode->i_mapping; struct page *page; - struct dnode_of_data dn; sector_t last_block_in_bio; unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; pgoff_t start_idx = start_idx_of_cluster(cc); int i, ret; - bool prealloc; retry: - ret = f2fs_compressed_blocks(cc); + ret = f2fs_is_compressed_cluster(cc->inode, start_idx); if (ret <= 0) return ret; - /* compressed case */ - prealloc = (ret < cc->cluster_size); - ret = 
f2fs_init_compress_ctx(cc); if (ret) return ret; @@ -644,7 +1058,7 @@ retry: } if (PageUptodate(page)) - unlock_page(page); + f2fs_put_page(page, 1); else f2fs_compress_ctx_add_page(cc, page); } @@ -653,57 +1067,40 @@ retry: struct bio *bio = NULL; ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, - &last_block_in_bio, false); - f2fs_destroy_compress_ctx(cc); + &last_block_in_bio, false, true); + f2fs_put_rpages(cc); + f2fs_destroy_compress_ctx(cc, true); if (ret) - goto release_pages; + goto out; if (bio) f2fs_submit_bio(sbi, bio, DATA); ret = f2fs_init_compress_ctx(cc); if (ret) - goto release_pages; + goto out; } for (i = 0; i < cc->cluster_size; i++) { f2fs_bug_on(sbi, cc->rpages[i]); page = find_lock_page(mapping, start_idx + i); - f2fs_bug_on(sbi, !page); + if (!page) { + /* page can be truncated */ + goto release_and_retry; + } f2fs_wait_on_page_writeback(page, DATA, true, true); - f2fs_compress_ctx_add_page(cc, page); - f2fs_put_page(page, 0); if (!PageUptodate(page)) { +release_and_retry: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); - f2fs_put_rpages_mapping(cc, mapping, start_idx, - cc->cluster_size); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); goto retry; } } - if (prealloc) { - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); - - set_new_dnode(&dn, cc->inode, NULL, NULL, 0); - - for (i = cc->cluster_size - 1; i > 0; i--) { - ret = f2fs_get_block(&dn, start_idx + i); - if (ret) { - i = cc->cluster_size; - break; - } - - if (dn.data_blkaddr != NEW_ADDR) - break; - } - - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); - } - if (likely(!ret)) { *fsdata = cc->rpages; *pagep = cc->rpages[offset_in_cluster(cc, index)]; @@ -711,10 +1108,10 @@ retry: } unlock_pages: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i); -release_pages: - f2fs_put_rpages_mapping(cc, mapping, start_idx, i); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); +out: return ret; } @@ -738,6 +1135,7 @@ bool 
f2fs_compress_write_end(struct inode *inode, void *fsdata, { struct compress_ctx cc = { + .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .rpages = fsdata, @@ -748,11 +1146,60 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, set_cluster_dirty(&cc); f2fs_put_rpages_wbc(&cc, NULL, false, 1); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); return first_index; } +int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) +{ + void *fsdata = NULL; + struct page *pagep; + int log_cluster_size = F2FS_I(inode)->i_log_cluster_size; + pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) << + log_cluster_size; + int err; + + err = f2fs_is_compressed_cluster(inode, start_idx); + if (err < 0) + return err; + + /* truncate normal cluster */ + if (!err) + return f2fs_do_truncate_blocks(inode, from, lock); + + /* truncate compressed cluster */ + err = f2fs_prepare_compress_overwrite(inode, &pagep, + start_idx, &fsdata); + + /* should not be a normal cluster */ + f2fs_bug_on(F2FS_I_SB(inode), err == 0); + + if (err <= 0) + return err; + + if (err > 0) { + struct page **rpages = fsdata; + int cluster_size = F2FS_I(inode)->i_cluster_size; + int i; + + for (i = cluster_size - 1; i >= 0; i--) { + loff_t start = rpages[i]->index << PAGE_SHIFT; + + if (from <= start) { + zero_user_segment(rpages[i], 0, PAGE_SIZE); + } else { + zero_user_segment(rpages[i], from - start, + PAGE_SIZE); + break; + } + } + + f2fs_compress_write_end(inode, fsdata, start_idx, true); + } + return 0; +} + static int f2fs_write_compressed_pages(struct compress_ctx *cc, int *submitted, struct writeback_control *wbc, @@ -772,10 +1219,9 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, .encrypted_page = NULL, .compressed_page = NULL, .submitted = false, - .need_lock = LOCK_RETRY, .io_type = io_type, .io_wbc = wbc, - .encrypted = f2fs_encrypted_file(cc->inode), + 
.encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode), }; struct dnode_of_data dn; struct node_info ni; @@ -785,47 +1231,67 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, loff_t psize; int i, err; - set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(cc->rpages[0]->mapping, -EIO); + goto out_free; + } - f2fs_lock_op(sbi); + if (IS_NOQUOTA(inode)) { + /* + * We need to wait for node_write to avoid block allocation during + * checkpoint. This can only happen to quota writes which can cause + * the below discard race condition. + */ + f2fs_down_read(&sbi->node_write); + } else if (!f2fs_trylock_op(sbi)) { + goto out_free; + } + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (err) goto out_unlock_op; for (i = 0; i < cc->cluster_size; i++) { - if (datablock_addr(dn.inode, dn.node_page, + if (data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i) == NULL_ADDR) goto out_put_dnode; } psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto out_put_dnode; fio.version = ni.version; - cic = f2fs_kzalloc(sbi, sizeof(struct compress_io_ctx), GFP_NOFS); + cic = f2fs_kmem_cache_alloc(cic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!cic) goto out_put_dnode; cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; - refcount_set(&cic->ref, 1); - cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - cc->log_cluster_size, GFP_NOFS); + atomic_set(&cic->pending_pages, cc->valid_nr_cpages); + cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!cic->rpages) goto out_put_cic; cic->nr_rpages = cc->cluster_size; - for (i = 0; i < cc->nr_cpages; i++) { + for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_set_compressed_page(cc->cpages[i], 
inode, - cc->rpages[i + 1]->index, - cic, i ? &cic->ref : NULL); + cc->rpages[i + 1]->index, cic); fio.compressed_page = cc->cpages[i]; + + fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, fio.old_blkaddr); + if (fio.encrypted) { fio.page = cc->rpages[i + 1]; err = f2fs_encrypt_one_page(&fio); @@ -843,9 +1309,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, dn.node_page, - dn.ofs_in_node); - fio.page = cic->rpages[i]; + blkaddr = f2fs_data_blkaddr(&dn); + fio.page = cc->rpages[i]; fio.old_blkaddr = blkaddr; /* cluster header */ @@ -861,7 +1326,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) fio.compr_blocks++; - if (i > cc->nr_cpages) { + if (i > cc->valid_nr_cpages) { if (__is_valid_data_blkaddr(blkaddr)) { f2fs_invalidate_blocks(sbi, blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); @@ -886,40 +1351,51 @@ unlock_continue: if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); - f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); + f2fs_i_compr_blocks_update(inode, cc->valid_nr_cpages, true); + add_compr_block_stat(inode, cc->valid_nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); if (cc->cluster_idx == 0) set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + if (IS_NOQUOTA(inode)) + f2fs_up_read(&sbi->node_write); + else + f2fs_unlock_op(sbi); - down_write(&fi->i_sem); + spin_lock(&fi->i_size_lock); if (fi->last_disk_size < psize) fi->last_disk_size = psize; - up_write(&fi->i_sem); + spin_unlock(&fi->i_size_lock); f2fs_put_rpages(cc); - f2fs_destroy_compress_ctx(cc); + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; + 
f2fs_destroy_compress_ctx(cc, false); return 0; out_destroy_crypt: - kfree(cic->rpages); + page_array_free(cc->inode, cic->rpages, cc->cluster_size); for (--i; i >= 0; i--) fscrypt_finalize_bounce_page(&cc->cpages[i]); - for (i = 0; i < cc->nr_cpages; i++) { - if (!cc->cpages[i]) - continue; - f2fs_put_page(cc->cpages[i], 1); - } out_put_cic: - kfree(cic); + kmem_cache_free(cic_entry_slab, cic); out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: - f2fs_unlock_op(sbi); + if (IS_NOQUOTA(inode)) + f2fs_up_read(&sbi->node_write); + else + f2fs_unlock_op(sbi); +out_free: + for (i = 0; i < cc->valid_nr_cpages; i++) { + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; + } + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; return -EAGAIN; } @@ -933,21 +1409,21 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) if (unlikely(bio->bi_status)) mapping_set_error(cic->inode->i_mapping, -EIO); - f2fs_put_compressed_page(page); + f2fs_compress_free_page(page); dec_page_count(sbi, F2FS_WB_DATA); - if (refcount_dec_not_one(&cic->ref)) + if (atomic_dec_return(&cic->pending_pages)) return; for (i = 0; i < cic->nr_rpages; i++) { WARN_ON(!cic->rpages[i]); - clear_cold_data(cic->rpages[i]); + clear_page_private_gcing(cic->rpages[i]); end_page_writeback(cic->rpages[i]); } - kfree(cic->rpages); - kfree(cic); + page_array_free(cic->inode, cic->rpages, cic->nr_rpages); + kmem_cache_free(cic_entry_slab, cic); } static int f2fs_write_raw_pages(struct compress_ctx *cc, @@ -956,60 +1432,67 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, enum iostat_type io_type) { struct address_space *mapping = cc->inode->i_mapping; - int _submitted, compr_blocks, ret; - int i = -1, err = 0; + int _submitted, compr_blocks, ret, i; compr_blocks = f2fs_compressed_blocks(cc); - if (compr_blocks < 0) { - err = compr_blocks; - goto out_err; + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + + 
redirty_page_for_writepage(wbc, cc->rpages[i]); + unlock_page(cc->rpages[i]); } + if (compr_blocks < 0) + return compr_blocks; + for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; retry_write: + lock_page(cc->rpages[i]); + if (cc->rpages[i]->mapping != mapping) { +continue_unlock: unlock_page(cc->rpages[i]); continue; } - BUG_ON(!PageLocked(cc->rpages[i])); + if (!PageDirty(cc->rpages[i])) + goto continue_unlock; + + if (!clear_page_dirty_for_io(cc->rpages[i])) + goto continue_unlock; ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, - compr_blocks); + compr_blocks, false); if (ret) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(cc->rpages[i]); ret = 0; } else if (ret == -EAGAIN) { + /* + * for quota file, just redirty left pages to + * avoid deadlock caused by cluster update race + * from foreground operation. + */ + if (IS_NOQUOTA(cc->inode)) + return 0; ret = 0; - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); - lock_page(cc->rpages[i]); - clear_page_dirty_for_io(cc->rpages[i]); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry_write; } - err = ret; - goto out_fail; + return ret; } *submitted += _submitted; } - return 0; -out_fail: - /* TODO: revoke partially updated block addresses */ - BUG_ON(compr_blocks); -out_err: - for (++i; i < cc->cluster_size; i++) { - if (!cc->rpages[i]) - continue; - redirty_page_for_writepage(wbc, cc->rpages[i]); - unlock_page(cc->rpages[i]); - } - return err; + f2fs_balance_fs(F2FS_M_SB(mapping), true); + + return 0; } int f2fs_write_multi_pages(struct compress_ctx *cc, @@ -1017,15 +1500,13 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type) { - struct f2fs_inode_info *fi = F2FS_I(cc->inode); - const struct f2fs_compress_ops *cops = - f2fs_cops[fi->i_compress_algorithm]; int err; *submitted = 0; if (cluster_may_compress(cc)) { err = f2fs_compress_pages(cc); if (err == -EAGAIN) { + 
add_compr_block_stat(cc->inode, cc->cluster_size); goto write; } else if (err) { f2fs_put_rpages_wbc(cc, wbc, true, 1); @@ -1034,7 +1515,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, err = f2fs_write_compressed_pages(cc, submitted, wbc, io_type); - cops->destroy_compress_ctx(cc); if (!err) return 0; f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); @@ -1045,132 +1525,521 @@ write: err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); f2fs_put_rpages_wbc(cc, wbc, false, 0); destroy_out: - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, false); return err; } +static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi, + bool pre_alloc) +{ + return pre_alloc ^ f2fs_low_mem_mode(sbi); +} + +static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, + bool pre_alloc) +{ + const struct f2fs_compress_ops *cops = + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + int i; + + if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + return 0; + + dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); + if (!dic->tpages) + return -ENOMEM; + + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) { + dic->tpages[i] = dic->rpages[i]; + continue; + } + + dic->tpages[i] = f2fs_compress_alloc_page(); + if (!dic->tpages[i]) + return -ENOMEM; + } + + dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); + if (!dic->rbuf) + return -ENOMEM; + + dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); + if (!dic->cbuf) + return -ENOMEM; + + if (cops->init_decompress_ctx) + return cops->init_decompress_ctx(dic); + + return 0; +} + +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool pre_alloc) +{ + const struct f2fs_compress_ops *cops = + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + + if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + return; + + if (!bypass_destroy_callback && cops->destroy_decompress_ctx) + 
cops->destroy_decompress_ctx(dic); + + if (dic->cbuf) + vm_unmap_ram(dic->cbuf, dic->nr_cpages); + + if (dic->rbuf) + vm_unmap_ram(dic->rbuf, dic->cluster_size); +} + +static void f2fs_free_dic(struct decompress_io_ctx *dic, + bool bypass_destroy_callback); + struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct decompress_io_ctx *dic; pgoff_t start_idx = start_idx_of_cluster(cc); - int i; + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + int i, ret; - dic = f2fs_kzalloc(sbi, sizeof(struct decompress_io_ctx), GFP_NOFS); + dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!dic) return ERR_PTR(-ENOMEM); - dic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - cc->log_cluster_size, GFP_NOFS); + dic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!dic->rpages) { - kfree(dic); + kmem_cache_free(dic_entry_slab, dic); return ERR_PTR(-ENOMEM); } dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - refcount_set(&dic->ref, 1); + atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; dic->nr_cpages = cc->nr_cpages; + refcount_set(&dic->refcnt, 1); dic->failed = false; + dic->need_verity = f2fs_need_verity(cc->inode, start_idx); for (i = 0; i < dic->cluster_size; i++) dic->rpages[i] = cc->rpages[i]; dic->nr_rpages = cc->cluster_size; - dic->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - dic->nr_cpages, GFP_NOFS); - if (!dic->cpages) + dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); + if (!dic->cpages) { + ret = -ENOMEM; goto out_free; + } for (i = 0; i < dic->nr_cpages; i++) { struct page *page; - page = f2fs_grab_page(); - if (!page) + page = f2fs_compress_alloc_page(); + if (!page) { + ret = -ENOMEM; goto out_free; + } f2fs_set_compressed_page(page, cc->inode, - start_idx + i + 1, - dic, i ? 
&dic->ref : NULL); + start_idx + i + 1, dic); dic->cpages[i] = page; } - dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - dic->cluster_size, GFP_NOFS); - if (!dic->tpages) + ret = f2fs_prepare_decomp_mem(dic, true); + if (ret) goto out_free; - for (i = 0; i < dic->cluster_size; i++) { - if (cc->rpages[i]) - continue; - - dic->tpages[i] = f2fs_grab_page(); - if (!dic->tpages[i]) - goto out_free; - } - - for (i = 0; i < dic->cluster_size; i++) { - if (dic->tpages[i]) - continue; - dic->tpages[i] = cc->rpages[i]; - } - return dic; out_free: - f2fs_free_dic(dic); - return ERR_PTR(-ENOMEM); + f2fs_free_dic(dic, true); + return ERR_PTR(ret); } -void f2fs_free_dic(struct decompress_io_ctx *dic) +static void f2fs_free_dic(struct decompress_io_ctx *dic, + bool bypass_destroy_callback) { int i; + f2fs_release_decomp_mem(dic, bypass_destroy_callback, true); + if (dic->tpages) { for (i = 0; i < dic->cluster_size; i++) { if (dic->rpages[i]) continue; - f2fs_put_page(dic->tpages[i], 1); + if (!dic->tpages[i]) + continue; + f2fs_compress_free_page(dic->tpages[i]); } - kfree(dic->tpages); + page_array_free(dic->inode, dic->tpages, dic->cluster_size); } if (dic->cpages) { for (i = 0; i < dic->nr_cpages; i++) { if (!dic->cpages[i]) continue; - f2fs_put_compressed_page(dic->cpages[i]); + f2fs_compress_free_page(dic->cpages[i]); } - kfree(dic->cpages); + page_array_free(dic->inode, dic->cpages, dic->nr_cpages); } - kfree(dic->rpages); - kfree(dic); + page_array_free(dic->inode, dic->rpages, dic->nr_rpages); + kmem_cache_free(dic_entry_slab, dic); } -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity) +static void f2fs_late_free_dic(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, free_work); + + f2fs_free_dic(dic, false); +} + +static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) +{ + if (refcount_dec_and_test(&dic->refcnt)) { + if (in_task) { + 
f2fs_free_dic(dic, false); + } else { + INIT_WORK(&dic->free_work, f2fs_late_free_dic); + queue_work(F2FS_I_SB(dic->inode)->post_read_wq, + &dic->free_work); + } + } +} + +/* + * Update and unlock the cluster's pagecache pages, and release the reference to + * the decompress_io_ctx that was being held for I/O completion. + */ +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) { int i; - for (i = 0; i < cluster_size; i++) { - struct page *rpage = rpages[i]; + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; if (!rpage) continue; - if (err || PageError(rpage)) { + /* PG_error was set if verity failed. */ + if (failed || PageError(rpage)) { ClearPageUptodate(rpage); + /* will re-read again later */ ClearPageError(rpage); } else { - if (!verity || fsverity_verify_page(rpage)) - SetPageUptodate(rpage); - else - SetPageError(rpage); + SetPageUptodate(rpage); } unlock_page(rpage); } + + f2fs_put_dic(dic, in_task); +} + +static void f2fs_verify_cluster(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); + int i; + + /* Verify the cluster's decompressed pages with fs-verity. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (rpage && !fsverity_verify_page(rpage)) + SetPageError(rpage); + } + + __f2fs_decompress_end_io(dic, false, true); +} + +/* + * This is called when a compressed cluster has been decompressed + * (or failed to be read and/or decompressed). + */ +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) +{ + if (!failed && dic->need_verity) { + /* + * Note that to avoid deadlocks, the verity work can't be done + * on the decompression workqueue. This is because verifying + * the data pages can involve reading metadata pages from the + * file, and these metadata pages may be compressed. 
+ */ + INIT_WORK(&dic->verity_work, f2fs_verify_cluster); + fsverity_enqueue_verify_work(&dic->verity_work); + } else { + __f2fs_decompress_end_io(dic, failed, in_task); + } +} + +/* + * Put a reference to a compressed page's decompress_io_ctx. + * + * This is called when the page is no longer needed and can be freed. + */ +void f2fs_put_page_dic(struct page *page, bool in_task) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + + f2fs_put_dic(dic, in_task); +} + +/* + * check whether cluster blocks are contiguous, and add extent cache entry + * only if cluster blocks are logically and physically contiguous. + */ +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) +{ + bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR; + int i = compressed ? 1 : 0; + block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + break; + if (first_blkaddr + i - (compressed ? 1 : 0) != blkaddr) + return 0; + } + + return compressed ? 
i - 1 : i; +} + +const struct address_space_operations f2fs_compress_aops = { + .release_folio = f2fs_release_folio, + .invalidate_folio = f2fs_invalidate_folio, +}; + +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->compress_inode->i_mapping; +} + +void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + if (!sbi->compress_inode) + return; + invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr); +} + +void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + nid_t ino, block_t blkaddr) +{ + struct page *cpage; + int ret; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) + return; + + if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE)) + return; + + cpage = find_get_page(COMPRESS_MAPPING(sbi), blkaddr); + if (cpage) { + f2fs_put_page(cpage, 0); + return; + } + + cpage = alloc_page(__GFP_NOWARN | __GFP_IO); + if (!cpage) + return; + + ret = add_to_page_cache_lru(cpage, COMPRESS_MAPPING(sbi), + blkaddr, GFP_NOFS); + if (ret) { + f2fs_put_page(cpage, 0); + return; + } + + set_page_private_data(cpage, ino); + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) + goto out; + + memcpy(page_address(cpage), page_address(page), PAGE_SIZE); + SetPageUptodate(cpage); +out: + f2fs_put_page(cpage, 1); +} + +bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blkaddr) +{ + struct page *cpage; + bool hitted = false; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return false; + + cpage = f2fs_pagecache_get_page(COMPRESS_MAPPING(sbi), + blkaddr, FGP_LOCK | FGP_NOWAIT, GFP_NOFS); + if (cpage) { + if (PageUptodate(cpage)) { + atomic_inc(&sbi->compress_page_hit); + memcpy(page_address(page), + page_address(cpage), PAGE_SIZE); + hitted = true; + } + f2fs_put_page(cpage, 1); + } + + return hitted; +} + +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) 
+{ + struct address_space *mapping = COMPRESS_MAPPING(sbi); + struct folio_batch fbatch; + pgoff_t index = 0; + pgoff_t end = MAX_BLKADDR(sbi); + + if (!mapping->nrpages) + return; + + folio_batch_init(&fbatch); + + do { + unsigned int nr, i; + + nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); + if (!nr) + break; + + for (i = 0; i < nr; i++) { + struct folio *folio = fbatch.folios[i]; + + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + continue; + } + + if (ino != get_page_private_data(&folio->page)) { + folio_unlock(folio); + continue; + } + + generic_error_remove_page(mapping, &folio->page); + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } while (index < end); +} + +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) +{ + struct inode *inode; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return 0; + + inode = f2fs_iget(sbi->sb, F2FS_COMPRESS_INO(sbi)); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->compress_inode = inode; + + sbi->compress_percent = COMPRESS_PERCENT; + sbi->compress_watermark = COMPRESS_WATERMARK; + + atomic_set(&sbi->compress_page_hit, 0); + + return 0; +} + +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) +{ + if (!sbi->compress_inode) + return; + iput(sbi->compress_inode); + sbi->compress_inode = NULL; +} + +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + char slab_name[32]; + + if (!f2fs_sb_has_compression(sbi)) + return 0; + + sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev)); + + sbi->page_array_slab_size = sizeof(struct page *) << + F2FS_OPTION(sbi).compress_log_size; + + sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, + sbi->page_array_slab_size); + if (!sbi->page_array_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) +{ + kmem_cache_destroy(sbi->page_array_slab); +} + +static int __init 
f2fs_init_cic_cache(void) +{ + cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", + sizeof(struct compress_io_ctx)); + if (!cic_entry_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_cic_cache(void) +{ + kmem_cache_destroy(cic_entry_slab); +} + +static int __init f2fs_init_dic_cache(void) +{ + dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", + sizeof(struct decompress_io_ctx)); + if (!dic_entry_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_dic_cache(void) +{ + kmem_cache_destroy(dic_entry_slab); +} + +int __init f2fs_init_compress_cache(void) +{ + int err; + + err = f2fs_init_cic_cache(); + if (err) + goto out; + err = f2fs_init_dic_cache(); + if (err) + goto free_cic; + return 0; +free_cic: + f2fs_destroy_cic_cache(); +out: + return -ENOMEM; +} + +void f2fs_destroy_compress_cache(void) +{ + f2fs_destroy_dic_cache(); + f2fs_destroy_cic_cache(); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b27b72107911..a71e818cd67b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -8,22 +8,24 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> +#include <linux/sched/mm.h> #include <linux/mpage.h> #include <linux/writeback.h> -#include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/bio.h> +#include <linux/blk-crypto.h> #include <linux/swap.h> #include <linux/prefetch.h> #include <linux/uio.h> -#include <linux/cleancache.h> #include <linux/sched/signal.h> +#include <linux/fiemap.h> +#include <linux/iomap.h> #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" +#include "iostat.h" #include <trace/events/f2fs.h> #define NUM_PREALLOC_POST_READ_CTXS 128 @@ -48,31 +50,6 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, - unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); -} - -struct bio *f2fs_bio_alloc(struct 
f2fs_sb_info *sbi, int npages, bool no_fail) -{ - struct bio *bio; - - if (no_fail) { - /* No failure on bio allocation */ - bio = __f2fs_bio_alloc(GFP_NOIO, npages); - if (!bio) - bio = __f2fs_bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); - return bio; - } - if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); - return NULL; - } - - return __f2fs_bio_alloc(GFP_KERNEL, npages); -} - static bool __is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; @@ -82,18 +59,18 @@ static bool __is_cp_guaranteed(struct page *page) if (!mapping) return false; - if (f2fs_is_compressed_page(page)) - return false; - inode = mapping->host; sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_META_INO(sbi) || - inode->i_ino == F2FS_NODE_INO(sbi) || - S_ISDIR(inode->i_mode) || - (S_ISREG(inode->i_mode) && - (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || - is_cold_data(page)) + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode)) + return true; + + if (f2fs_is_compressed_page(page)) + return false; + if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || + page_private_gcing(page)) return true; return false; } @@ -117,9 +94,21 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { - STEP_DECRYPT, - STEP_DECOMPRESS, - STEP_VERITY, +#ifdef CONFIG_FS_ENCRYPTION + STEP_DECRYPT = 1 << 0, +#else + STEP_DECRYPT = 0, /* compile out the decryption-related code */ +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + STEP_DECOMPRESS = 1 << 1, +#else + STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ +#endif +#ifdef CONFIG_FS_VERITY + STEP_VERITY = 1 << 2, +#else + STEP_VERITY = 0, /* compile out the verity-related code */ +#endif }; struct bio_post_read_ctx { @@ -127,25 +116,30 @@ struct bio_post_read_ctx { struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; + block_t fs_blkaddr; }; -static void 
__read_end_io(struct bio *bio, bool compr, bool verity) +static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { - struct page *page; struct bio_vec *bv; struct bvec_iter_all iter_all; + /* + * Update and unlock the bio's pagecache pages, and put the + * decompression context for any compressed pages. + */ bio_for_each_segment_all(bv, bio, iter_all) { - page = bv->bv_page; + struct page *page = bv->bv_page; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (compr && f2fs_is_compressed_page(page)) { - f2fs_decompress_pages(bio, page, verity); + if (f2fs_is_compressed_page(page)) { + if (bio->bi_status) + f2fs_end_read_compressed_page(page, true, 0, + in_task); + f2fs_put_page_dic(page, in_task); continue; } -#endif - /* PG_error was set if any post_read step failed */ + /* PG_error was set if verity failed. */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); /* will re-read again later */ @@ -156,163 +150,176 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity) dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } -} - -static void f2fs_release_read_bio(struct bio *bio); -static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity) -{ - if (!compr) - __read_end_io(bio, false, verity); - f2fs_release_read_bio(bio); -} -static void f2fs_decompress_bio(struct bio *bio, bool verity) -{ - __read_end_io(bio, true, verity); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx); - -static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx) -{ - fscrypt_decrypt_bio(ctx->bio); -} - -static void f2fs_decompress_work(struct bio_post_read_ctx *ctx) -{ - f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY)); -} - -#ifdef CONFIG_F2FS_FS_COMPRESSION -static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) -{ - f2fs_decompress_end_io(rpages, cluster_size, false, true); -} - -static void f2fs_verify_bio(struct bio *bio) -{ - struct page *page = 
bio_first_page_all(bio); - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); - - f2fs_verify_pages(dic->rpages, dic->cluster_size); - f2fs_free_dic(dic); + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); } -#endif -static void f2fs_verity_work(struct work_struct *work) +static void f2fs_verify_bio(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; -#ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int enabled_steps = ctx->enabled_steps; -#endif + bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* - * fsverity_verify_bio() may call readpages() again, and while verity - * will be disabled for this, decryption may still be needed, resulting - * in another bio_post_read_ctx being allocated. So to prevent - * deadlocks we need to release the current ctx to the mempool first. - * This assumes that verity is the last post-read step. + * fsverity_verify_bio() may call readahead() again, and while verity + * will be disabled for this, decryption and/or decompression may still + * be needed, resulting in another bio_post_read_ctx being allocated. + * So to prevent deadlocks we need to release the current ctx to the + * mempool first. This assumes that verity is the last post-read step. */ mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; -#ifdef CONFIG_F2FS_FS_COMPRESSION - /* previous step is decompression */ - if (enabled_steps & (1 << STEP_DECOMPRESS)) { - f2fs_verify_bio(bio); - f2fs_release_read_bio(bio); - return; + /* + * Verify the bio's pages with fs-verity. Exclude compressed pages, + * as those were handled separately by f2fs_end_read_compressed_page(). 
+ */ + if (may_have_compressed_pages) { + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (!f2fs_is_compressed_page(page) && + !fsverity_verify_page(page)) + SetPageError(page); + } + } else { + fsverity_verify_bio(bio); } -#endif - fsverity_verify_bio(bio); - __f2fs_read_end_io(bio, false, false); + f2fs_finish_read_bio(bio, true); } -static void f2fs_post_read_work(struct work_struct *work) +/* + * If the bio's data needs to be verified with fs-verity, then enqueue the + * verity work for the bio. Otherwise finish the bio now. + * + * Note that to avoid deadlocks, the verity work can't be done on the + * decryption/decompression workqueue. This is because verifying the data pages + * can involve reading verity metadata pages from the file, and these verity + * metadata pages may be encrypted and/or compressed. + */ +static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) { - struct bio_post_read_ctx *ctx = - container_of(work, struct bio_post_read_ctx, work); - - if (ctx->enabled_steps & (1 << STEP_DECRYPT)) - f2fs_decrypt_work(ctx); + struct bio_post_read_ctx *ctx = bio->bi_private; - if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) - f2fs_decompress_work(ctx); - - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); + if (ctx && (ctx->enabled_steps & STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verify_bio); fsverity_enqueue_verify_work(&ctx->work); - return; + } else { + f2fs_finish_read_bio(bio, in_task); } - - __f2fs_read_end_io(ctx->bio, - ctx->enabled_steps & (1 << STEP_DECOMPRESS), false); } -static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi, - struct work_struct *work) +/* + * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last + * remaining page was read by @ctx->bio. 
+ * + * Note that a bio may span clusters (even a mix of compressed and uncompressed + * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates + * that the bio includes at least one compressed page. The actual decompression + * is done on a per-cluster basis, not a per-bio basis. + */ +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, + bool in_task) { - queue_work(sbi->post_read_wq, work); -} + struct bio_vec *bv; + struct bvec_iter_all iter_all; + bool all_compressed = true; + block_t blkaddr = ctx->fs_blkaddr; + + bio_for_each_segment_all(bv, ctx->bio, iter_all) { + struct page *page = bv->bv_page; + + if (f2fs_is_compressed_page(page)) + f2fs_end_read_compressed_page(page, false, blkaddr, + in_task); + else + all_compressed = false; + + blkaddr++; + } -static void bio_post_read_processing(struct bio_post_read_ctx *ctx) -{ /* - * We use different work queues for decryption and for verity because - * verity may require reading metadata pages that need decryption, and - * we shouldn't recurse to the same workqueue. + * Optimization: if all the bio's pages are compressed, then scheduling + * the per-bio verity work is unnecessary, as verity will be fully + * handled at the compression cluster level. 
*/ + if (all_compressed) + ctx->enabled_steps &= ~STEP_VERITY; +} - if (ctx->enabled_steps & (1 << STEP_DECRYPT) || - ctx->enabled_steps & (1 << STEP_DECOMPRESS)) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work); - return; - } +static void f2fs_post_read_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); + if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) { + f2fs_finish_read_bio(bio, true); return; } - __f2fs_read_end_io(ctx->bio, false, false); -} + if (ctx->enabled_steps & STEP_DECOMPRESS) + f2fs_handle_step_decompress(ctx, true); -static bool f2fs_bio_post_read_required(struct bio *bio) -{ - return bio->bi_private; + f2fs_verify_and_finish_bio(bio, true); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + struct bio_post_read_ctx *ctx; + bool intask = in_task(); + + iostat_update_and_unbind_ctx(bio, 0); + ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) { f2fs_show_injection_info(sbi, FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } - if (f2fs_bio_post_read_required(bio)) { - struct bio_post_read_ctx *ctx = bio->bi_private; - - bio_post_read_processing(ctx); + if (bio->bi_status) { + f2fs_finish_read_bio(bio, intask); return; } - __f2fs_read_end_io(bio, false, false); + if (ctx) { + unsigned int enabled_steps = ctx->enabled_steps & + (STEP_DECRYPT | STEP_DECOMPRESS); + + /* + * If we have only decompression step between decompression and + * decrypt, we don't need post processing for this. 
+ */ + if (enabled_steps == STEP_DECOMPRESS && + !f2fs_low_mem_mode(sbi)) { + f2fs_handle_step_decompress(ctx, intask); + } else if (enabled_steps) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + return; + } + } + + f2fs_verify_and_finish_bio(bio, intask); } static void f2fs_write_end_io(struct bio *bio) { - struct f2fs_sb_info *sbi = bio->bi_private; + struct f2fs_sb_info *sbi; struct bio_vec *bvec; struct bvec_iter_all iter_all; + iostat_update_and_unbind_ctx(bio, 1); + sbi = bio->bi_private; + if (time_to_inject(sbi, FAULT_WRITE_IO)) { f2fs_show_injection_info(sbi, FAULT_WRITE_IO); bio->bi_status = BLK_STS_IOERR; @@ -322,14 +329,14 @@ static void f2fs_write_end_io(struct bio *bio) struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); - if (IS_DUMMY_WRITTEN_PAGE(page)) { - set_page_private(page, (unsigned long)NULL); - ClearPagePrivate(page); + if (page_private_dummy(page)) { + clear_page_private_dummy(page); unlock_page(page); mempool_free(page, sbi->write_io_dummy); if (unlikely(bio->bi_status)) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); continue; } @@ -345,7 +352,8 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); } f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && @@ -354,7 +362,7 @@ static void f2fs_write_end_io(struct bio *bio) dec_page_count(sbi, type); if (f2fs_in_warm_node_list(sbi, page)) f2fs_del_fsync_node_entry(sbi, page); - clear_cold_data(page); + clear_page_private_gcing(page); end_page_writeback(page); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && @@ -364,11 +372,8 @@ static void f2fs_write_end_io(struct bio *bio) bio_put(bio); } -/* - * Return true, if pre_bio's bdev is same as its target device. 
- */ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio) + block_t blk_addr, sector_t *sector) { struct block_device *bdev = sbi->sb->s_bdev; int i; @@ -383,10 +388,9 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, } } } - if (bio) { - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); - } + + if (sector) + *sector = SECTOR_FROM_BLOCK(blk_addr); return bdev; } @@ -403,39 +407,91 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } -static bool __same_bdev(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio) +static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { - struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL); - return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; + unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; + unsigned int fua_flag, meta_flag, io_flag; + blk_opf_t op_flags = 0; + + if (fio->op != REQ_OP_WRITE) + return 0; + if (fio->type == DATA) + io_flag = fio->sbi->data_io_flag; + else if (fio->type == NODE) + io_flag = fio->sbi->node_io_flag; + else + return 0; + + fua_flag = io_flag & temp_mask; + meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + + /* + * data/node io flag bits per temp: + * REQ_META | REQ_FUA | + * 5 | 4 | 3 | 2 | 1 | 0 | + * Cold | Warm | Hot | Cold | Warm | Hot | + */ + if ((1 << fio->temp) & meta_flag) + op_flags |= REQ_META; + if ((1 << fio->temp) & fua_flag) + op_flags |= REQ_FUA; + return op_flags; } -/* - * Low-level block read/write IO operations. 
- */ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; + struct block_device *bdev; + sector_t sector; struct bio *bio; - bio = f2fs_bio_alloc(sbi, npages, true); - - f2fs_target_device(sbi, fio->new_blkaddr, bio); + bdev = f2fs_target_device(sbi, fio->new_blkaddr, &sector); + bio = bio_alloc_bioset(bdev, npages, + fio->op | fio->op_flags | f2fs_io_flags(fio), + GFP_NOIO, &f2fs_bioset); + bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; bio->bi_private = NULL; } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; - bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, - fio->type, fio->temp); } + iostat_alloc_and_bind_ctx(sbi, bio, NULL); + if (fio->io_wbc) wbc_init_bio(fio->io_wbc, bio); return bio; } +static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, + pgoff_t first_idx, + const struct f2fs_io_info *fio, + gfp_t gfp_mask) +{ + /* + * The f2fs garbage collector sets ->encrypted_page when it wants to + * read/write raw data without encryption. + */ + if (!fio || !fio->encrypted_page) + fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask); +} + +static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, + pgoff_t next_idx, + const struct f2fs_io_info *fio) +{ + /* + * The f2fs garbage collector sets ->encrypted_page when it wants to + * read/write raw data without encryption. 
+ */ + if (fio && fio->encrypted_page) + return !bio_has_crypt_ctx(bio); + + return fscrypt_mergeable_bio(bio, inode, next_idx); +} + static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { @@ -445,10 +501,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (type != DATA && type != NODE) goto submit_io; - if (test_opt(sbi, LFS) && current->plug) + if (f2fs_lfs_mode(sbi) && current->plug) blk_finish_plug(current->plug); - if (F2FS_IO_ALIGNED(sbi)) + if (!F2FS_IO_ALIGNED(sbi)) goto submit_io; start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; @@ -464,10 +520,11 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, GFP_NOIO | __GFP_NOFAIL); f2fs_bug_on(sbi, !page); - zero_user_segment(page, 0, PAGE_SIZE); - SetPagePrivate(page); - set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); lock_page(page); + + zero_user_segment(page, 0, PAGE_SIZE); + set_page_private_dummy(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) f2fs_bug_on(sbi, 1); } @@ -483,6 +540,8 @@ submit_io: trace_f2fs_submit_read_bio(sbi->sb, type, bio); else trace_f2fs_submit_write_bio(sbi->sb, type, bio); + + iostat_update_submit_ctx(bio, type); submit_bio(bio); } @@ -499,8 +558,6 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); - if (is_read_io(fio->op)) trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); else @@ -547,24 +604,51 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, return false; } +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_PAGE_TYPE; i++) { + int n = (i == META) ? 
1 : NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = f2fs_kmalloc(sbi, + array_size(n, sizeof(struct f2fs_bio_info)), + GFP_KERNEL); + if (!sbi->write_io[i]) + return -ENOMEM; + + for (j = HOT; j < n; j++) { + init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); + init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); + } + } + + return 0; +} + static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - down_write(&io->io_rwsem); + f2fs_down_write(&io->io_rwsem); /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC; + io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); - up_write(&io->io_rwsem); + f2fs_up_write(&io->io_rwsem); } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, @@ -579,9 +663,9 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - down_read(&io->io_rwsem); + f2fs_down_read(&io->io_rwsem); ret = __has_merged_page(io->bio, inode, page, ino); - up_read(&io->io_rwsem); + f2fs_up_read(&io->io_rwsem); } if (ret) __f2fs_submit_merged_write(sbi, type, temp); @@ -623,15 +707,19 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? 
- META_GENERIC : DATA_GENERIC_ENHANCE))) + META_GENERIC : DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); + f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, + fio->page->index, fio, GFP_NOIO); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; @@ -640,10 +728,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (fio->io_wbc && !is_read_io(fio->op)) wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); - bio_set_op_attrs(bio, fio->op, fio->op_flags); - inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(page): WB_DATA_TYPE(fio->page)); + __read_io_type(page) : WB_DATA_TYPE(fio->page)); __submit_bio(fio->sbi, bio, fio->type); return 0; @@ -652,9 +738,12 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, block_t last_blkaddr, block_t cur_blkaddr) { + if (unlikely(sbi->max_io_bytes && + bio->bi_iter.bi_size >= sbi->max_io_bytes)) + return false; if (last_blkaddr + 1 != cur_blkaddr) return false; - return __same_bdev(sbi, cur_blkaddr, bio); + return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL); } static bool io_type_is_mergeable(struct f2fs_bio_info *io, @@ -692,16 +781,16 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct bio_entry *be; - be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS, true, NULL); be->bio = bio; bio_get(bio); if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) f2fs_bug_on(sbi, 1); - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_add_tail(&be->list, &io->bio_list); - up_write(&io->bio_list_lock); + f2fs_up_write(&io->bio_list_lock); } static void 
del_bio_entry(struct bio_entry *be) @@ -710,9 +799,10 @@ static void del_bio_entry(struct bio_entry *be) kmem_cache_free(bio_entry_slab, be); } -static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio, +static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, struct page *page) { + struct f2fs_sb_info *sbi = fio->sbi; enum temp_type temp; bool found = false; int ret = -EAGAIN; @@ -722,25 +812,31 @@ static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio, struct list_head *head = &io->bio_list; struct bio_entry *be; - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (be->bio != *bio) continue; found = true; - if (bio_add_page(*bio, page, PAGE_SIZE, 0) == - PAGE_SIZE) { + f2fs_bug_on(sbi, !page_is_mergeable(sbi, *bio, + *fio->last_block, + fio->new_blkaddr)); + if (f2fs_crypt_mergeable_bio(*bio, + fio->page->mapping->host, + fio->page->index, fio) && + bio_add_page(*bio, page, PAGE_SIZE, 0) == + PAGE_SIZE) { ret = 0; break; } - /* bio is full */ + /* page can't be merged into bio; submit the bio */ del_bio_entry(be); __submit_bio(sbi, *bio, DATA); break; } - up_write(&io->bio_list_lock); + f2fs_up_write(&io->bio_list_lock); } if (ret) { @@ -766,7 +862,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, if (list_empty(head)) continue; - down_read(&io->bio_list_lock); + f2fs_down_read(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); @@ -776,14 +872,14 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, if (found) break; } - up_read(&io->bio_list_lock); + f2fs_up_read(&io->bio_list_lock); if (!found) continue; found = false; - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); @@ -796,7 +892,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, break; } } - up_write(&io->bio_list_lock); + 
f2fs_up_write(&io->bio_list_lock); } if (found) @@ -814,23 +910,25 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, - __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) + __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL); alloc_new: if (!bio) { - bio = __bio_alloc(fio, BIO_MAX_PAGES); - bio_set_op_attrs(bio, fio->op, fio->op_flags); + bio = __bio_alloc(fio, BIO_MAX_VECS); + f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, + fio->page->index, fio, GFP_NOIO); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { - if (add_ipu_page(fio->sbi, &bio, page)) + if (add_ipu_page(fio, &bio, page)) goto alloc_new; } @@ -854,7 +952,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) f2fs_bug_on(sbi, is_read_io(fio->op)); - down_write(&io->io_rwsem); + f2fs_down_write(&io->io_rwsem); next: if (fio->in_list) { spin_lock(&io->io_lock); @@ -882,8 +980,11 @@ next: inc_page_count(sbi, WB_DATA_TYPE(bio_page)); - if (io->bio && !io_is_mergeable(sbi, io->bio, io, fio, - io->last_block_in_bio, fio->new_blkaddr)) + if (io->bio && + (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, + fio->new_blkaddr) || + !f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host, + bio_page->index, fio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { @@ -894,7 +995,9 @@ alloc_new: fio->retry = true; goto skip; } - io->bio = __bio_alloc(fio, BIO_MAX_PAGES); + io->bio = __bio_alloc(fio, BIO_MAX_VECS); + f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host, + bio_page->index, fio, GFP_NOIO); io->fio = *fio; } @@ -907,7 +1010,6 @@ alloc_new: wbc_account_cgroup_owner(fio->io_wbc, bio_page, 
PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; - f2fs_trace_ios(fio, 0); trace_f2fs_submit_page_write(fio->page, fio); skip: @@ -917,65 +1019,66 @@ out: if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); - up_write(&io->io_rwsem); -} - -static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) -{ - return fsverity_active(inode) && - idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); + f2fs_up_write(&io->io_rwsem); } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx) + unsigned nr_pages, blk_opf_t op_flag, + pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; - struct bio_post_read_ctx *ctx; + struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; + sector_t sector; + struct block_device *bdev = f2fs_target_device(sbi, blkaddr, &sector); - bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); + bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages), + REQ_OP_READ | op_flag, + for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); - f2fs_target_device(sbi, blkaddr, bio); + bio->bi_iter.bi_sector = sector; + f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); bio->bi_end_io = f2fs_read_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, op_flag); - if (f2fs_encrypted_file(inode)) - post_read_steps |= 1 << STEP_DECRYPT; - if (f2fs_compressed_file(inode)) - post_read_steps |= 1 << STEP_DECOMPRESS; + if (fscrypt_inode_uses_fs_layer_crypto(inode)) + post_read_steps |= STEP_DECRYPT; + if (f2fs_need_verity(inode, first_idx)) - post_read_steps |= 1 << STEP_VERITY; + post_read_steps |= STEP_VERITY; - if (post_read_steps) { + /* + * STEP_DECOMPRESS is handled specially, since a compressed file might + * contain both compressed and uncompressed clusters. 
We'll allocate a + * bio_post_read_ctx if the file is compressed, but the caller is + * responsible for enabling STEP_DECOMPRESS if it's actually needed. + */ + + if (post_read_steps || f2fs_compressed_file(inode)) { /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; + ctx->fs_blkaddr = blkaddr; bio->bi_private = ctx; } + iostat_alloc_and_bind_ctx(sbi, bio, ctx); return bio; } -static void f2fs_release_read_bio(struct bio *bio) -{ - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); -} - /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, - block_t blkaddr) + block_t blkaddr, blk_opf_t op_flags, + bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; - bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index); + bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, + page->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -988,6 +1091,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, } ClearPageError(page); inc_page_count(sbi, F2FS_RD_DATA); + f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); __submit_bio(sbi, bio, DATA); return 0; } @@ -1047,8 +1151,8 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + block_t blkaddr = f2fs_data_blkaddr(dn); + if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -1090,7 +1194,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; struct inode *inode = dn->inode; 
if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -1102,12 +1206,12 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - int op_flags, bool for_write) + blk_opf_t op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; int err; page = f2fs_grab_cache_page(mapping, index, for_write); @@ -1119,6 +1223,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } goto got_it; @@ -1139,6 +1245,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, dn.data_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } got_it: @@ -1162,7 +1270,8 @@ got_it: return page; } - err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, + op_flags, for_write); if (err) goto put_err; return page; @@ -1296,12 +1405,11 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; - dn->data_blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = f2fs_data_blkaddr(dn); if (dn->data_blkaddr != NULL_ADDR) goto alloc; @@ -1312,73 +1420,23 @@ alloc: set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, - &sum, seg_type, NULL, false); - if (GET_SEGNO(sbi, old_blkaddr) 
!= NULL_SEGNO) + &sum, seg_type, NULL); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); + f2fs_invalidate_compress_page(sbi, old_blkaddr); + } f2fs_update_data_blkaddr(dn, dn->data_blkaddr); - - /* - * i_size will be updated by direct_IO. Otherwise, we'll get stale - * data from unwritten block via dio_read. - */ return 0; } -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct f2fs_map_blocks map; - int flag; - int err = 0; - bool direct_io = iocb->ki_flags & IOCB_DIRECT; - - map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); - map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); - if (map.m_len > map.m_lblk) - map.m_len -= map.m_lblk; - else - map.m_len = 0; - - map.m_next_pgofs = NULL; - map.m_next_extent = NULL; - map.m_seg_type = NO_CHECK_TYPE; - map.m_may_create = true; - - if (direct_io) { - map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); - flag = f2fs_force_buffered_io(inode, iocb, from) ? 
- F2FS_GET_BLOCK_PRE_AIO : - F2FS_GET_BLOCK_PRE_DIO; - goto map_blocks; - } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - if (f2fs_has_inline_data(inode)) - return err; - - flag = F2FS_GET_BLOCK_PRE_AIO; - -map_blocks: - err = f2fs_map_blocks(inode, &map, 1, flag); - if (map.m_len > 0 && err == -ENOSPC) { - if (!direct_io) - set_inode_flag(inode, FI_NO_PREALLOC); - err = 0; - } - return err; -} - -void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { if (lock) - down_read(&sbi->node_change); + f2fs_down_read(&sbi->node_change); else - up_read(&sbi->node_change); + f2fs_up_read(&sbi->node_change); } else { if (lock) f2fs_lock_op(sbi); @@ -1388,13 +1446,9 @@ void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) } /* - * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with - * f2fs_map_blocks structure. - * If original data blocks are allocated, then give them to blockdev. - * Otherwise, - * a. preallocate requested block addresses - * b. do not use extent cache for better performance - * c. give the block addresses to blockdev + * f2fs_map_blocks() tries to find or build mapping relationship which + * maps continuous logical blocks to physical blocks, and return such + * info via f2fs_map_blocks structure. 
*/ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) @@ -1407,13 +1461,18 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; block_t blkaddr; unsigned int start_pgofs; + int bidx = 0; if (!maxblocks) return 0; + map->m_bdev = inode->i_sb->s_bdev; + map->m_multidev_dio = + f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); + map->m_len = 0; map->m_flags = 0; @@ -1422,7 +1481,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, end = pgofs + maxblocks; if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { - if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO && + if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) goto next_dnode; @@ -1436,12 +1495,27 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (flag == F2FS_GET_BLOCK_DIO) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + map->m_len = min(map->m_len, + FDEV(bidx).end_blk + 1 - map->m_pblk); + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + } goto out; } next_dnode: if (map->m_may_create) - __do_map_lock(sbi, flag, true); + f2fs_do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -1449,7 +1523,21 @@ next_dnode: if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; + if (err == -ENOENT) { + /* + * There is one exceptional case that read_node_page() + * may return -ENOENT due to filesystem has been + * shutdown or cp_error, so force to convert error + * number to EIO for such case. 
+ */ + if (map->m_may_create && + (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || + f2fs_cp_error(sbi))) { + err = -EIO; + goto unlock_out; + } + err = 0; if (map->m_next_pgofs) *map->m_next_pgofs = @@ -1467,17 +1555,18 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + blkaddr = f2fs_data_blkaddr(&dn); if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto sync_out; } if (__is_valid_data_blkaddr(blkaddr)) { /* use out-place-update for driect IO under LFS mode */ - if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO && + if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) { err = __allocate_data_block(&dn, map->m_seg_type); if (err) @@ -1501,14 +1590,26 @@ next_block: flag != F2FS_GET_BLOCK_DIO); err = __allocate_data_block(&dn, map->m_seg_type); - if (!err) + if (!err) { + if (flag == F2FS_GET_BLOCK_PRE_DIO) + file_need_truncate(inode); set_inode_flag(inode, FI_APPEND_WRITE); + } } if (err) goto sync_out; map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { + if (f2fs_compressed_file(inode) && + f2fs_sanity_check_cluster(&dn) && + (flag != F2FS_GET_BLOCK_FIEMAP || + IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_CORRUPTED_CLUSTER); + goto sync_out; + } if (flag == F2FS_GET_BLOCK_BMAP) { map->m_pblk = 0; goto sync_out; @@ -1533,6 +1634,9 @@ next_block: if (flag == F2FS_GET_BLOCK_PRE_AIO) goto skip; + if (map->m_multidev_dio) + bidx = f2fs_target_device_index(sbi, blkaddr); + if (map->m_len == 0) { /* preallocated unwritten block should be mapped for fiemap. 
*/ if (blkaddr == NEW_ADDR) @@ -1541,10 +1645,15 @@ next_block: map->m_pblk = blkaddr; map->m_len = 1; + + if (map->m_multidev_dio) + map->m_bdev = FDEV(bidx).bdev; } else if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || flag == F2FS_GET_BLOCK_PRE_DIO) { + if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) + goto sync_out; ofs++; map->m_len++; } else { @@ -1590,18 +1699,38 @@ skip: f2fs_put_dnode(&dn); if (map->m_may_create) { - __do_map_lock(sbi, flag, false); + f2fs_do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; sync_out: - /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { + /* + * for hardware encryption, but to avoid potential issue + * in future + */ f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + + f2fs_bug_on(sbi, blk_addr + map->m_len > + FDEV(bidx).end_blk + 1); + } + } + if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; @@ -1616,11 +1745,11 @@ sync_out: f2fs_put_dnode(&dn); unlock_out: if (map->m_may_create) { - __do_map_lock(sbi, flag, false); + f2fs_do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } out: - trace_f2fs_map_blocks(inode, map, err); + trace_f2fs_map_blocks(inode, map, create, flag, err); return err; } @@ -1650,76 +1779,14 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) return true; } -static int __get_data_block(struct inode *inode, sector_t 
iblock, - struct buffer_head *bh, int create, int flag, - pgoff_t *next_pgofs, int seg_type, bool may_write) -{ - struct f2fs_map_blocks map; - int err; - - map.m_lblk = iblock; - map.m_len = bh->b_size >> inode->i_blkbits; - map.m_next_pgofs = next_pgofs; - map.m_next_extent = NULL; - map.m_seg_type = seg_type; - map.m_may_create = may_write; - - err = f2fs_map_blocks(inode, &map, create, flag); - if (!err) { - map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = (u64)map.m_len << inode->i_blkbits; - } - return err; -} - -static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, int flag, - pgoff_t *next_pgofs) -{ - return __get_data_block(inode, iblock, bh_result, create, - flag, next_pgofs, - NO_CHECK_TYPE, create); -} - -static int get_data_block_dio_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type(inode->i_write_hint), - IS_SWAPFILE(inode) ? 
false : true); -} - -static int get_data_block_dio(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static inline u64 bytes_to_blks(struct inode *inode, u64 bytes) { - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type(inode->i_write_hint), - false); -} - -static int get_data_block_bmap(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - /* Block number less than F2FS MAX BLOCKS */ - if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) - return -EFBIG; - - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_BMAP, NULL, - NO_CHECK_TYPE, create); + return (bytes >> inode->i_blkbits); } -static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) +static inline u64 blks_to_bytes(struct inode *inode, u64 blks) { - return (offset >> inode->i_blkbits); -} - -static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) -{ - return (blk << inode->i_blkbits); + return (blks << inode->i_blkbits); } static int f2fs_xattr_fiemap(struct inode *inode, @@ -1741,13 +1808,13 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { f2fs_put_page(page, 1); return err; } - phys = (__u64)blk_to_logical(inode, ni.blk_addr); + phys = blks_to_bytes(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + sizeof(__le32) * (DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)); @@ -1763,7 +1830,8 @@ static int f2fs_xattr_fiemap(struct inode *inode, flags |= FIEMAP_EXTENT_LAST; err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); - if (err || err == 1) + trace_f2fs_fiemap(inode, 0, phys, len, flags, err); + if (err) return err; } @@ -1772,13 +1840,13 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = 
f2fs_get_node_info(sbi, xnid, &ni); + err = f2fs_get_node_info(sbi, xnid, &ni, false); if (err) { f2fs_put_page(page, 1); return err; } - phys = (__u64)blk_to_logical(inode, ni.blk_addr); + phys = blks_to_bytes(inode, ni.blk_addr); len = inode->i_sb->s_blocksize; f2fs_put_page(page, 1); @@ -1786,21 +1854,46 @@ static int f2fs_xattr_fiemap(struct inode *inode, flags = FIEMAP_EXTENT_LAST; } - if (phys) + if (phys) { err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + trace_f2fs_fiemap(inode, 0, phys, len, flags, err); + } return (err < 0 ? err : 0); } +static loff_t max_inode_blocks(struct inode *inode) +{ + loff_t result = ADDRS_PER_INODE(inode); + loff_t leaf_count = ADDRS_PER_BLOCK(inode); + + /* two direct node blocks */ + result += (leaf_count * 2); + + /* two indirect node blocks */ + leaf_count *= NIDS_PER_BLOCK; + result += (leaf_count * 2); + + /* one double indirect node block */ + leaf_count *= NIDS_PER_BLOCK; + result += leaf_count; + + return result; +} + int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - struct buffer_head map_bh; + struct f2fs_map_blocks map; sector_t start_blk, last_blk; pgoff_t next_pgofs; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; + bool compr_cluster = false, compr_appended; + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + unsigned int count_in_cluster = 0; + loff_t maxbytes; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { ret = f2fs_precache_extents(inode); @@ -1808,12 +1901,21 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_XATTR); if (ret) return ret; inode_lock(inode); + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + if (start > maxbytes) { + ret = -EFBIG; + goto out; + } + + if (len > maxbytes || (maxbytes - len) < start) + len = maxbytes - start; + if 
(fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { ret = f2fs_xattr_fiemap(inode, fieinfo); goto out; @@ -1825,51 +1927,93 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - if (logical_to_blk(inode, len) == 0) - len = blk_to_logical(inode, 1); + if (bytes_to_blks(inode, len) == 0) + len = blks_to_bytes(inode, 1); - start_blk = logical_to_blk(inode, start); - last_blk = logical_to_blk(inode, start + len - 1); + start_blk = bytes_to_blks(inode, start); + last_blk = bytes_to_blks(inode, start + len - 1); next: - memset(&map_bh, 0, sizeof(struct buffer_head)); - map_bh.b_size = len; + memset(&map, 0, sizeof(map)); + map.m_lblk = start_blk; + map.m_len = bytes_to_blks(inode, len); + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = NO_CHECK_TYPE; + + if (compr_cluster) { + map.m_lblk += 1; + map.m_len = cluster_size - count_in_cluster; + } - ret = get_data_block(inode, start_blk, &map_bh, 0, - F2FS_GET_BLOCK_FIEMAP, &next_pgofs); + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* HOLE */ - if (!buffer_mapped(&map_bh)) { + if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; - if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, - F2FS_I_SB(inode)->max_file_blocks)) + if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, + max_inode_blocks(inode))) goto prep_next; flags |= FIEMAP_EXTENT_LAST; } + compr_appended = false; + /* In a case of compressed cluster, append this to the last extent */ + if (compr_cluster && ((map.m_flags & F2FS_MAP_UNWRITTEN) || + !(map.m_flags & F2FS_MAP_FLAGS))) { + compr_appended = true; + goto skip_fill; + } + if (size) { + flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); + trace_f2fs_fiemap(inode, logical, phys, size, flags, ret); + if (ret) + goto out; + size = 0; } - if (start_blk > last_blk || ret) + if (start_blk > 
last_blk) goto out; - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = 0; - if (buffer_unwritten(&map_bh)) - flags = FIEMAP_EXTENT_UNWRITTEN; +skip_fill: + if (map.m_pblk == COMPRESS_ADDR) { + compr_cluster = true; + count_in_cluster = 1; + } else if (compr_appended) { + unsigned int appended_blks = cluster_size - + count_in_cluster + 1; + size += blks_to_bytes(inode, appended_blks); + start_blk += appended_blks; + compr_cluster = false; + } else { + logical = blks_to_bytes(inode, start_blk); + phys = __is_valid_data_blkaddr(map.m_pblk) ? + blks_to_bytes(inode, map.m_pblk) : 0; + size = blks_to_bytes(inode, map.m_len); + flags = 0; + + if (compr_cluster) { + flags = FIEMAP_EXTENT_ENCODED; + count_in_cluster += map.m_len; + if (count_in_cluster == cluster_size) { + compr_cluster = false; + size += blks_to_bytes(inode, 1); + } + } else if (map.m_flags & F2FS_MAP_UNWRITTEN) { + flags = FIEMAP_EXTENT_UNWRITTEN; + } - start_blk += logical_to_blk(inode, size); + start_blk += bytes_to_blks(inode, size); + } prep_next: cond_resched(); @@ -1902,8 +2046,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, bool is_readahead) { struct bio *bio = *bio_ret; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocksize = 1 << blkbits; + const unsigned blocksize = blks_to_bytes(inode, 1); sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; @@ -1912,8 +2055,8 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, block_in_file = (sector_t)page_index(page); last_block = block_in_file + nr_pages; - last_block_in_file = (f2fs_readpage_limit(inode) + blocksize - 1) >> - blkbits; + last_block_in_file = bytes_to_blks(inode, + f2fs_readpage_limit(inode) + blocksize - 1); if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -1943,15 +2086,11 @@ got_it: block_nr = map->m_pblk + block_in_file - map->m_lblk; 
SetPageMappedToDisk(page); - if (!PageUptodate(page) && (!PageSwapCache(page) && - !cleancache_get_page(page))) { - SetPageUptodate(page); - goto confused; - } - if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto out; } } else { @@ -1972,15 +2111,17 @@ zero_out: * This page will go to BIO. Do we need to send this * BIO off first? */ - if (bio && !page_is_mergeable(F2FS_I_SB(inode), bio, - *last_block_in_bio, block_nr)) { + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio, + *last_block_in_bio, block_nr) || + !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, - is_readahead ? REQ_RAHEAD : 0, page->index); + is_readahead ? REQ_RAHEAD : 0, page->index, + false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -1998,15 +2139,11 @@ submit_and_realloc: goto submit_and_realloc; inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, + F2FS_BLKSIZE); ClearPageError(page); *last_block_in_bio = block_nr; goto out; -confused: - if (bio) { - __submit_bio(F2FS_I_SB(inode), bio, DATA); - bio = NULL; - } - unlock_page(page); out: *bio_ret = bio; return ret; @@ -2015,7 +2152,7 @@ out: #ifdef CONFIG_F2FS_FS_COMPRESSION int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, - bool is_readahead) + bool is_readahead, bool for_write) { struct dnode_of_data dn; struct inode *inode = cc->inode; @@ -2023,15 +2160,17 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, struct bio *bio = *bio_ret; unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; sector_t last_block_in_file; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocksize = 
1 << blkbits; + const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; + struct extent_info ei = {0, }; + bool from_dnode = true; int i; int ret = 0; f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); - last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + last_block_in_file = bytes_to_blks(inode, + f2fs_readpage_limit(inode) + blocksize - 1); /* get rid of pages beyond EOF */ for (i = 0; i < cc->cluster_size; i++) { @@ -2047,6 +2186,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, continue; } unlock_page(page); + if (for_write) + put_page(page); cc->rpages[i] = NULL; cc->nr_rpages--; } @@ -2055,20 +2196,26 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; + if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + from_dnode = false; + + if (!from_dnode) + goto skip_reading_dnode; + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) goto out; - /* cluster was overwritten as normal cluster */ - if (dn.data_blkaddr != COMPRESS_ADDR) - goto out; + f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); +skip_reading_dnode: for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, dn.node_page, - dn.ofs_in_node + i); + blkaddr = from_dnode ? 
data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i) : + ei.blk + i - 1; if (!__is_valid_data_blkaddr(blkaddr)) break; @@ -2078,6 +2225,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } cc->nr_cpages++; + + if (!from_dnode && i >= ei.c_len) + break; } /* nothing to decompress */ @@ -2092,15 +2242,26 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } - for (i = 0; i < dic->nr_cpages; i++) { + for (i = 0; i < cc->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; + struct bio_post_read_ctx *ctx; - blkaddr = datablock_addr(dn.inode, dn.node_page, - dn.ofs_in_node + i + 1); + blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1) : + ei.blk + i; + + f2fs_wait_on_block_writeback(inode, blkaddr); - if (bio && !page_is_mergeable(sbi, bio, - *last_block_in_bio, blkaddr)) { + if (f2fs_load_compressed_page(sbi, page, blkaddr)) { + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic, true); + continue; + } + + if (bio && (!page_is_mergeable(sbi, bio, + *last_block_in_bio, blkaddr) || + !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { submit_and_realloc: __submit_bio(sbi, bio, DATA); bio = NULL; @@ -2109,42 +2270,46 @@ submit_and_realloc: if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? 
REQ_RAHEAD : 0, - page->index); + page->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); - bio = NULL; - dic->failed = true; - if (refcount_sub_and_test(dic->nr_cpages - i, - &dic->ref)) - f2fs_decompress_end_io(dic->rpages, - cc->cluster_size, true, - false); - f2fs_free_dic(dic); + f2fs_decompress_end_io(dic, ret, true); f2fs_put_dnode(&dn); - *bio_ret = bio; + *bio_ret = NULL; return ret; } } - f2fs_wait_on_block_writeback(inode, blkaddr); - if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; + ctx = get_post_read_ctx(bio); + ctx->enabled_steps |= STEP_DECOMPRESS; + refcount_inc(&dic->refcnt); + inc_page_count(sbi, F2FS_RD_DATA); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); ClearPageError(page); *last_block_in_bio = blkaddr; } - f2fs_put_dnode(&dn); + if (from_dnode) + f2fs_put_dnode(&dn); *bio_ret = bio; return 0; out_put_dnode: - f2fs_put_dnode(&dn); + if (from_dnode) + f2fs_put_dnode(&dn); out: - f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false); + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) { + ClearPageUptodate(cc->rpages[i]); + ClearPageError(cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + } *bio_ret = bio; return ret; } @@ -2153,19 +2318,12 @@ out: /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. - * - * Note that the aops->readpages() function is ONLY used for read-ahead. If - * this function ever deviates from doing just read-ahead, it should either - * use ->readpage() or do the necessary surgery to decouple ->readpages() - * from read-ahead. 
*/ -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead) +static int f2fs_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - struct inode *inode = mapping->host; struct f2fs_map_blocks map; #ifdef CONFIG_F2FS_FS_COMPRESSION struct compress_ctx cc = { @@ -2178,7 +2336,9 @@ int f2fs_mpage_readpages(struct address_space *mapping, .nr_rpages = 0, .nr_cpages = 0, }; + pgoff_t nc_cluster_idx = NULL_CLUSTER; #endif + unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; @@ -2192,15 +2352,9 @@ int f2fs_mpage_readpages(struct address_space *mapping, map.m_may_create = false; for (; nr_pages; nr_pages--) { - if (pages) { - page = list_last_entry(pages, struct page, lru); - + if (rac) { + page = readahead_page(rac); prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page_index(page), - readahead_gfp_mask(mapping))) - goto next_page; } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2210,17 +2364,28 @@ int f2fs_mpage_readpages(struct address_space *mapping, ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead); - f2fs_destroy_compress_ctx(&cc); + rac != NULL, false); + f2fs_destroy_compress_ctx(&cc, false); if (ret) goto set_error_page; } - ret = f2fs_is_compressed_cluster(inode, page->index); - if (ret < 0) - goto set_error_page; - else if (!ret) - goto read_single_page; + if (cc.cluster_idx == NULL_CLUSTER) { + if (nc_cluster_idx == + page->index >> cc.log_cluster_size) { + goto read_single_page; + } + + ret = f2fs_is_compressed_cluster(inode, page->index); + if (ret < 0) + goto set_error_page; + else if (!ret) { + nc_cluster_idx = + page->index >> cc.log_cluster_size; + goto read_single_page; + } + nc_cluster_idx = NULL_CLUSTER; + } ret = f2fs_init_compress_ctx(&cc); if (ret) 
goto set_error_page; @@ -2233,7 +2398,7 @@ read_single_page: #endif ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, - &bio, &last_block_in_bio, is_readahead); + &bio, &last_block_in_bio, rac); if (ret) { #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: @@ -2242,8 +2407,10 @@ set_error_page: zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); } +#ifdef CONFIG_F2FS_FS_COMPRESSION next_page: - if (pages) +#endif + if (rac) put_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2253,20 +2420,20 @@ next_page: ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead); - f2fs_destroy_compress_ctx(&cc); + rac != NULL, false); + f2fs_destroy_compress_ctx(&cc, false); } } #endif } - BUG_ON(pages && !list_empty(pages)); if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); - return pages ? 0 : ret; + return ret; } -static int f2fs_read_data_page(struct file *file, struct page *page) +static int f2fs_read_data_folio(struct file *file, struct folio *folio) { + struct page *page = &folio->page; struct inode *inode = page_file_mapping(page)->host; int ret = -EAGAIN; @@ -2281,28 +2448,24 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = f2fs_mpage_readpages(page_file_mapping(page), - NULL, page, 1, false); + ret = f2fs_mpage_readpages(inode, NULL, page); return ret; } -static int f2fs_read_data_pages(struct file *file, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void f2fs_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; - struct page *page = list_last_entry(pages, struct page, lru); + struct inode *inode = rac->mapping->host; - trace_f2fs_readpages(inode, page, nr_pages); + trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac)); if (!f2fs_is_compress_backend_ready(inode)) - return 0; + return; - /* If the file has 
inline data, skip readpages */ + /* If the file has inline data, skip readahead */ if (f2fs_has_inline_data(inode)) - return 0; + return; - return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); + f2fs_mpage_readpages(inode, rac, NULL); } int f2fs_encrypt_one_page(struct f2fs_io_info *fio) @@ -2319,6 +2482,9 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio) /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); + if (fscrypt_inode_uses_inline_crypto(inode)) + return 0; + retry_encrypt: fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, PAGE_SIZE, 0, gfp_flags); @@ -2326,7 +2492,7 @@ retry_encrypt: /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { f2fs_flush_merged_writes(fio->sbi); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_NOFS); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } @@ -2349,6 +2515,9 @@ static inline bool check_inplace_update_policy(struct inode *inode, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; + if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) && + is_inode_flag_set(inode, FI_OPU_WRITE)) + return false; if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi)) @@ -2383,11 +2552,15 @@ static inline bool check_inplace_update_policy(struct inode *inode, bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) { + /* swap file is migrating in aligned write mode */ + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return false; + if (f2fs_is_pinned_file(inode)) return true; /* if this is cold file, we should overwrite to avoid fragmentation */ - if (file_is_cold(inode)) + if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return true; return check_inplace_update_policy(inode, fio); @@ -2397,7 +2570,12 @@ bool f2fs_should_update_outplace(struct inode *inode, 
struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - if (test_opt(sbi, LFS)) + /* The below cases were checked when setting it. */ + if (f2fs_is_pinned_file(inode)) + return false; + if (fio && is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return true; + if (f2fs_lfs_mode(sbi)) return true; if (S_ISDIR(inode->i_mode)) return true; @@ -2405,10 +2583,18 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; + + /* swap file is migrating in aligned write mode */ + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return true; + + if (is_inode_flag_set(inode, FI_OPU_WRITE)) + return true; + if (fio) { - if (is_cold_data(fio->page)) + if (page_private_gcing(fio->page)) return true; - if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + if (page_private_dummy(fio->page)) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) @@ -2432,19 +2618,27 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; struct node_info ni; bool ipu_force = false; int err = 0; - set_new_dnode(&dn, inode, NULL, NULL, 0); + /* Use COW inode to make dnode_of_data for atomic write */ + if (f2fs_is_atomic_file(inode)) + set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); + else + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, - DATA_GENERIC_ENHANCE)) + DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(fio->sbi, + ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } ipu_force = true; fio->need_lock = LOCK_DONE; @@ -2464,7 +2658,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* This 
page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); - clear_cold_data(page); + clear_page_private_gcing(page); goto out_writepage; } got_it: @@ -2472,8 +2666,10 @@ got_it: !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); goto out_writepage; } + /* * If current allocation needs SSR, * it had better in-place writes for updated data. @@ -2492,7 +2688,7 @@ got_it: f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); if (err) { - if (f2fs_encrypted_file(inode)) + if (fscrypt_inode_uses_fs_layer_crypto(inode)) fscrypt_finalize_bounce_page(&fio->encrypted_page); if (PageWriteback(page)) end_page_writeback(page); @@ -2511,7 +2707,7 @@ got_it: fio->need_lock = LOCK_REQ; } - err = f2fs_get_node_info(fio->sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false); if (err) goto out_writepage; @@ -2546,7 +2742,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks) + int compr_blocks, + bool allow_balance) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2569,6 +2766,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, .submitted = false, .compr_blocks = compr_blocks, .need_lock = LOCK_RETRY, + .post_read = f2fs_post_read_required(inode), .io_type = io_type, .io_wbc = wbc, .bio = bio, @@ -2609,16 +2807,23 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, write: if (f2fs_is_drop_cache(inode)) goto out; - /* we should not write 0'th page having journal header */ - if (f2fs_is_volatile_file(inode) && (!page->index || - (!wbc->for_reclaim && - f2fs_available_free_memory(sbi, BASE_CHECK)))) - goto redirty_out; - /* Dentry blocks are controlled by checkpoint */ - if (S_ISDIR(inode->i_mode)) { + /* Dentry/quota blocks are 
controlled by checkpoint */ + if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) { + /* + * We need to wait for node_write to avoid block allocation during + * checkpoint. This can only happen to quota writes which can cause + * the below discard race condition. + */ + if (IS_NOQUOTA(inode)) + f2fs_down_read(&sbi->node_write); + fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); + + if (IS_NOQUOTA(inode)) + f2fs_up_read(&sbi->node_write); + goto done; } @@ -2647,10 +2852,10 @@ write: if (err) { file_set_keep_isize(inode); } else { - down_write(&F2FS_I(inode)->i_sem); + spin_lock(&F2FS_I(inode)->i_size_lock); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - up_write(&F2FS_I(inode)->i_sem); + spin_unlock(&F2FS_I(inode)->i_size_lock); } done: @@ -2661,7 +2866,7 @@ out: inode_dec_dirty_pages(inode); if (err) { ClearPageUptodate(page); - clear_cold_data(page); + clear_page_private_gcing(page); } if (wbc->for_reclaim) { @@ -2672,7 +2877,7 @@ out: } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task) + !F2FS_I(inode)->wb_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -2719,7 +2924,7 @@ out: #endif return f2fs_write_single_data_page(page, NULL, NULL, NULL, - wbc, FS_DATA_IO, 0); + wbc, FS_DATA_IO, 0, true); } /* @@ -2733,7 +2938,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, { int ret = 0; int done = 0, retry = 0; - struct pagevec pvec; + struct page *pages[F2FS_ONSTACK_PAGES]; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; @@ -2747,6 +2952,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, .rpages = NULL, .nr_rpages = 0, .cpages = NULL, + .valid_nr_cpages = 0, .rbuf = NULL, .cbuf = NULL, .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, @@ -2754,19 +2960,15 @@ static int f2fs_write_cache_pages(struct address_space *mapping, }; #endif int 
nr_pages; - pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - int cycled; int range_whole = 0; xa_mark_t tag; int nwritten = 0; int submitted = 0; int i; - pagevec_init(&pvec); - if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) set_inode_flag(mapping->host, FI_HOT_DATA); @@ -2774,19 +2976,13 @@ static int f2fs_write_cache_pages(struct address_space *mapping, clear_inode_flag(mapping->host, FI_HOT_DATA); if (wbc->range_cyclic) { - writeback_index = mapping->writeback_index; /* prev offset */ - index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; + index = mapping->writeback_index; /* prev offset */ end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; @@ -2798,18 +2994,22 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !retry && (index <= end)) { - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); + nr_pages = find_get_pages_range_tag(mapping, &index, end, + tag, F2FS_ONSTACK_PAGES, pages); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + struct page *page = pages[i]; bool need_readd; readd: need_readd = false; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { + void *fsdata = NULL; + struct page *pagep; + int ret2; + ret = f2fs_init_compress_ctx(&cc); if (ret) { done = 1; @@ -2828,27 +3028,27 @@ readd: if (unlikely(f2fs_cp_error(sbi))) goto lock_page; - if (f2fs_cluster_is_empty(&cc)) { - void *fsdata = NULL; - struct page *pagep; - int ret2; + if (!f2fs_cluster_is_empty(&cc)) + goto lock_page; - ret2 = f2fs_prepare_compress_overwrite( + if 
(f2fs_all_cluster_page_ready(&cc, + pages, i, nr_pages, true)) + goto lock_page; + + ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, page->index, &fsdata); - if (ret2 < 0) { - ret = ret2; - done = 1; - break; - } else if (ret2 && - !f2fs_compress_write_end(inode, - fsdata, page->index, - 1)) { - retry = 1; - break; - } - } else { - goto lock_page; + if (ret2 < 0) { + ret = ret2; + done = 1; + break; + } else if (ret2 && + (!f2fs_compress_write_end(inode, + fsdata, page->index, 1) || + !f2fs_all_cluster_page_ready(&cc, + pages, i, nr_pages, false))) { + retry = 1; + break; } } #endif @@ -2895,7 +3095,8 @@ continue_unlock: } #endif ret = f2fs_write_single_data_page(page, &submitted, - &bio, &last_block, wbc, io_type, 0); + &bio, &last_block, wbc, io_type, + 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) unlock_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2915,9 +3116,8 @@ result: } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, - HZ/50); + f2fs_io_schedule_timeout( + DEFAULT_IO_TIMEOUT); goto retry_write; } goto next; @@ -2936,7 +3136,7 @@ next: if (need_readd) goto readd; } - pagevec_release(&pvec); + release_pages(pages, nr_pages); cond_resched(); } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2950,13 +3150,16 @@ next: retry = 0; } } + if (f2fs_compressed_file(inode)) + f2fs_destroy_compress_ctx(&cc, false); #endif - if ((!cycled && !done) || retry) { - cycled = 1; + if (retry) { index = 0; - end = writeback_index - 1; + end = -1; goto retry; } + if (wbc->range_cyclic && !done) + done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; @@ -2973,15 +3176,17 @@ next: static inline bool __should_serialize_io(struct inode *inode, struct writeback_control *wbc) { + /* to avoid deadlock in path of data flush */ + if (F2FS_I(inode)->wb_task) + return false; + if (!S_ISREG(inode->i_mode)) return false; - if 
(f2fs_compressed_file(inode)) - return true; if (IS_NOQUOTA(inode)) return false; - /* to avoid deadlock in path of data flush */ - if (F2FS_I(inode)->cp_task) - return false; + + if (f2fs_need_compress_data(inode)) + return true; if (wbc->sync_mode != WB_SYNC_ALL) return true; if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) @@ -3017,8 +3222,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping, f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; - /* skip writing during file defragment */ - if (is_inode_flag_set(inode, FI_DO_DEFRAG)) + /* skip writing in file defragment preparing stage */ + if (is_inode_flag_set(inode, FI_SKIP_WRITES)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, DATA); @@ -3026,8 +3231,12 @@ static int __f2fs_write_data_pages(struct address_space *mapping, /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[DATA]); - else if (atomic_read(&sbi->wb_sync_req[DATA])) + else if (atomic_read(&sbi->wb_sync_req[DATA])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); goto skip_write; + } if (__should_serialize_io(inode, wbc)) { mutex_lock(&sbi->writepages); @@ -3067,9 +3276,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, FS_CP_DATA_IO : FS_DATA_IO); } -static void f2fs_write_failed(struct address_space *mapping, loff_t to) +void f2fs_write_failed(struct inode *inode, loff_t to) { - struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); if (IS_NOQUOTA(inode)) @@ -3077,14 +3285,14 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + 
filemap_invalidate_lock(inode->i_mapping); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -3097,17 +3305,15 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; int err = 0; int flag; /* - * we already allocated all the blocks, so we don't need to get - * the block addresses when there is no need to fill the page. + * If a whole page is being written and we already preallocated all the + * blocks, then there is no need to get a block address now. */ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && - !is_inode_flag_set(inode, FI_NO_PREALLOC) && - !f2fs_verity_in_progress(inode)) + if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL)) return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ @@ -3118,7 +3324,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - __do_map_lock(sbi, flag, true); + f2fs_do_map_lock(sbi, flag, true); locked = true; } @@ -3137,7 +3343,7 @@ restart: f2fs_do_read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) - set_inline_node(ipage); + set_page_private_inline(ipage); } else { err = f2fs_convert_inline_page(&dn, page); if (err) @@ -3155,7 +3361,7 @@ restart: err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); locked = true; @@ -3171,37 +3377,123 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - 
__do_map_lock(sbi, flag, false); + f2fs_do_map_lock(sbi, flag, false); + return err; +} + +static int __find_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr) +{ + struct dnode_of_data dn; + struct page *ipage; + struct extent_info ei = {0, }; + int err = 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + dn.data_blkaddr = NULL_ADDR; + err = 0; + } + } + *blk_addr = dn.data_blkaddr; + f2fs_put_dnode(&dn); + return err; +} + +static int __reserve_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr, bool *node_changed) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage; + int err = 0; + + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + set_new_dnode(&dn, inode, ipage, ipage, 0); + + err = f2fs_get_block(&dn, index); + + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + f2fs_put_dnode(&dn); + +unlock_out: + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); return err; } +static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned int len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + struct inode *cow_inode = F2FS_I(inode)->cow_inode; + pgoff_t index = page->index; + int err = 0; + block_t ori_blk_addr = NULL_ADDR; + + /* If pos is beyond the end of file, reserve a new block in COW inode */ + if ((pos & PAGE_MASK) >= i_size_read(inode)) + goto reserve_block; + + /* Look for the block in COW inode first */ + err = __find_data_block(cow_inode, index, 
blk_addr); + if (err) + return err; + else if (*blk_addr != NULL_ADDR) + return 0; + + /* Look for the block in the original inode */ + err = __find_data_block(inode, index, &ori_blk_addr); + if (err) + return err; + +reserve_block: + /* Finally, we should reserve a new block in COW inode for the update */ + err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); + if (err) + return err; + inc_atomic_write_cnt(inode); + + if (ori_blk_addr != NULL_ADDR) + *blk_addr = ori_blk_addr; + return 0; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; - bool need_balance = false, drop_atomic = false; + bool need_balance = false; block_t blkaddr = NULL_ADDR; int err = 0; - trace_f2fs_write_begin(inode, pos, len, flags); + trace_f2fs_write_begin(inode, pos, len); if (!f2fs_is_checkpoint_ready(sbi)) { err = -ENOSPC; goto fail; } - if ((f2fs_is_atomic_file(inode) && - !f2fs_available_free_memory(sbi, INMEM_PAGES)) || - is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - err = -ENOMEM; - drop_atomic = true; - goto fail; - } - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. 
The locking rule for inline_data conversion should be: @@ -3219,6 +3511,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, *fsdata = NULL; + if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) + goto repeat; + ret = f2fs_prepare_compress_overwrite(inode, pagep, index, fsdata); if (ret < 0) { @@ -3246,7 +3541,11 @@ repeat: *pagep = page; - err = prepare_write_begin(sbi, page, pos, len, + if (f2fs_is_atomic_file(inode)) + err = prepare_atomic_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + else + err = prepare_write_begin(sbi, page, pos, len, &blkaddr, &need_balance); if (err) goto fail; @@ -3281,9 +3580,10 @@ repeat: if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } - err = f2fs_submit_page_read(inode, page, blkaddr); + err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); if (err) goto fail; @@ -3301,9 +3601,7 @@ repeat: fail: f2fs_put_page(page, 1); - f2fs_write_failed(mapping, pos + len); - if (drop_atomic) - f2fs_drop_inmem_pages_all(sbi, false); + f2fs_write_failed(inode, pos + len); return err; } @@ -3333,6 +3631,10 @@ static int f2fs_write_end(struct file *file, if (f2fs_compressed_file(inode) && fsdata) { f2fs_compress_write_end(inode, fsdata, page->index, copied); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + if (pos + copied > i_size_read(inode) && + !f2fs_verity_in_progress(inode)) + f2fs_i_size_write(inode, pos + copied); return copied; } #endif @@ -3343,164 +3645,28 @@ static int f2fs_write_end(struct file *file, set_page_dirty(page); if (pos + copied > i_size_read(inode) && - !f2fs_verity_in_progress(inode)) + !f2fs_verity_in_progress(inode)) { f2fs_i_size_write(inode, pos + copied); + if (f2fs_is_atomic_file(inode)) + f2fs_i_size_write(F2FS_I(inode)->cow_inode, + pos + copied); + } unlock_out: f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } -static int 
check_direct_IO(struct inode *inode, struct iov_iter *iter, - loff_t offset) -{ - unsigned i_blkbits = READ_ONCE(inode->i_blkbits); - unsigned blkbits = i_blkbits; - unsigned blocksize_mask = (1 << blkbits) - 1; - unsigned long align = offset | iov_iter_alignment(iter); - struct block_device *bdev = inode->i_sb->s_bdev; - - if (align & blocksize_mask) { - if (bdev) - blkbits = blksize_bits(bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if (align & blocksize_mask) - return -EINVAL; - return 1; - } - return 0; -} - -static void f2fs_dio_end_io(struct bio *bio) -{ - struct f2fs_private_dio *dio = bio->bi_private; - - dec_page_count(F2FS_I_SB(dio->inode), - dio->write ? F2FS_DIO_WRITE : F2FS_DIO_READ); - - bio->bi_private = dio->orig_private; - bio->bi_end_io = dio->orig_end_io; - - kvfree(dio); - - bio_endio(bio); -} - -static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode, - loff_t file_offset) -{ - struct f2fs_private_dio *dio; - bool write = (bio_op(bio) == REQ_OP_WRITE); - - dio = f2fs_kzalloc(F2FS_I_SB(inode), - sizeof(struct f2fs_private_dio), GFP_NOFS); - if (!dio) - goto out; - - dio->inode = inode; - dio->orig_end_io = bio->bi_end_io; - dio->orig_private = bio->bi_private; - dio->write = write; - - bio->bi_end_io = f2fs_dio_end_io; - bio->bi_private = dio; - - inc_page_count(F2FS_I_SB(inode), - write ? 
F2FS_DIO_WRITE : F2FS_DIO_READ); - - submit_bio(bio); - return; -out: - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); -} - -static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - int rw = iov_iter_rw(iter); - int err; - enum rw_hint hint = iocb->ki_hint; - int whint_mode = F2FS_OPTION(sbi).whint_mode; - bool do_opu; - - err = check_direct_IO(inode, iter, offset); - if (err) - return err < 0 ? err : 0; - - if (f2fs_force_buffered_io(inode, iocb, iter)) - return 0; - - do_opu = allow_outplace_dio(inode, iocb, iter); - - trace_f2fs_direct_IO_enter(inode, offset, count, rw); - - if (rw == WRITE && whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = WRITE_LIFE_NOT_SET; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!down_read_trylock(&fi->i_gc_rwsem[rw])) { - iocb->ki_hint = hint; - err = -EAGAIN; - goto out; - } - if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { - up_read(&fi->i_gc_rwsem[rw]); - iocb->ki_hint = hint; - err = -EAGAIN; - goto out; - } - } else { - down_read(&fi->i_gc_rwsem[rw]); - if (do_opu) - down_read(&fi->i_gc_rwsem[READ]); - } - - err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, rw == WRITE ? 
get_data_block_dio_write : - get_data_block_dio, NULL, f2fs_dio_submit_bio, - DIO_LOCKING | DIO_SKIP_HOLES); - - if (do_opu) - up_read(&fi->i_gc_rwsem[READ]); - - up_read(&fi->i_gc_rwsem[rw]); - - if (rw == WRITE) { - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = hint; - if (err > 0) { - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, - err); - if (!do_opu) - set_inode_flag(inode, FI_UPDATE_WRITE); - } else if (err < 0) { - f2fs_write_failed(mapping, offset + count); - } - } - -out: - trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); - - return err; -} - -void f2fs_invalidate_page(struct page *page, unsigned int offset, - unsigned int length) +void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino >= F2FS_ROOT_INO(sbi) && - (offset % PAGE_SIZE || length != PAGE_SIZE)) + (offset || length != folio_size(folio))) return; - if (PageDirty(page)) { + if (folio_test_dirty(folio)) { if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { @@ -3511,228 +3677,287 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, } } - clear_cold_data(page); + clear_page_private_gcing(&folio->page); - if (IS_ATOMIC_WRITTEN_PAGE(page)) - return f2fs_drop_inmem_page(inode, page); + if (test_opt(sbi, COMPRESS_CACHE) && + inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(&folio->page); - f2fs_clear_page_private(page); + folio_detach_private(folio); } -int f2fs_release_page(struct page *page, gfp_t wait) +bool f2fs_release_folio(struct folio *folio, gfp_t wait) { - /* If this is dirty page, keep PagePrivate */ - if (PageDirty(page)) - return 0; + struct f2fs_sb_info *sbi; - /* This is atomic written page, keep Private */ - if (IS_ATOMIC_WRITTEN_PAGE(page)) - return 0; + /* If this is dirty folio, 
keep private data */ + if (folio_test_dirty(folio)) + return false; + + sbi = F2FS_M_SB(folio->mapping); + if (test_opt(sbi, COMPRESS_CACHE)) { + struct inode *inode = folio->mapping->host; + + if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(&folio->page); + } - clear_cold_data(page); - f2fs_clear_page_private(page); - return 1; + clear_page_private_gcing(&folio->page); + + folio_detach_private(folio); + return true; } -static int f2fs_set_data_page_dirty(struct page *page) +static bool f2fs_dirty_data_folio(struct address_space *mapping, + struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = mapping->host; - trace_f2fs_set_page_dirty(page, DATA); + trace_f2fs_set_page_dirty(&folio->page, DATA); - if (!PageUptodate(page)) - SetPageUptodate(page); - if (PageSwapCache(page)) - return __set_page_dirty_nobuffers(page); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + BUG_ON(folio_test_swapcache(folio)); - if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!IS_ATOMIC_WRITTEN_PAGE(page)) { - f2fs_register_inmem_page(inode, page); - return 1; - } - /* - * Previously, this page has been registered, we just - * return here. 
- */ - return 0; + if (filemap_dirty_folio(mapping, folio)) { + f2fs_update_dirty_folio(inode, folio); + return true; } + return false; +} + - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); - f2fs_update_dirty_page(inode, page); - return 1; +static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct dnode_of_data dn; + sector_t start_idx, blknr = 0; + int ret; + + start_idx = round_down(block, F2FS_I(inode)->i_cluster_size); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (ret) + return 0; + + if (dn.data_blkaddr != COMPRESS_ADDR) { + dn.ofs_in_node += block - start_idx; + blknr = f2fs_data_blkaddr(&dn); + if (!__is_valid_data_blkaddr(blknr)) + blknr = 0; } + + f2fs_put_dnode(&dn); + return blknr; +#else return 0; +#endif } + static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; + sector_t blknr = 0; if (f2fs_has_inline_data(inode)) - return 0; + goto out; /* make sure allocating whole blocks */ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) filemap_write_and_wait(mapping); - return generic_block_bmap(mapping, block, get_data_block_bmap); -} + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(block >= max_file_blocks(inode))) + goto out; -#ifdef CONFIG_MIGRATION -#include <linux/migrate.h> + if (f2fs_compressed_file(inode)) { + blknr = f2fs_bmap_compress(inode, block); + } else { + struct f2fs_map_blocks map; + + memset(&map, 0, sizeof(map)); + map.m_lblk = block; + map.m_len = 1; + map.m_next_pgofs = NULL; + map.m_seg_type = NO_CHECK_TYPE; + + if (!f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_BMAP)) + blknr = map.m_pblk; + } +out: + trace_f2fs_bmap(inode, block, blknr); + return blknr; +} -int f2fs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +#ifdef CONFIG_SWAP +static int 
f2fs_migrate_blocks(struct inode *inode, block_t start_blk, + unsigned int blkcnt) { - int rc, extra_count; - struct f2fs_inode_info *fi = F2FS_I(mapping->host); - bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int blkofs; + unsigned int blk_per_sec = BLKS_PER_SEC(sbi); + unsigned int secidx = start_blk / blk_per_sec; + unsigned int end_sec = secidx + blkcnt / blk_per_sec; + int ret = 0; - BUG_ON(PageWriteback(page)); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); - /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written) { - if (mode != MIGRATE_SYNC) - return -EBUSY; - if (!mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; - } + set_inode_flag(inode, FI_ALIGNED_WRITE); + set_inode_flag(inode, FI_OPU_WRITE); - /* one extra reference was held for atomic_write page */ - extra_count = atomic_written ? 1 : 0; - rc = migrate_page_move_mapping(mapping, newpage, - page, extra_count); - if (rc != MIGRATEPAGE_SUCCESS) { - if (atomic_written) - mutex_unlock(&fi->inmem_lock); - return rc; - } + for (; secidx < end_sec; secidx++) { + f2fs_down_write(&sbi->pin_sem); - if (atomic_written) { - struct inmem_pages *cur; - list_for_each_entry(cur, &fi->inmem_pages, list) - if (cur->page == page) { - cur->page = newpage; - break; + f2fs_lock_op(sbi); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); + f2fs_unlock_op(sbi); + + set_inode_flag(inode, FI_SKIP_WRITES); + + for (blkofs = 0; blkofs < blk_per_sec; blkofs++) { + struct page *page; + unsigned int blkidx = secidx * blk_per_sec + blkofs; + + page = f2fs_get_lock_data_page(inode, blkidx, true); + if (IS_ERR(page)) { + f2fs_up_write(&sbi->pin_sem); + ret = PTR_ERR(page); + goto done; } - mutex_unlock(&fi->inmem_lock); - put_page(page); - get_page(newpage); - } - if (PagePrivate(page)) { - f2fs_set_page_private(newpage, page_private(page)); - 
f2fs_clear_page_private(page); + set_page_dirty(page); + f2fs_put_page(page, 1); + } + + clear_inode_flag(inode, FI_SKIP_WRITES); + + ret = filemap_fdatawrite(inode->i_mapping); + + f2fs_up_write(&sbi->pin_sem); + + if (ret) + break; } - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); +done: + clear_inode_flag(inode, FI_SKIP_WRITES); + clear_inode_flag(inode, FI_OPU_WRITE); + clear_inode_flag(inode, FI_ALIGNED_WRITE); + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - return MIGRATEPAGE_SUCCESS; + return ret; } -#endif -#ifdef CONFIG_SWAP -/* Copied from generic_swapfile_activate() to check any holes */ static int check_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; - unsigned blocks_per_page; - unsigned long page_no; - unsigned blkbits; - sector_t probe_block; - sector_t last_block; - sector_t lowest_block = -1; - sector_t highest_block = 0; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + sector_t cur_lblock; + sector_t last_lblock; + sector_t pblock; + sector_t lowest_pblock = -1; + sector_t highest_pblock = 0; int nr_extents = 0; - int ret; - - blkbits = inode->i_blkbits; - blocks_per_page = PAGE_SIZE >> blkbits; + unsigned long nr_pblocks; + unsigned int blks_per_sec = BLKS_PER_SEC(sbi); + unsigned int sec_blks_mask = BLKS_PER_SEC(sbi) - 1; + unsigned int not_aligned = 0; + int ret = 0; /* * Map all the blocks into the extent list. This code doesn't try * to be very smart. 
*/ - probe_block = 0; - page_no = 0; - last_block = i_size_read(inode) >> blkbits; - while ((probe_block + blocks_per_page) <= last_block && - page_no < sis->max) { - unsigned block_in_page; - sector_t first_block; - sector_t block = 0; - int err = 0; + cur_lblock = 0; + last_lblock = bytes_to_blks(inode, i_size_read(inode)); + while (cur_lblock < last_lblock && cur_lblock < sis->max) { + struct f2fs_map_blocks map; +retry: cond_resched(); - block = probe_block; - err = bmap(inode, &block); - if (err || !block) - goto bad_bmap; - first_block = block; + memset(&map, 0, sizeof(map)); + map.m_lblk = cur_lblock; + map.m_len = last_lblock - cur_lblock; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; - /* - * It must be PAGE_SIZE aligned on-disk - */ - if (first_block & (blocks_per_page - 1)) { - probe_block++; - goto reprobe; + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); + if (ret) + goto out; + + /* hole */ + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + f2fs_err(sbi, "Swapfile has holes"); + ret = -EINVAL; + goto out; } - for (block_in_page = 1; block_in_page < blocks_per_page; - block_in_page++) { + pblock = map.m_pblk; + nr_pblocks = map.m_len; - block = probe_block + block_in_page; - err = bmap(inode, &block); + if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask || + nr_pblocks & sec_blks_mask) { + not_aligned++; - if (err || !block) - goto bad_bmap; + nr_pblocks = roundup(nr_pblocks, blks_per_sec); + if (cur_lblock + nr_pblocks > sis->max) + nr_pblocks -= blks_per_sec; - if (block != first_block + block_in_page) { - /* Discontiguity */ - probe_block++; - goto reprobe; + if (!nr_pblocks) { + /* this extent is last one */ + nr_pblocks = map.m_len; + f2fs_warn(sbi, "Swapfile: last extent is not aligned to section"); + goto next; } - } - first_block >>= (PAGE_SHIFT - blkbits); - if (page_no) { /* exclude the header page */ - if (first_block < lowest_block) - lowest_block = 
first_block; - if (first_block > highest_block) - highest_block = first_block; + ret = f2fs_migrate_blocks(inode, cur_lblock, + nr_pblocks); + if (ret) + goto out; + goto retry; + } +next: + if (cur_lblock + nr_pblocks >= sis->max) + nr_pblocks = sis->max - cur_lblock; + + if (cur_lblock) { /* exclude the header page */ + if (pblock < lowest_pblock) + lowest_pblock = pblock; + if (pblock + nr_pblocks - 1 > highest_pblock) + highest_pblock = pblock + nr_pblocks - 1; } /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ - ret = add_swap_extent(sis, page_no, 1, first_block); + ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock); if (ret < 0) goto out; nr_extents += ret; - page_no++; - probe_block += blocks_per_page; -reprobe: - continue; + cur_lblock += nr_pblocks; } ret = nr_extents; - *span = 1 + highest_block - lowest_block; - if (page_no == 0) - page_no = 1; /* force Empty message */ - sis->max = page_no; - sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; + *span = 1 + highest_pblock - lowest_pblock; + if (cur_lblock == 0) + cur_lblock = 1; /* force Empty message */ + sis->max = cur_lblock; + sis->pages = cur_lblock - 1; + sis->highest_bit = cur_lblock - 1; out: + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)", + not_aligned, blks_per_sec * F2FS_BLKSIZE); return ret; -bad_bmap: - pr_err("swapon: swapfile has holes\n"); - return -EINVAL; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -3747,19 +3972,27 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (f2fs_readonly(F2FS_I_SB(inode)->sb)) return -EROFS; + if (f2fs_lfs_mode(F2FS_I_SB(inode))) { + f2fs_err(F2FS_I_SB(inode), + "Swapfile not supported in LFS mode"); + return -EINVAL; + } + ret = f2fs_convert_inline_inode(inode); if (ret) return ret; - if (f2fs_disable_compressed_file(inode)) + if 
(!f2fs_disable_compressed_file(inode)) return -EINVAL; + f2fs_precache_extents(inode); + ret = check_swap_activate(sis, file, span); if (ret < 0) return ret; + stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); - f2fs_precache_extents(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -3768,6 +4001,7 @@ static void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); + stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } #else @@ -3783,22 +4017,20 @@ static void f2fs_swap_deactivate(struct file *file) #endif const struct address_space_operations f2fs_dblock_aops = { - .readpage = f2fs_read_data_page, - .readpages = f2fs_read_data_pages, + .read_folio = f2fs_read_data_folio, + .readahead = f2fs_readahead, .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, - .set_page_dirty = f2fs_set_data_page_dirty, - .invalidatepage = f2fs_invalidate_page, - .releasepage = f2fs_release_page, - .direct_IO = f2fs_direct_IO, + .dirty_folio = f2fs_dirty_data_folio, + .migrate_folio = filemap_migrate_folio, + .invalidate_folio = f2fs_invalidate_folio, + .release_folio = f2fs_release_folio, + .direct_IO = noop_direct_IO, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif }; void f2fs_clear_page_cache_dirty_tag(struct page *page) @@ -3861,7 +4093,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) int __init f2fs_init_bio_entry_cache(void) { - bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab", + bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); if (!bio_entry_slab) return -ENOMEM; @@ -3872,3 +4104,65 @@ void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); } + +static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t 
length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct f2fs_map_blocks map = {}; + pgoff_t next_pgofs = 0; + int err; + + map.m_lblk = bytes_to_blks(inode, offset); + map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1; + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + if (flags & IOMAP_WRITE) + map.m_may_create = true; + + err = f2fs_map_blocks(inode, &map, flags & IOMAP_WRITE, + F2FS_GET_BLOCK_DIO); + if (err) + return err; + + iomap->offset = blks_to_bytes(inode, map.m_lblk); + + /* + * When inline encryption is enabled, sometimes I/O to an encrypted file + * has to be broken up to guarantee DUN contiguity. Handle this by + * limiting the length of the mapping returned. + */ + map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + + if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) { + iomap->length = blks_to_bytes(inode, map.m_len); + if (map.m_flags & F2FS_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->flags |= IOMAP_F_MERGED; + } else { + iomap->type = IOMAP_UNWRITTEN; + } + if (WARN_ON_ONCE(!__is_valid_data_blkaddr(map.m_pblk))) + return -EINVAL; + + iomap->bdev = map.m_bdev; + iomap->addr = blks_to_bytes(inode, map.m_pblk); + } else { + iomap->length = blks_to_bytes(inode, next_pgofs) - + iomap->offset; + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + } + + if (map.m_flags & F2FS_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + if ((inode->i_state & I_DIRTY_DATASYNC) || + offset + length > i_size_read(inode)) + iomap->flags |= IOMAP_F_DIRTY; + + return 0; +} + +const struct iomap_ops f2fs_iomap_ops = { + .iomap_begin = f2fs_iomap_begin, +}; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 6b89eae5e4ca..a216dcdf6941 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -21,7 +21,7 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static DEFINE_MUTEX(f2fs_stat_mutex); +static DEFINE_RAW_SPINLOCK(f2fs_stat_lock); 
#ifdef CONFIG_DEBUG_FS static struct dentry *f2fs_debugfs_root; #endif @@ -39,7 +39,7 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = BLKS_PER_SEC(sbi); + blks_per_sec = CAP_BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, true); @@ -91,11 +91,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->aw_cnt = sbi->atomic_files; - si->vw_cnt = atomic_read(&sbi->vw_cnt); + si->aw_cnt = atomic_read(&sbi->atomic_files); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); - si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); @@ -120,6 +117,13 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; } + si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt); + si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt); + si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt); + spin_lock(&sbi->cprc_info.stat_lock); + si->cur_ckpt_time = sbi->cprc_info.cur_time; + si->peak_ckpt_time = sbi->cprc_info.peak_time; + spin_unlock(&sbi->cprc_info.stat_lock); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -131,7 +135,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->compr_inode = atomic_read(&sbi->compr_inode); - 
si->compr_blocks = atomic_read(&sbi->compr_blocks); + si->swapfile_inode = atomic_read(&sbi->swapfile_inode); + si->compr_blocks = atomic64_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; @@ -145,8 +150,14 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->node_pages = NODE_MAPPING(sbi)->nrpages; if (sbi->meta_inode) si->meta_pages = META_MAPPING(sbi)->nrpages; - si->nats = NM_I(sbi)->nat_cnt; - si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (sbi->compress_inode) { + si->compress_pages = COMPRESS_MAPPING(sbi)->nrpages; + si->compress_page_hit = atomic_read(&sbi->compress_page_hit); + } +#endif + si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT]; + si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT]; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; @@ -154,8 +165,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc = sbi->other_skip_bggc; - si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; - si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; @@ -164,8 +173,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; si->util_invalid = 50 - si->util_free - si->util_valid; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + si->curseg[i] = curseg->segno; si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); @@ -174,6 +184,26 @@ static 
void update_general_status(struct f2fs_sb_info *sbi) for (i = META_CP; i < META_MAX; i++) si->meta_count[i] = atomic_read(&sbi->meta_count[i]); + for (i = 0; i < NO_CHECK_TYPE; i++) { + si->dirty_seg[i] = 0; + si->full_seg[i] = 0; + si->valid_blks[i] = 0; + } + + for (i = 0; i < MAIN_SEGS(sbi); i++) { + int blks = get_seg_entry(sbi, i)->valid_blocks; + int type = get_seg_entry(sbi, i)->type; + + if (!blks) + continue; + + if (blks == sbi->blocks_per_seg) + si->full_seg[type]++; + else + si->dirty_seg[type]++; + si->valid_blks[type] += blks; + } + for (i = 0; i < 2; i++) { si->segment_count[i] = sbi->segment_count[i]; si->block_count[i] = sbi->block_count[i]; @@ -258,10 +288,10 @@ get_cache: si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * sizeof(struct free_nid); - si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); - si->cache_mem += NM_I(sbi)->dirty_nat_cnt * - sizeof(struct nat_entry_set); - si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); + si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry); + si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * + sizeof(struct nat_entry_set); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * @@ -272,35 +302,70 @@ get_cache: si->page_mem = 0; if (sbi->node_inode) { unsigned npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { unsigned npages = META_MAPPING(sbi)->nrpages; + + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (sbi->compress_inode) { + unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } +#endif } +static char *s_flag[] = { + [SBI_IS_DIRTY] = " fs_dirty", + [SBI_IS_CLOSE] = " closing", + [SBI_NEED_FSCK] = " need_fsck", + [SBI_POR_DOING] = " recovering", + 
[SBI_NEED_SB_WRITE] = " sb_dirty", + [SBI_NEED_CP] = " need_cp", + [SBI_IS_SHUTDOWN] = " shutdown", + [SBI_IS_RECOVERED] = " recovered", + [SBI_CP_DISABLED] = " cp_disabled", + [SBI_CP_DISABLED_QUICK] = " cp_disabled_quick", + [SBI_QUOTA_NEED_FLUSH] = " quota_need_flush", + [SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush", + [SBI_QUOTA_NEED_REPAIR] = " quota_need_repair", + [SBI_IS_RESIZEFS] = " resizefs", + [SBI_IS_FREEZING] = " freezefs", +}; + static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; - int i = 0; - int j; + int i = 0, j = 0; + unsigned long flags; - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_readonly(si->sbi->sb) ? "RO" : "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? - "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good")); + "Disabled" : (f2fs_cp_error(si->sbi) ? 
"Error" : "Good")); + if (si->sbi->s_flag) { + seq_puts(s, "[SBI:"); + for_each_set_bit(j, &si->sbi->s_flag, 32) + seq_puts(s, s_flag[j]); + seq_puts(s, "]\n"); + } seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", si->overp_segs, si->rsvd_segs); + seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n", + ktime_get_boottime_seconds(), + SIT_I(si->sbi)->mounted_time); if (test_opt(si->sbi, DISCARD)) seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", si->utilization, si->valid_count, si->discard_blks); @@ -319,37 +384,67 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Compressed Inode: %u, Blocks: %u\n", + seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", si->compr_inode, si->compr_blocks); + seq_printf(s, " - Swapfile Inode: %u\n", + si->swapfile_inode); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); - seq_printf(s, " - COLD data: %d, %d, %d\n", + seq_printf(s, " TYPE %8s %8s %8s %10s %10s %10s\n", + "segno", "secno", "zoneno", "dirty_seg", "full_seg", "valid_blk"); + seq_printf(s, " - COLD data: %8d %8d %8d %10u %10u %10u\n", si->curseg[CURSEG_COLD_DATA], si->cursec[CURSEG_COLD_DATA], - si->curzone[CURSEG_COLD_DATA]); - seq_printf(s, " - WARM data: %d, %d, %d\n", + si->curzone[CURSEG_COLD_DATA], + si->dirty_seg[CURSEG_COLD_DATA], + si->full_seg[CURSEG_COLD_DATA], + si->valid_blks[CURSEG_COLD_DATA]); + seq_printf(s, " - WARM data: %8d %8d %8d %10u %10u %10u\n", si->curseg[CURSEG_WARM_DATA], si->cursec[CURSEG_WARM_DATA], - si->curzone[CURSEG_WARM_DATA]); - seq_printf(s, " - HOT data: 
%d, %d, %d\n", + si->curzone[CURSEG_WARM_DATA], + si->dirty_seg[CURSEG_WARM_DATA], + si->full_seg[CURSEG_WARM_DATA], + si->valid_blks[CURSEG_WARM_DATA]); + seq_printf(s, " - HOT data: %8d %8d %8d %10u %10u %10u\n", si->curseg[CURSEG_HOT_DATA], si->cursec[CURSEG_HOT_DATA], - si->curzone[CURSEG_HOT_DATA]); - seq_printf(s, " - Dir dnode: %d, %d, %d\n", + si->curzone[CURSEG_HOT_DATA], + si->dirty_seg[CURSEG_HOT_DATA], + si->full_seg[CURSEG_HOT_DATA], + si->valid_blks[CURSEG_HOT_DATA]); + seq_printf(s, " - Dir dnode: %8d %8d %8d %10u %10u %10u\n", si->curseg[CURSEG_HOT_NODE], si->cursec[CURSEG_HOT_NODE], - si->curzone[CURSEG_HOT_NODE]); - seq_printf(s, " - File dnode: %d, %d, %d\n", + si->curzone[CURSEG_HOT_NODE], + si->dirty_seg[CURSEG_HOT_NODE], + si->full_seg[CURSEG_HOT_NODE], + si->valid_blks[CURSEG_HOT_NODE]); + seq_printf(s, " - File dnode: %8d %8d %8d %10u %10u %10u\n", si->curseg[CURSEG_WARM_NODE], si->cursec[CURSEG_WARM_NODE], - si->curzone[CURSEG_WARM_NODE]); - seq_printf(s, " - Indir nodes: %d, %d, %d\n", + si->curzone[CURSEG_WARM_NODE], + si->dirty_seg[CURSEG_WARM_NODE], + si->full_seg[CURSEG_WARM_NODE], + si->valid_blks[CURSEG_WARM_NODE]); + seq_printf(s, " - Indir nodes: %8d %8d %8d %10u %10u %10u\n", si->curseg[CURSEG_COLD_NODE], si->cursec[CURSEG_COLD_NODE], - si->curzone[CURSEG_COLD_NODE]); + si->curzone[CURSEG_COLD_NODE], + si->dirty_seg[CURSEG_COLD_NODE], + si->full_seg[CURSEG_COLD_NODE], + si->valid_blks[CURSEG_COLD_NODE]); + seq_printf(s, " - Pinned file: %8d %8d %8d\n", + si->curseg[CURSEG_COLD_DATA_PINNED], + si->cursec[CURSEG_COLD_DATA_PINNED], + si->curzone[CURSEG_COLD_DATA_PINNED]); + seq_printf(s, " - ATGC data: %8d %8d %8d\n", + si->curseg[CURSEG_ALL_DATA_ATGC], + si->cursec[CURSEG_ALL_DATA_ATGC], + si->curzone[CURSEG_ALL_DATA_ATGC]); seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n", si->main_area_segs - si->dirty_count - si->prefree_count - si->free_segs, @@ -365,22 +460,34 @@ static int stat_show(struct seq_file *s, void *v) 
si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", + si->nr_queued_ckpt, si->nr_issued_ckpt, + si->nr_total_ckpt, si->cur_ckpt_time, + si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", si->data_segs, si->bg_data_segs); seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); + seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " + "Idle Greedy (%d), Idle AT (%d), " + "Urgent High (%d), Urgent Mid (%d), " + "Urgent Low (%d)\n", + si->sbi->gc_reclaimed_segs[GC_NORMAL], + si->sbi->gc_reclaimed_segs[GC_IDLE_CB], + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], + si->sbi->gc_reclaimed_segs[GC_IDLE_AT], + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], + si->sbi->gc_reclaimed_segs[GC_URGENT_MID], + si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "Skipped : atomic write %llu (%llu)\n", - si->skipped_atomic_files[BG_GC] + - si->skipped_atomic_files[FG_GC], - si->skipped_atomic_files[BG_GC]); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); seq_puts(s, "\nExtent Cache:\n"); @@ -405,10 +512,9 @@ static int stat_show(struct seq_file *s, void *v) si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " - "volatile IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt, - si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - atomic IO: %4d (Max. 
%4d)\n", + si->aw_cnt, si->max_aw_cnt); + seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -421,6 +527,9 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_meta, si->meta_pages); seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); + seq_printf(s, " - fsync mark: %4lld\n", + percpu_counter_sum_positive( + &si->sbi->rf_node_block_count)); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", @@ -462,7 +571,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); return 0; } @@ -473,6 +582,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; + unsigned long flags; int i; si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); @@ -499,18 +609,18 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->compr_inode, 0); - atomic_set(&sbi->compr_blocks, 0); + atomic64_set(&sbi->compr_blocks, 0); + atomic_set(&sbi->swapfile_inode, 0); + atomic_set(&sbi->atomic_files, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); - atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); - atomic_set(&sbi->max_vw_cnt, 0); - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_add_tail(&si->stat_list, &f2fs_stat_list); - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); return 0; } @@ -518,12 +628,13 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) 
void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned long flags; - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_del(&si->stat_list); - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); - kvfree(si); + kfree(si); } void __init f2fs_create_root_stats(void) @@ -531,7 +642,7 @@ void __init f2fs_create_root_stats(void) #ifdef CONFIG_DEBUG_FS f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); - debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, + debugfs_create_file("status", 0444, f2fs_debugfs_root, NULL, &stat_fops); #endif } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 27d0dd7a16d6..21960a899b6a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -5,6 +5,7 @@ * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ +#include <asm/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/sched/signal.h> @@ -15,6 +16,10 @@ #include "xattr.h" #include <trace/events/f2fs.h> +#if IS_ENABLED(CONFIG_UNICODE) +extern struct kmem_cache *f2fs_cf_name_slab; +#endif + static unsigned long dir_blocks(struct inode *inode) { return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) @@ -70,6 +75,114 @@ unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de) return DT_UNKNOWN; } +/* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. 
*/ +int f2fs_init_casefolded_name(const struct inode *dir, + struct f2fs_filename *fname) +{ +#if IS_ENABLED(CONFIG_UNICODE) + struct super_block *sb = dir->i_sb; + + if (IS_CASEFOLDED(dir) && + !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { + fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, + GFP_NOFS, false, F2FS_SB(sb)); + if (!fname->cf_name.name) + return -ENOMEM; + fname->cf_name.len = utf8_casefold(sb->s_encoding, + fname->usr_fname, + fname->cf_name.name, + F2FS_NAME_LEN); + if ((int)fname->cf_name.len <= 0) { + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); + fname->cf_name.name = NULL; + if (sb_has_strict_encoding(sb)) + return -EINVAL; + /* fall back to treating name as opaque byte sequence */ + } + } +#endif + return 0; +} + +static int __f2fs_setup_filename(const struct inode *dir, + const struct fscrypt_name *crypt_name, + struct f2fs_filename *fname) +{ + int err; + + memset(fname, 0, sizeof(*fname)); + + fname->usr_fname = crypt_name->usr_fname; + fname->disk_name = crypt_name->disk_name; +#ifdef CONFIG_FS_ENCRYPTION + fname->crypto_buf = crypt_name->crypto_buf; +#endif + if (crypt_name->is_nokey_name) { + /* hash was decoded from the no-key name */ + fname->hash = cpu_to_le32(crypt_name->hash); + } else { + err = f2fs_init_casefolded_name(dir, fname); + if (err) { + f2fs_free_filename(fname); + return err; + } + f2fs_hash_filename(dir, fname); + } + return 0; +} + +/* + * Prepare to search for @iname in @dir. This is similar to + * fscrypt_setup_filename(), but this also handles computing the casefolded name + * and the f2fs dirhash if needed, then packing all the information about this + * filename up into a 'struct f2fs_filename'. 
+ */ +int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct f2fs_filename *fname) +{ + struct fscrypt_name crypt_name; + int err; + + err = fscrypt_setup_filename(dir, iname, lookup, &crypt_name); + if (err) + return err; + + return __f2fs_setup_filename(dir, &crypt_name, fname); +} + +/* + * Prepare to look up @dentry in @dir. This is similar to + * fscrypt_prepare_lookup(), but this also handles computing the casefolded name + * and the f2fs dirhash if needed, then packing all the information about this + * filename up into a 'struct f2fs_filename'. + */ +int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct f2fs_filename *fname) +{ + struct fscrypt_name crypt_name; + int err; + + err = fscrypt_prepare_lookup(dir, dentry, &crypt_name); + if (err) + return err; + + return __f2fs_setup_filename(dir, &crypt_name, fname); +} + +void f2fs_free_filename(struct f2fs_filename *fname) +{ +#ifdef CONFIG_FS_ENCRYPTION + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + if (fname->cf_name.name) { + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); + fname->cf_name.name = NULL; + } +#endif +} + static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { @@ -84,134 +197,98 @@ static unsigned long dir_block_index(unsigned int level, static struct f2fs_dir_entry *find_in_block(struct inode *dir, struct page *dentry_page, - struct fscrypt_name *fname, - f2fs_hash_t namehash, - int *max_slots, - struct page **res_page) + const struct f2fs_filename *fname, + int *max_slots) { struct f2fs_dentry_block *dentry_blk; - struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(dir, &d, dentry_blk); - de = f2fs_find_target_dentry(fname, namehash, max_slots, &d); - if (de) - *res_page = dentry_page; - - return de; + return f2fs_find_target_dentry(&d, 
fname, max_slots); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* * Test whether a case-insensitive directory entry matches the filename * being searched for. * - * Returns: 0 if the directory entry matches, more than 0 if it - * doesn't match or less than zero on error. + * Returns 1 for a match, 0 for no match, and -errno on an error. */ -int f2fs_ci_compare(const struct inode *parent, const struct qstr *name, - const struct qstr *entry, bool quick) +static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, + const u8 *de_name, u32 de_name_len) { - const struct f2fs_sb_info *sbi = F2FS_SB(parent->i_sb); - const struct unicode_map *um = sbi->s_encoding; - int ret; + const struct super_block *sb = dir->i_sb; + const struct unicode_map *um = sb->s_encoding; + struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); + struct qstr entry = QSTR_INIT(de_name, de_name_len); + int res; - if (quick) - ret = utf8_strncasecmp_folded(um, name, entry); - else - ret = utf8_strncasecmp(um, name, entry); + if (IS_ENCRYPTED(dir)) { + const struct fscrypt_str encrypted_name = + FSTR_INIT((u8 *)de_name, de_name_len); - if (ret < 0) { - /* Handle invalid character sequence as either an error - * or as an opaque byte sequence. 
- */ - if (f2fs_has_strict_mode(sbi)) + if (WARN_ON_ONCE(!fscrypt_has_encryption_key(dir))) return -EINVAL; - if (name->len != entry->len) - return 1; - - return !!memcmp(name->name, entry->name, name->len); - } - - return ret; -} - -static void f2fs_fname_setup_ci_filename(struct inode *dir, - const struct qstr *iname, - struct fscrypt_str *cf_name) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - - if (!IS_CASEFOLDED(dir)) { - cf_name->name = NULL; - return; + decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); + if (!decrypted_name.name) + return -ENOMEM; + res = fscrypt_fname_disk_to_usr(dir, 0, 0, &encrypted_name, + &decrypted_name); + if (res < 0) + goto out; + entry.name = decrypted_name.name; + entry.len = decrypted_name.len; } - cf_name->name = f2fs_kmalloc(sbi, F2FS_NAME_LEN, GFP_NOFS); - if (!cf_name->name) - return; - - cf_name->len = utf8_casefold(sbi->s_encoding, - iname, cf_name->name, - F2FS_NAME_LEN); - if ((int)cf_name->len <= 0) { - kvfree(cf_name->name); - cf_name->name = NULL; + res = utf8_strncasecmp_folded(um, name, &entry); + /* + * In strict mode, ignore invalid names. In non-strict mode, + * fall back to treating them as opaque byte sequences. 
+ */ + if (res < 0 && !sb_has_strict_encoding(sb)) { + res = name->len == entry.len && + memcmp(name->name, entry.name, name->len) == 0; + } else { + /* utf8_strncasecmp_folded returns 0 on match */ + res = (res == 0); } +out: + kfree(decrypted_name.name); + return res; } -#endif +#endif /* CONFIG_UNICODE */ -static inline bool f2fs_match_name(struct f2fs_dentry_ptr *d, - struct f2fs_dir_entry *de, - struct fscrypt_name *fname, - struct fscrypt_str *cf_str, - unsigned long bit_pos, - f2fs_hash_t namehash) +static inline int f2fs_match_name(const struct inode *dir, + const struct f2fs_filename *fname, + const u8 *de_name, u32 de_name_len) { -#ifdef CONFIG_UNICODE - struct inode *parent = d->inode; - struct f2fs_sb_info *sbi = F2FS_I_SB(parent); - struct qstr entry; -#endif + struct fscrypt_name f; - if (de->hash_code != namehash) - return false; +#if IS_ENABLED(CONFIG_UNICODE) + if (fname->cf_name.name) { + struct qstr cf = FSTR_TO_QSTR(&fname->cf_name); -#ifdef CONFIG_UNICODE - entry.name = d->filename[bit_pos]; - entry.len = de->name_len; - - if (sbi->s_encoding && IS_CASEFOLDED(parent)) { - if (cf_str->name) { - struct qstr cf = {.name = cf_str->name, - .len = cf_str->len}; - return !f2fs_ci_compare(parent, &cf, &entry, true); - } - return !f2fs_ci_compare(parent, fname->usr_fname, &entry, - false); + return f2fs_match_ci_name(dir, &cf, de_name, de_name_len); } #endif - if (fscrypt_match_name(fname, d->filename[bit_pos], - le16_to_cpu(de->name_len))) - return true; - return false; + f.usr_fname = fname->usr_fname; + f.disk_name = fname->disk_name; +#ifdef CONFIG_FS_ENCRYPTION + f.crypto_buf = fname->crypto_buf; +#endif + return fscrypt_match_name(&f, de_name, de_name_len); } -struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, - f2fs_hash_t namehash, int *max_slots, - struct f2fs_dentry_ptr *d) +struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, + const struct f2fs_filename *fname, int *max_slots) { struct 
f2fs_dir_entry *de; - struct fscrypt_str cf_str = { .name = NULL, .len = 0 }; unsigned long bit_pos = 0; int max_len = 0; - -#ifdef CONFIG_UNICODE - f2fs_fname_setup_ci_filename(d->inode, fname->usr_fname, &cf_str); -#endif + int res = 0; if (max_slots) *max_slots = 0; @@ -229,8 +306,15 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, continue; } - if (f2fs_match_name(d, de, fname, &cf_str, bit_pos, namehash)) - goto found; + if (de->hash_code == fname->hash) { + res = f2fs_match_name(d->inode, fname, + d->filename[bit_pos], + le16_to_cpu(de->name_len)); + if (res < 0) + return ERR_PTR(res); + if (res) + goto found; + } if (max_slots && max_len > *max_slots) *max_slots = max_len; @@ -243,33 +327,27 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, found: if (max_slots && max_len > *max_slots) *max_slots = max_len; - -#ifdef CONFIG_UNICODE - kvfree(cf_str.name); -#endif return de; } static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int level, - struct fscrypt_name *fname, + const struct f2fs_filename *fname, struct page **res_page) { - struct qstr name = FSTR_TO_QSTR(&fname->disk_name); - int s = GET_DENTRY_SLOTS(name.len); + int s = GET_DENTRY_SLOTS(fname->disk_name.len); unsigned int nbucket, nblock; unsigned int bidx, end_block; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - f2fs_hash_t namehash = f2fs_dentry_hash(dir, &name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, - le32_to_cpu(namehash) % nbucket); + le32_to_cpu(fname->hash) % nbucket); end_block = bidx + nblock; for (; bidx < end_block; bidx++) { @@ -285,18 +363,23 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, } } - de = find_in_block(dir, dentry_page, fname, namehash, - &max_slots, res_page); - if (de) + de = find_in_block(dir, dentry_page, fname, 
&max_slots); + if (IS_ERR(de)) { + *res_page = ERR_CAST(de); + de = NULL; break; + } else if (de) { + *res_page = dentry_page; + break; + } if (max_slots >= s) room = true; f2fs_put_page(dentry_page, 0); } - if (!de && room && F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; + if (!de && room && F2FS_I(dir)->chash != fname->hash) { + F2FS_I(dir)->chash = fname->hash; F2FS_I(dir)->clevel = level; } @@ -304,23 +387,23 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, } struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, - struct fscrypt_name *fname, struct page **res_page) + const struct f2fs_filename *fname, + struct page **res_page) { unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; unsigned int max_depth; unsigned int level; + *res_page = NULL; + if (f2fs_has_inline_dentry(dir)) { - *res_page = NULL; de = f2fs_find_in_inline_dir(dir, fname, res_page); goto out; } - if (npages == 0) { - *res_page = NULL; + if (npages == 0) goto out; - } max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { @@ -331,7 +414,6 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, } for (level = 0; level < max_depth; level++) { - *res_page = NULL; de = find_in_level(dir, level, fname, res_page); if (de || IS_ERR(*res_page)) break; @@ -353,18 +435,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, const struct qstr *child, struct page **res_page) { struct f2fs_dir_entry *de = NULL; - struct fscrypt_name fname; + struct f2fs_filename fname; int err; -#ifdef CONFIG_UNICODE - if (f2fs_has_strict_mode(F2FS_I_SB(dir)) && IS_CASEFOLDED(dir) && - utf8_validate(F2FS_I_SB(dir)->s_encoding, child)) { - *res_page = ERR_PTR(-EINVAL); - return NULL; - } -#endif - - err = fscrypt_setup_filename(dir, child, 1, &fname); + err = f2fs_setup_filename(dir, child, 1, &fname); if (err) { if (err == -ENOENT) *res_page = NULL; @@ -375,15 +449,13 @@ struct f2fs_dir_entry 
*f2fs_find_entry(struct inode *dir, de = __f2fs_find_entry(dir, &fname, res_page); - fscrypt_free_filename(&fname); + f2fs_free_filename(&fname); return de; } struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct qstr dotdot = QSTR_INIT("..", 2); - - return f2fs_find_entry(dir, &dotdot, p); + return f2fs_find_entry(dir, &dotdot_name, p); } ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, @@ -405,6 +477,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; + lock_page(page); f2fs_wait_on_page_writeback(page, type, true, true); de->ino = cpu_to_le32(inode->i_ino); @@ -416,24 +489,47 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_put_page(page, 1); } -static void init_dent_inode(const struct qstr *name, struct page *ipage) +static void init_dent_inode(struct inode *dir, struct inode *inode, + const struct f2fs_filename *fname, + struct page *ipage) { struct f2fs_inode *ri; + if (!fname) /* tmpfile case? */ + return; + f2fs_wait_on_page_writeback(ipage, NODE, true, true); /* copy name info. to this inode page */ ri = F2FS_INODE(ipage); - ri->i_namelen = cpu_to_le32(name->len); - memcpy(ri->i_name, name->name, name->len); + ri->i_namelen = cpu_to_le32(fname->disk_name.len); + memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); + if (IS_ENCRYPTED(dir)) { + file_set_enc_name(inode); + /* + * Roll-forward recovery doesn't have encryption keys available, + * so it can't compute the dirhash for encrypted+casefolded + * filenames. Append it to i_name if possible. Else, disable + * roll-forward recovery of the dentry (i.e., make fsync'ing the + * file force a checkpoint) by setting LOST_PINO. 
+ */ + if (IS_CASEFOLDED(dir)) { + if (fname->disk_name.len + sizeof(f2fs_hash_t) <= + F2FS_NAME_LEN) + put_unaligned(fname->hash, (f2fs_hash_t *) + &ri->i_name[fname->disk_name.len]); + else + file_lost_pino(inode); + } + } set_page_dirty(ipage); } void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { - struct qstr dot = QSTR_INIT(".", 1); - struct qstr dotdot = QSTR_INIT("..", 2); + struct fscrypt_str dot = FSTR_INIT(".", 1); + struct fscrypt_str dotdot = FSTR_INIT("..", 2); /* update dirent of "." */ f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0); @@ -467,11 +563,9 @@ static int make_empty_dir(struct inode *inode, } struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, - const struct qstr *new_name, const struct qstr *orig_name, - struct page *dpage) + const struct f2fs_filename *fname, struct page *dpage) { struct page *page; - int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir)); int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { @@ -494,13 +588,13 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, if (err) goto put_error; - err = f2fs_init_security(inode, dir, orig_name, page); + err = f2fs_init_security(inode, dir, + fname ? fname->usr_fname : NULL, page); if (err) goto put_error; - if ((IS_ENCRYPTED(dir) || dummy_encrypt) && - f2fs_may_encrypt(inode)) { - err = fscrypt_inherit_context(dir, inode, page, false); + if (IS_ENCRYPTED(inode)) { + err = fscrypt_set_context(inode, page); if (err) goto put_error; } @@ -510,11 +604,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, return page; } - if (new_name) { - init_dent_inode(new_name, page); - if (IS_ENCRYPTED(dir)) - file_set_enc_name(inode); - } + init_dent_inode(dir, inode, fname, page); /* * This file should be checkpointed during fsync. 
@@ -579,11 +669,11 @@ next: } bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, - struct fscrypt_name *fname) + const struct f2fs_filename *fname) { struct f2fs_dentry_ptr d; unsigned int bit_pos; - int slots = GET_DENTRY_SLOTS(fname_len(fname)); + int slots = GET_DENTRY_SLOTS(fname->disk_name.len); make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ipage)); @@ -593,8 +683,8 @@ bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, } void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, - const struct qstr *name, f2fs_hash_t name_hash, - unsigned int bit_pos) + const struct fscrypt_str *name, f2fs_hash_t name_hash, + unsigned int bit_pos) { struct f2fs_dir_entry *de; int slots = GET_DENTRY_SLOTS(name->len); @@ -614,15 +704,13 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, } } -int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, - const struct qstr *orig_name, - struct inode *inode, nid_t ino, umode_t mode) +int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; unsigned int level; unsigned int current_depth; unsigned long bidx, block; - f2fs_hash_t dentry_hash; unsigned int nbucket, nblock; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; @@ -631,11 +719,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, int slots, err = 0; level = 0; - slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(dir, new_name, NULL); + slots = GET_DENTRY_SLOTS(fname->disk_name.len); current_depth = F2FS_I(dir)->i_current_depth; - if (F2FS_I(dir)->chash == dentry_hash) { + if (F2FS_I(dir)->chash == fname->hash) { level = F2FS_I(dir)->clevel; F2FS_I(dir)->chash = 0; } @@ -657,7 +744,7 @@ start: nblock = bucket_blocks(level); bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, - (le32_to_cpu(dentry_hash) % 
nbucket)); + (le32_to_cpu(fname->hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = f2fs_get_new_data_page(dir, NULL, block, true); @@ -680,9 +767,8 @@ add_dentry: f2fs_wait_on_page_writeback(dentry_page, DATA, true, true); if (inode) { - down_write(&F2FS_I(inode)->i_sem); - page = f2fs_init_inode_metadata(inode, dir, new_name, - orig_name, NULL); + f2fs_down_write(&F2FS_I(inode)->i_sem); + page = f2fs_init_inode_metadata(inode, dir, fname, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -690,7 +776,8 @@ add_dentry: } make_dentry_ptr_block(NULL, &d, dentry_blk); - f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash, + bit_pos); set_page_dirty(dentry_page); @@ -707,28 +794,22 @@ add_dentry: f2fs_update_parent_metadata(dir, inode, current_depth); fail: if (inode) - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); f2fs_put_page(dentry_page, 1); return err; } -int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname, - struct inode *inode, nid_t ino, umode_t mode) +int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode) { - struct qstr new_name; int err = -EAGAIN; - new_name.name = fname_name(fname); - new_name.len = fname_len(fname); - if (f2fs_has_inline_dentry(dir)) - err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname, - inode, ino, mode); + err = f2fs_add_inline_entry(dir, fname, inode, ino, mode); if (err == -EAGAIN) - err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname, - inode, ino, mode); + err = f2fs_add_regular_entry(dir, fname, inode, ino, mode); f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; @@ -741,17 +822,17 @@ int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname, int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { - 
struct fscrypt_name fname; + struct f2fs_filename fname; struct page *page = NULL; struct f2fs_dir_entry *de = NULL; int err; - err = fscrypt_setup_filename(dir, name, 0, &fname); + err = f2fs_setup_filename(dir, name, 0, &fname); if (err) return err; /* - * An immature stakable filesystem shows a race condition between lookup + * An immature stackable filesystem shows a race condition between lookup * and create. If we have same task when doing lookup and create, it's * definitely fine as expected by VFS normally. Otherwise, let's just * verify on-disk dentry one more time, which guarantees filesystem @@ -769,7 +850,7 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name, } else { err = f2fs_add_dentry(dir, &fname, inode, ino, mode); } - fscrypt_free_filename(&fname); + f2fs_free_filename(&fname); return err; } @@ -778,8 +859,8 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) struct page *page; int err = 0; - down_write(&F2FS_I(inode)->i_sem); - page = f2fs_init_inode_metadata(inode, dir, NULL, NULL, NULL); + f2fs_down_write(&F2FS_I(inode)->i_sem); + page = f2fs_init_inode_metadata(inode, dir, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -789,7 +870,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) clear_inode_flag(inode, FI_NEW_INODE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); fail: - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); return err; } @@ -797,7 +878,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); @@ -808,7 +889,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) f2fs_i_links_write(inode, false); f2fs_i_size_write(inode, 0); } - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); if (inode->i_nlink == 0) 
f2fs_add_orphan_inode(inode); @@ -850,23 +931,27 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, 0); set_page_dirty(page); - dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir, false); - - if (inode) - f2fs_drop_nlink(dir, inode); - if (bit_pos == NR_DENTRY_IN_BLOCK && !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); - f2fs_clear_page_private(page); ClearPageUptodate(page); - clear_cold_data(page); + + clear_page_private_gcing(page); + inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); + + detach_page_private(page); + set_page_private(page, 0); } f2fs_put_page(page, 1); + + dir->i_ctime = dir->i_mtime = current_time(dir); + f2fs_mark_inode_dirty_sync(dir, false); + + if (inode) + f2fs_drop_nlink(dir, inode); } bool f2fs_empty_dir(struct inode *dir) @@ -916,6 +1001,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; bool readdir_ra = sbi->readdir_ra == 1; + bool found_valid_dirent = false; int err = 0; bit_pos = ((unsigned long)ctx->pos % d->max); @@ -930,13 +1016,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, de = &d->dentry[bit_pos]; if (de->name_len == 0) { + if (found_valid_dirent || !bit_pos) { + printk_ratelimited( + "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, sbi->sb->s_id, + le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } bit_pos++; ctx->pos = start_pos + bit_pos; - printk_ratelimited( - "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", - KERN_WARNING, sbi->sb->s_id, - le32_to_cpu(de->ino)); - set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } @@ -953,6 +1041,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, __func__, le16_to_cpu(de->name_len)); set_sbi_flag(sbi, SBI_NEED_FSCK); err = 
-EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT); goto out; } @@ -979,6 +1068,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); ctx->pos = start_pos + bit_pos; + found_valid_dirent = true; } out: if (readdir_ra) @@ -1000,11 +1090,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) int err = 0; if (IS_ENCRYPTED(inode)) { - err = fscrypt_get_encryption_info(inode); + err = fscrypt_prepare_readdir(inode); if (err) goto out; - err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); + err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr); if (err < 0) goto out; } @@ -1059,71 +1149,13 @@ out: return err < 0 ? err : 0; } -static int f2fs_dir_open(struct inode *inode, struct file *filp) -{ - if (IS_ENCRYPTED(inode)) - return fscrypt_get_encryption_info(inode) ? -EACCES : 0; - return 0; -} - const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = f2fs_readdir, .fsync = f2fs_sync_file, - .open = f2fs_dir_open, .unlocked_ioctl = f2fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, #endif }; - -#ifdef CONFIG_UNICODE -static int f2fs_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) -{ - struct qstr qstr = {.name = str, .len = len }; - const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *inode = READ_ONCE(parent->d_inode); - - if (!inode || !IS_CASEFOLDED(inode)) { - if (len != name->len) - return -1; - return memcmp(str, name->name, len); - } - - return f2fs_ci_compare(inode, name, &qstr, false); -} - -static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - const struct unicode_map *um = sbi->s_encoding; - const struct inode *inode = READ_ONCE(dentry->d_inode); - unsigned char *norm; - int len, ret = 0; - - if (!inode || 
!IS_CASEFOLDED(inode)) - return 0; - - norm = f2fs_kmalloc(sbi, PATH_MAX, GFP_ATOMIC); - if (!norm) - return -ENOMEM; - - len = utf8_casefold(um, str, norm, PATH_MAX); - if (len < 0) { - if (f2fs_has_strict_mode(sbi)) - ret = -EINVAL; - goto out; - } - str->hash = full_name_hash(dentry, norm, len); -out: - kvfree(norm); - return ret; -} - -const struct dentry_operations f2fs_dentry_ops = { - .d_hash = f2fs_d_hash, - .d_compare = f2fs_d_compare, -}; -#endif diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index e60078460ad1..932c070173b9 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -58,6 +58,29 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, return re; } +struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned long long key, bool *leftmost) +{ + struct rb_node **p = &root->rb_root.rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (key < re->key) { + p = &(*p)->rb_left; + } else { + p = &(*p)->rb_right; + *leftmost = false; + } + } + + return p; +} + struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -166,7 +189,7 @@ lookup_neighbors: } bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root) + struct rb_root_cached *root, bool check_key) { #ifdef CONFIG_F2FS_CHECK_FS struct rb_node *cur = rb_first_cached(root), *next; @@ -183,13 +206,23 @@ bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, cur_re = rb_entry(cur, struct rb_entry, rb_node); next_re = rb_entry(next, struct rb_entry, rb_node); + if (check_key) { + if (cur_re->key > next_re->key) { + f2fs_info(sbi, "inconsistent rbtree, " + "cur(%llu) next(%llu)", + cur_re->key, next_re->key); + return false; + } + goto next; + } + if (cur_re->ofs + cur_re->len > next_re->ofs) { 
f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)", cur_re->ofs, cur_re->len, next_re->ofs, next_re->len); return false; } - +next: cur = next; } #endif @@ -206,7 +239,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, { struct extent_node *en; - en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); + en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); if (!en) return NULL; @@ -259,7 +292,8 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { - et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); + et = f2fs_kmem_cache_alloc(extent_tree_slab, + GFP_NOFS, true, NULL); f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; @@ -325,9 +359,10 @@ static void __drop_largest_extent(struct extent_tree *et, } /* return true, if inode page is changed */ -static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_extent *i_ext = ipage ? 
&F2FS_INODE(ipage)->i_ext : NULL; struct extent_tree *et; struct extent_node *en; struct extent_info ei; @@ -335,16 +370,18 @@ static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_e if (!f2fs_may_extent_tree(inode)) { /* drop largest extent */ if (i_ext && i_ext->len) { + f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; - return true; + set_page_dirty(ipage); + return; } - return false; + return; } et = __grab_extent_tree(inode); if (!i_ext || !i_ext->len) - return false; + return; get_extent_info(&ei, i_ext); @@ -360,17 +397,14 @@ static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_e } out: write_unlock(&et->lock); - return false; } -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) { - bool ret = __f2fs_init_extent_tree(inode, i_ext); + __f2fs_init_extent_tree(inode, ipage); if (!F2FS_I(inode)->extent_tree) set_inode_flag(inode, FI_NO_EXTENT); - - return ret; } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -510,7 +544,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0); write_lock(&et->lock); @@ -549,7 +583,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, org_end = dei.fofs + dei.len; f2fs_bug_on(sbi, pos >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { en->ei.len = pos - en->ei.fofs; prev_en = en; parts = 1; @@ -628,6 +662,47 @@ static void f2fs_update_extent_tree_range(struct inode *inode, f2fs_mark_inode_dirty_sync(inode, true); } +#ifdef CONFIG_F2FS_FS_COMPRESSION +void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + 
unsigned int c_len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; + struct extent_info ei; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + bool leftmost = false; + + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len); + + /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return; + + write_lock(&et->lock); + + en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false, + &leftmost); + if (en) + goto unlock_out; + + set_extent_info(&ei, fofs, blkaddr, llen); + ei.c_len = c_len; + + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +unlock_out: + write_unlock(&et->lock); +} +#endif + unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { struct extent_tree *et, *next; @@ -729,9 +804,8 @@ void f2fs_drop_extent_tree(struct inode *inode) if (!f2fs_may_extent_tree(inode)) return; - set_inode_flag(inode, FI_NO_EXTENT); - write_lock(&et->lock); + set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); if (et->largest.len) { et->largest.len = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5355be6b6755..e6355a5683b7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/f2fs.h * @@ -18,24 +18,26 @@ #include <linux/kobject.h> #include <linux/sched.h> #include <linux/cred.h> +#include <linux/sched/mm.h> #include <linux/vmalloc.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/quotaops.h> +#include <linux/part_stat.h> #include <crypto/hash.h> #include 
<linux/fscrypt.h> #include <linux/fsverity.h> +struct pagevec; + #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else #define f2fs_bug_on(sbi, condition) \ do { \ - if (unlikely(condition)) { \ - WARN_ON(1); \ + if (WARN_ON(condition)) \ set_sbi_flag(sbi, SBI_NEED_FSCK); \ - } \ } while (0) #endif @@ -44,7 +46,7 @@ enum { FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, - FAULT_ALLOC_BIO, + FAULT_ALLOC_BIO, /* it's obsolete due to bio_alloc() will never fail */ FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -55,6 +57,9 @@ enum { FAULT_CHECKPOINT, FAULT_DISCARD, FAULT_WRITE_IO, + FAULT_SLAB_ALLOC, + FAULT_DQUOT_INIT, + FAULT_LOCK_OP, FAULT_MAX, }; @@ -74,7 +79,6 @@ extern const char *f2fs_fault_name[FAULT_MAX]; /* * For mount options */ -#define F2FS_MOUNT_BG_GC 0x00000001 #define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002 #define F2FS_MOUNT_DISCARD 0x00000004 #define F2FS_MOUNT_NOHEAP 0x00000008 @@ -88,11 +92,8 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 -#define F2FS_MOUNT_FORCE_FG_GC 0x00004000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 -#define F2FS_MOUNT_ADAPTIVE 0x00020000 -#define F2FS_MOUNT_LFS 0x00040000 #define F2FS_MOUNT_USRQUOTA 0x00080000 #define F2FS_MOUNT_GRPQUOTA 0x00100000 #define F2FS_MOUNT_PRJQUOTA 0x00200000 @@ -100,6 +101,11 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 +#define F2FS_MOUNT_NORECOVERY 0x04000000 +#define F2FS_MOUNT_ATGC 0x08000000 +#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 +#define F2FS_MOUNT_GC_MERGE 0x20000000 +#define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ 
-118,6 +124,20 @@ typedef u32 nid_t; #define COMPRESS_EXT_NUM 16 +/* + * An implementation of an rwsem that is explicitly unfair to readers. This + * prevents priority inversion when a low-priority reader acquires the read lock + * while sleeping on the write lock but the write lock is needed by + * higher-priority clients. + */ + +struct f2fs_rwsem { + struct rw_semaphore internal_rwsem; +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wait_queue_head_t read_waiters; +#endif +}; + struct f2fs_mount_info { unsigned int opt; int write_io_size_bits; /* Write IO size bits */ @@ -135,19 +155,32 @@ struct f2fs_mount_info { int s_jquota_fmt; /* Format of quota to use */ #endif /* For which write hints are passed down to block layer */ - int whint_mode; int alloc_mode; /* segment allocation policy */ int fsync_mode; /* fsync policy */ - bool test_dummy_encryption; /* test dummy encryption */ + int fs_mode; /* fs mode: LFS or ADAPTIVE */ + int bggc_mode; /* bggc mode: off, on or sync */ + int memory_mode; /* memory mode */ + int discard_unit; /* + * discard command's offset/size should + * be aligned to this unit: block, + * segment or section + */ + struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */ + block_t unusable_cap_perc; /* percentage for cap */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint */ /* For compression */ unsigned char compress_algorithm; /* algorithm type */ - unsigned compress_log_size; /* cluster log size */ + unsigned char compress_log_size; /* cluster log size */ + unsigned char compress_level; /* compress level */ + bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ + unsigned char nocompress_ext_cnt; /* nocompress extension count */ + int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ + unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; 
#define F2FS_FEATURE_ENCRYPT 0x0001 @@ -164,6 +197,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_SB_CHKSUM 0x0800 #define F2FS_FEATURE_CASEFOLD 0x1000 #define F2FS_FEATURE_COMPRESSION 0x2000 +#define F2FS_FEATURE_RO 0x4000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -194,8 +228,8 @@ enum { #define CP_DISCARD 0x00000010 #define CP_TRIMMED 0x00000020 #define CP_PAUSE 0x00000040 +#define CP_RESIZE 0x00000080 -#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ @@ -232,6 +266,10 @@ enum { * condition of read on truncated area * by extent_cache */ + DATA_GENERIC_ENHANCE_UPDATE, /* + * strong check on range and segment + * bitmap for update case + */ META_GENERIC, }; @@ -240,7 +278,7 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ - TRANS_DIR_INO, /* for trasactions dir ino list */ + TRANS_DIR_INO, /* for transactions dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. 
list */ }; @@ -263,6 +301,26 @@ struct fsync_node_entry { unsigned int seq_id; /* sequence id */ }; +struct ckpt_req { + struct completion wait; /* completion for checkpoint done */ + struct llist_node llnode; /* llist_node to be linked in wait queue */ + int ret; /* return code of checkpoint */ + ktime_t queue_time; /* request queued time */ +}; + +struct ckpt_req_control { + struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */ + wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_ckpt; /* # of actually issued ckpts */ + atomic_t total_ckpt; /* # of total ckpts */ + atomic_t queued_ckpt; /* # of queued ckpts */ + struct llist_head issue_list; /* list for command issue */ + spinlock_t stat_lock; /* lock for below checkpoint time stats */ + unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */ + unsigned int peak_time; /* peak wait time in msec until now */ +}; + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -331,8 +389,8 @@ struct discard_policy { bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ + bool timeout; /* discard timeout for put_super */ unsigned int granularity; /* discard granularity */ - int timeout; /* discard timeout for put_super */ }; struct discard_cmd_control { @@ -346,6 +404,10 @@ struct discard_cmd_control { struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. discards to be issued */ + unsigned int max_discard_request; /* max. discard request per round */ + unsigned int min_discard_issue_time; /* min. interval between discard issue */ + unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ + unsigned int max_discard_issue_time; /* max. 
interval between discard issue */ unsigned int discard_granularity; /* discard granularity */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ @@ -399,88 +461,6 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, return size <= MAX_SIT_JENTRIES(journal); } -/* - * ioctl commands - */ -#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS -#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS -#define F2FS_IOC_GETVERSION FS_IOC_GETVERSION - -#define F2FS_IOCTL_MAGIC 0xf5 -#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1) -#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) -#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) -#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) -#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) -#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ - struct f2fs_defragment) -#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ - struct f2fs_move_range) -#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ - struct f2fs_flush_device) -#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ - struct f2fs_gc_range) -#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) -#define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32) -#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) -#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) -#define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64) - -#define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL -#define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL - -#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY -#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY -#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT - -/* - * should be same as 
XFS_IOC_GOINGDOWN. - * Flags for going down operation used by FS_IOC_GOINGDOWN - */ -#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */ -#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ -#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ -#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ -#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ -#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */ - -#if defined(__KERNEL__) && defined(CONFIG_COMPAT) -/* - * ioctl commands in 32 bit emulation - */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS -#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION -#endif - -#define F2FS_IOC_FSGETXATTR FS_IOC_FSGETXATTR -#define F2FS_IOC_FSSETXATTR FS_IOC_FSSETXATTR - -struct f2fs_gc_range { - u32 sync; - u64 start; - u64 len; -}; - -struct f2fs_defragment { - u64 start; - u64 len; -}; - -struct f2fs_move_range { - u32 dst_fd; /* destination fd */ - u64 pos_in; /* start position in src_fd */ - u64 pos_out; /* start position in dst_fd */ - u64 len; /* size to move */ -}; - -struct f2fs_flush_device { - u32 dev_num; /* device number to flush */ - u32 segments; /* # of segments to flush */ -}; - /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 static inline int get_extra_isize(struct inode *inode); @@ -505,6 +485,44 @@ static inline int get_inline_xattr_addrs(struct inode *inode); * For INODE and NODE manager */ /* for directory operations */ + +struct f2fs_filename { + /* + * The filename the user specified. This is NULL for some + * filesystem-internal operations, e.g. converting an inline directory + * to a non-inline one, or roll-forward recovering an encrypted dentry. + */ + const struct qstr *usr_fname; + + /* + * The on-disk filename. For encrypted directories, this is encrypted. + * This may be NULL for lookups in an encrypted dir without the key. 
+ */ + struct fscrypt_str disk_name; + + /* The dirhash of this filename */ + f2fs_hash_t hash; + +#ifdef CONFIG_FS_ENCRYPTION + /* + * For lookups in encrypted directories: either the buffer backing + * disk_name, or a buffer that holds the decoded no-key name. + */ + struct fscrypt_str crypto_buf; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + /* + * For casefolded directories: the casefolded name, but it's left NULL + * if the original name is not valid Unicode, if the original name is + * "." or "..", if the directory is both casefolded and encrypted and + * its encryption key is unavailable, or if the filesystem is doing an + * internal operation where usr_fname is also NULL. In all these cases + * we fall back to treating the name as an opaque byte sequence. + */ + struct fscrypt_str cf_name; +#endif +}; + struct f2fs_dentry_ptr { struct inode *inode; void *bitmap; @@ -557,31 +575,53 @@ enum { */ }; -#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ +#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ + +/* congestion wait timeout value, default: 20ms */ +#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20)) /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 +/* maximum retry of EIO'ed page */ +#define MAX_RETRY_PAGE_EIO 100 + #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ +/* dirty segments threshold for triggering CP */ +#define DEFAULT_DIRTY_THRESHOLD 4 + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS +#define RECOVERY_MIN_RA_BLOCKS 1 + +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ + struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ - unsigned int ofs; /* start offset of the entry */ - 
unsigned int len; /* length of the entry */ + union { + struct { + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ + }; + unsigned long long key; /* 64-bits key */ + } __packed; }; struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ u32 blk; /* start block address of the extent */ +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned int c_len; /* physical extent length of compressed blocks */ +#endif }; struct extent_node { @@ -614,6 +654,7 @@ struct extent_tree { F2FS_MAP_UNWRITTEN) struct f2fs_map_blocks { + struct block_device *m_bdev; /* for multi-device dio */ block_t m_pblk; block_t m_lblk; unsigned int m_len; @@ -622,6 +663,7 @@ struct f2fs_map_blocks { pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; bool m_may_create; /* indicate it is from write path */ + bool m_multidev_dio; /* indicate it allows multi-device dio */ }; /* for flag in get_data_block */ @@ -645,36 +687,86 @@ enum { #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 #define FADVISE_VERITY_BIT 0x40 +#define FADVISE_TRUNC_BIT 0x80 #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) -#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) #define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) -#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) + +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) + #define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) #define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) -#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) + #define 
file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) + #define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) + #define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) #define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) #define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) + #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) +#define file_should_truncate(inode) is_file(inode, FADVISE_TRUNC_BIT) +#define file_need_truncate(inode) set_file(inode, FADVISE_TRUNC_BIT) +#define file_dont_truncate(inode) clear_file(inode, FADVISE_TRUNC_BIT) + #define DEF_DIR_LEVEL 0 enum { GC_FAILURE_PIN, - GC_FAILURE_ATOMIC, MAX_GC_FAILURE }; +/* used for f2fs_inode_info->flags */ +enum { + FI_NEW_INODE, /* indicate newly allocated inode */ + FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ + FI_DIRTY_DIR, /* indicate directory has dirty pages */ + FI_INC_LINK, /* need to increment i_nlink */ + FI_ACL_MODE, /* indicate acl mode */ + FI_NO_ALLOC, /* should not allocate any blocks */ + FI_FREE_NID, /* free allocated nide */ + FI_NO_EXTENT, /* not to use the extent cache */ + FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ + FI_INLINE_DENTRY, /* used for inline dentry */ + FI_APPEND_WRITE, /* inode has appended data */ + FI_UPDATE_WRITE, /* inode has in-place-update data */ + FI_NEED_IPU, /* used for ipu per file */ + FI_ATOMIC_FILE, /* indicate atomic file */ + FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ + FI_DROP_CACHE, /* drop dirty page cache */ + FI_DATA_EXIST, /* indicate data exists */ + FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_SKIP_WRITES, /* should skip data page writeback */ + 
FI_OPU_WRITE, /* used for opu per file */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ + FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ + FI_PIN_FILE, /* indicate file should not be gced */ + FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ + FI_MMAP_FILE, /* indicate file was mmapped */ + FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ + FI_COMPRESS_RELEASED, /* compressed blocks were released */ + FI_ALIGNED_WRITE, /* enable aligned write */ + FI_COW_FILE, /* indicate COW file */ + FI_MAX, /* max flag, never be used */ +}; + struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ @@ -687,15 +779,17 @@ struct f2fs_inode_info { umode_t i_acl_mode; /* keep file acl mode temporarily */ /* Use below internally in f2fs*/ - unsigned long flags; /* use to pass per-file flags */ - struct rw_semaphore i_sem; /* protect fi info */ + unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ + struct f2fs_rwsem i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ struct task_struct *cp_task; /* separate cp/wb IO stats*/ + struct task_struct *wb_task; /* indicate inode is in context of writeback */ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ + spinlock_t i_size_lock; /* protect last_disk_size */ #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; @@ -705,16 +799,13 
@@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ - struct list_head inmem_ilist; /* list for inmem inodes */ - struct list_head inmem_pages; /* inmemory pages managed by f2fs */ - struct task_struct *inmem_task; /* store inmemory task */ - struct mutex inmem_lock; /* lock for inmemory pages */ + struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ - struct rw_semaphore i_gc_rwsem[2]; - struct rw_semaphore i_mmap_sem; - struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + struct f2fs_rwsem i_gc_rwsem[2]; + struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ @@ -723,10 +814,14 @@ struct f2fs_inode_info { struct timespec64 i_disk_time[4];/* inode disk times */ /* for file compress */ - u64 i_compr_blocks; /* # of compressed blocks */ + atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ + unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ + + unsigned int atomic_write_cnt; }; static inline void get_extent_info(struct extent_info *ext, @@ -751,6 +846,9 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->fofs = fofs; ei->blk = blk; ei->len = len; +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif } static inline bool __is_discard_mergeable(struct discard_info *back, @@ -775,6 +873,12 @@ static inline bool 
__is_discard_front_mergeable(struct discard_info *cur, static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif return (back->fofs + back->len == front->fofs && back->blk + back->len == front->blk); } @@ -810,11 +914,19 @@ enum nid_state { MAX_NID_STATE, }; +enum nat_state { + TOTAL_NAT, + DIRTY_NAT, + RECLAIMABLE_NAT, + MAX_NAT_STATE, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ + nid_t max_rf_node_blocks; /* max # of nodes for recovery */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ @@ -822,11 +934,10 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ - unsigned int nat_cnt; /* the # of cached nat entries */ - unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ @@ -895,7 +1006,10 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, */ #define NR_CURSEG_DATA_TYPE (3) #define NR_CURSEG_NODE_TYPE (3) -#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + 
NR_CURSEG_NODE_TYPE) +#define NR_CURSEG_INMEM_TYPE (2) +#define NR_CURSEG_RO_TYPE (2) +#define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) +#define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) enum { CURSEG_HOT_DATA = 0, /* directory entry blocks */ @@ -904,8 +1018,11 @@ enum { CURSEG_HOT_NODE, /* direct node blocks of directory files */ CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ - NO_CHECK_TYPE, - CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */ + NR_PERSISTENT_LOG, /* number of persistent log */ + CURSEG_COLD_DATA_PINNED = NR_PERSISTENT_LOG, + /* pinned file that needs consecutive block address */ + CURSEG_ALL_DATA_ATGC, /* SSR alloctor in hot/warm/cold data area */ + NO_CHECK_TYPE, /* number of persistent & inmem log */ }; struct flush_cmd { @@ -930,7 +1047,7 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct rw_semaphore curseg_lock; /* for preventing curseg change */ + struct f2fs_rwsem curseg_lock; /* for preventing curseg change */ block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ @@ -939,6 +1056,7 @@ struct f2fs_sm_info { unsigned int segment_count; /* total # of segments */ unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ + unsigned int additional_reserved_segments;/* reserved segs for IO align feature */ unsigned int ovp_segments; /* # of overprovision segments */ /* a threshold to reclaim prefree segments */ @@ -979,7 +1097,6 @@ enum count_type { F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, - F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, @@ -1004,16 +1121,12 @@ enum count_type { */ #define PAGE_TYPE_OF_BIO(type) ((type) > META ? 
META : (type)) enum page_type { - DATA, - NODE, + DATA = 0, + NODE = 1, /* should not change this */ META, NR_PAGE_TYPE, META_FLUSH, - INMEM, /* the below types are used by tracepoints only. */ - INMEM_DROP, - INMEM_INVALIDATE, - INMEM_REVOKE, - IPU, + IPU, /* the below types are used by tracepoints only. */ OPU, }; @@ -1045,11 +1158,15 @@ enum cp_reason_type { }; enum iostat_type { - APP_DIRECT_IO, /* app direct IOs */ - APP_BUFFERED_IO, /* app buffered IOs */ + /* WRITE IO */ + APP_DIRECT_IO, /* app direct write IOs */ + APP_BUFFERED_IO, /* app buffered write IOs */ APP_WRITE_IO, /* app write IOs */ APP_MAPPED_IO, /* app mapped IOs */ + APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */ + APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */ FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */ FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ FS_META_IO, /* meta IOs from kworker/reclaimer */ FS_GC_DATA_IO, /* data IOs from forground gc */ @@ -1057,6 +1174,21 @@ enum iostat_type { FS_CP_DATA_IO, /* data IOs from checkpoint */ FS_CP_NODE_IO, /* node IOs from checkpoint */ FS_CP_META_IO, /* meta IOs from checkpoint */ + + /* READ IO */ + APP_DIRECT_READ_IO, /* app direct read IOs */ + APP_BUFFERED_READ_IO, /* app buffered read IOs */ + APP_READ_IO, /* app read IOs */ + APP_MAPPED_READ_IO, /* app mapped read IOs */ + APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */ + APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */ + FS_DATA_READ_IO, /* data read IOs */ + FS_GDATA_READ_IO, /* data read IOs from background gc */ + FS_CDATA_READ_IO, /* compressed data read IOs */ + FS_NODE_READ_IO, /* node read IOs */ + FS_META_READ_IO, /* meta read IOs */ + + /* other */ FS_DISCARD, /* discard */ NR_IO_TYPE, }; @@ -1066,8 +1198,8 @@ struct f2fs_io_info { nid_t ino; /* inode number */ enum page_type type; /* contains 
DATA/NODE/META/META_FLUSH */ enum temp_type temp; /* contains HOT/WARM/COLD */ - int op; /* contains REQ_OP_ */ - int op_flags; /* req_flag_bits */ + enum req_op op; /* contains REQ_OP_ */ + blk_opf_t op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ @@ -1081,6 +1213,7 @@ struct f2fs_io_info { bool retry; /* need to reallocate block address */ int compr_blocks; /* # of compressed block addresses */ bool encrypted; /* indicate file is encrypted */ + bool post_read; /* require post read */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ @@ -1099,11 +1232,11 @@ struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ - struct rw_semaphore io_rwsem; /* blocking op for bio */ + struct f2fs_rwsem io_rwsem; /* blocking op for bio */ spinlock_t io_lock; /* serialize DATA/NODE IOs */ struct list_head io_list; /* track fios */ struct list_head bio_list; /* bio entry list head */ - struct rw_semaphore bio_list_lock; /* lock to protect bio entry list */ + struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */ }; #define FDEV(i) (sbi->devs[i]) @@ -1124,7 +1257,6 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ - ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -1136,6 +1268,27 @@ struct inode_management { unsigned long ino_num; /* number of entries */ }; +/* for GC_AT */ +struct atgc_management { + bool atgc_enabled; /* ATGC is enabled or not */ + struct rb_root_cached root; /* root of victim rb-tree */ + struct list_head victim_list; /* linked with all victim entries */ + unsigned int victim_count; /* victim count 
in rb-tree */ + unsigned int candidate_ratio; /* candidate ratio */ + unsigned int max_candidate_count; /* max candidate count */ + unsigned int age_weight; /* age weight, vblock_weight = 100 - age_weight */ + unsigned long long age_threshold; /* age threshold */ +}; + +struct f2fs_gc_control { + unsigned int victim_segno; /* target victim segment number */ + int init_gc_type; /* FG_GC or BG_GC */ + bool no_bg_gc; /* check the space and stop bg_gc */ + bool should_migrate_blocks; /* should migrate blocks */ + bool err_gc_skipped; /* return EAGAIN if GC skipped */ + unsigned int nr_free_secs; /* # of free sections to do GC */ +}; + /* For s_flag in struct f2fs_sb_info */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ @@ -1152,6 +1305,7 @@ enum { SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ SBI_IS_RESIZEFS, /* resizefs is in process */ + SBI_IS_FREEZING, /* freezefs is in process */ }; enum { @@ -1168,13 +1322,27 @@ enum { GC_NORMAL, GC_IDLE_CB, GC_IDLE_GREEDY, - GC_URGENT, + GC_IDLE_AT, + GC_URGENT_HIGH, + GC_URGENT_LOW, + GC_URGENT_MID, + MAX_GC_MODE, }; enum { - WHINT_MODE_OFF, /* not pass down write hints */ - WHINT_MODE_USER, /* try to pass down hints given by users */ - WHINT_MODE_FS, /* pass down hints with F2FS policy */ + BGGC_MODE_ON, /* background gc is on */ + BGGC_MODE_OFF, /* background gc is off */ + BGGC_MODE_SYNC, /* + * background gc is on, migrating blocks + * like foreground gc + */ +}; + +enum { + FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ + FS_MODE_LFS, /* use lfs allocation only */ + FS_MODE_FRAGMENT_SEG, /* segment fragmentation mode */ + FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */ }; enum { @@ -1188,36 +1356,166 @@ enum fsync_mode { FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; +enum { + COMPR_MODE_FS, /* + * automatically compress compression + * enabled files + */ + COMPR_MODE_USER, /* + * automatical compression is 
disabled. + * user can control the file compression + * using ioctls + */ +}; + +enum { + DISCARD_UNIT_BLOCK, /* basic discard unit is block */ + DISCARD_UNIT_SEGMENT, /* basic discard unit is segment */ + DISCARD_UNIT_SECTION, /* basic discard unit is section */ +}; + +enum { + MEMORY_MODE_NORMAL, /* memory mode for normal devices */ + MEMORY_MODE_LOW, /* memory mode for low memry devices */ +}; + + + +static inline int f2fs_test_bit(unsigned int nr, char *addr); +static inline void f2fs_set_bit(unsigned int nr, char *addr); +static inline void f2fs_clear_bit(unsigned int nr, char *addr); + /* - * this value is set in page as a private data which indicate that - * the page is atomically written, and it is in inmem_pages list. + * Layout of f2fs page.private: + * + * Layout A: lowest bit should be 1 + * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... | + * bit 0 PAGE_PRIVATE_NOT_POINTER + * bit 1 PAGE_PRIVATE_ATOMIC_WRITE + * bit 2 PAGE_PRIVATE_DUMMY_WRITE + * bit 3 PAGE_PRIVATE_ONGOING_MIGRATION + * bit 4 PAGE_PRIVATE_INLINE_INODE + * bit 5 PAGE_PRIVATE_REF_RESOURCE + * bit 6- f2fs private data + * + * Layout B: lowest bit should be 0 + * page.private is a wrapped pointer. 
*/ -#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) -#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) +enum { + PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */ + PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */ + PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */ + PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */ + PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */ + PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */ + PAGE_PRIVATE_MAX +}; -#define IS_ATOMIC_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) -#define IS_DUMMY_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) +#define PAGE_PRIVATE_GET_FUNC(name, flagname) \ +static inline bool page_private_##name(struct page *page) \ +{ \ + return PagePrivate(page) && \ + test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ + test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} -#ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) \ - (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) -#else -#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) -#endif +#define PAGE_PRIVATE_SET_FUNC(name, flagname) \ +static inline void set_page_private_##name(struct page *page) \ +{ \ + if (!PagePrivate(page)) { \ + get_page(page); \ + SetPagePrivate(page); \ + set_page_private(page, 0); \ + } \ + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ + set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ +static inline void clear_page_private_##name(struct page *page) \ +{ \ + clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ + if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { \ + set_page_private(page, 0); \ + if (PagePrivate(page)) { \ + ClearPagePrivate(page); \ + put_page(page); \ + }\ + } \ +} + +PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); 
+PAGE_PRIVATE_GET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); + +static inline unsigned long get_page_private_data(struct page *page) +{ + unsigned long data = page_private(page); + + if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) + return 0; + return data >> PAGE_PRIVATE_MAX; +} + +static inline void set_page_private_data(struct page *page, unsigned long data) +{ + if (!PagePrivate(page)) { + get_page(page); + SetPagePrivate(page); + set_page_private(page, 0); + } + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); + page_private(page) |= data << PAGE_PRIVATE_MAX; +} + +static inline void clear_page_private_data(struct page *page) +{ + page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1; + if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { + set_page_private(page, 0); + if (PagePrivate(page)) { + ClearPagePrivate(page); + put_page(page); + } + } +} /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, COMPRESS_LZ4, + COMPRESS_ZSTD, + COMPRESS_LZORLE, COMPRESS_MAX, }; +enum compress_flag { + COMPRESS_CHKSUM, + COMPRESS_MAX_FLAG, +}; + +#define COMPRESS_WATERMARK 20 +#define COMPRESS_PERCENT 20 + #define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ - __le32 chksum; /* checksum of compressed data */ + __le32 chksum; /* compressed data chksum */ 
__le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; @@ -1226,6 +1524,8 @@ struct compress_data { #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 +#define COMPRESS_LEVEL_OFFSET 8 + /* compress context */ struct compress_ctx { struct inode *inode; /* inode the context belong to */ @@ -1236,11 +1536,13 @@ struct compress_ctx { unsigned int nr_rpages; /* total page number in rpages */ struct page **cpages; /* pages store compressed data in cluster */ unsigned int nr_cpages; /* total page number in cpages */ + unsigned int valid_nr_cpages; /* valid page number in cpages */ void *rbuf; /* virtual mapped address on rpages */ struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ void *private; /* payload buffer for specified compression algorithm */ + void *private2; /* extra payload buffer */ }; /* compress context for write IO path */ @@ -1249,10 +1551,10 @@ struct compress_io_ctx { struct inode *inode; /* inode the context belong to */ struct page **rpages; /* pages store raw data in cluster */ unsigned int nr_rpages; /* total page number in rpages */ - refcount_t ref; /* referrence count of raw page */ + atomic_t pending_pages; /* in-flight compressed page count */ }; -/* decompress io context for read IO path */ +/* Context for decompressing one cluster on the read IO path */ struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ @@ -1268,26 +1570,53 @@ struct decompress_io_ctx { struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ - refcount_t ref; /* referrence count of compressed page */ - bool failed; /* indicate IO error during decompression */ + + /* + * The number of compressed pages remaining to be read in this cluster. 
+ * This is initially nr_cpages. It is decremented by 1 each time a page + * has been read (or failed to be read). When it reaches 0, the cluster + * is decompressed (or an error is reported). + * + * If an error occurs before all the pages have been submitted for I/O, + * then this will never reach 0. In this case the I/O submitter is + * responsible for calling f2fs_decompress_end_io() instead. + */ + atomic_t remaining_pages; + + /* + * Number of references to this decompress_io_ctx. + * + * One reference is held for I/O completion. This reference is dropped + * after the pagecache pages are updated and unlocked -- either after + * decompression (and verity if enabled), or after an error. + * + * In addition, each compressed page holds a reference while it is in a + * bio. These references are necessary prevent compressed pages from + * being freed while they are still in a bio. + */ + refcount_t refcnt; + + bool failed; /* IO error occurred before decompression? */ + bool need_verity; /* need fs-verity verification after decompression? 
*/ + void *private; /* payload buffer for specified decompression algorithm */ + void *private2; /* extra payload buffer */ + struct work_struct verity_work; /* work to verify the decompressed pages */ + struct work_struct free_work; /* work for late free this structure itself */ }; #define NULL_CLUSTER ((unsigned int)(~0)) #define MIN_COMPRESS_LOG_SIZE 2 #define MAX_COMPRESS_LOG_SIZE 8 +#define MAX_COMPRESS_WINDOW_SIZE(log_size) ((PAGE_SIZE) << (log_size)) struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - struct rw_semaphore sb_lock; /* lock for raw super block */ + struct f2fs_rwsem sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ struct mutex writepages; /* mutex for writepages() */ -#ifdef CONFIG_UNICODE - struct unicode_map *s_encoding; - __u16 s_encoding_flags; -#endif #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ @@ -1304,23 +1633,26 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ /* keep migration IO order for LFS mode */ - struct rw_semaphore io_order_lock; + struct f2fs_rwsem io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ + pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ + int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ - struct mutex cp_mutex; /* checkpoint procedure lock */ - struct rw_semaphore cp_rwsem; /* blocking FS operations */ - struct rw_semaphore node_write; /* locking node writes */ - struct rw_semaphore node_change; /* locking node change */ + struct f2fs_rwsem cp_global_sem; 
/* checkpoint procedure lock */ + struct f2fs_rwsem cp_rwsem; /* blocking FS operations */ + struct f2fs_rwsem node_write; /* locking node writes */ + struct f2fs_rwsem node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ + struct ckpt_req_control cprc_info; /* for checkpoint request control */ - struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ + struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ spinlock_t fsync_node_lock; /* for node entry lock */ struct list_head fsync_node_list; /* node list head */ @@ -1354,15 +1686,15 @@ struct f2fs_sb_info { unsigned int meta_ino_num; /* meta inode number*/ unsigned int log_blocks_per_seg; /* log2 blocks per segment */ unsigned int blocks_per_seg; /* blocks per segment */ + unsigned int unusable_blocks_per_sec; /* unusable blocks per section */ unsigned int segs_per_sec; /* segments per section */ unsigned int secs_per_zone; /* sections per zone */ unsigned int total_sections; /* total section count */ - struct mutex resize_mutex; /* for resize exclusion */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ int readdir_ra; /* readahead inode in readdir */ + u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -1375,12 +1707,14 @@ struct f2fs_sb_info { block_t unusable_block_count; /* # of blocks saved by last cp */ unsigned int nquota_files; /* # of quota sysfile */ - struct rw_semaphore quota_sem; /* blocking cp for flags */ + struct f2fs_rwsem quota_sem; /* blocking cp for flags */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct 
percpu_counter alloc_valid_block_count; + /* # of node block writes as roll forward recovery */ + struct percpu_counter rf_node_block_count; /* writeback control */ atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ @@ -1391,22 +1725,24 @@ struct f2fs_sb_info { struct f2fs_mount_info mount_opt; /* mount options */ /* for cleaning operations */ - struct rw_semaphore gc_lock; /* + struct f2fs_rwsem gc_lock; /* * semaphore for GC, avoid * race between GC and GC or CP */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ + struct atgc_management am; /* atgc management */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ + spinlock_t gc_urgent_high_lock; + unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ + /* for skip statistic */ - unsigned int atomic_files; /* # of opened atomic file */ - unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; - struct rw_semaphore pin_sem; + struct f2fs_rwsem pin_sem; /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -1431,33 +1767,41 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ - atomic_t compr_blocks; /* # of compressed blocks */ - atomic_t vw_cnt; /* # of volatile writes */ + atomic64_t compr_blocks; /* # of compressed blocks */ + atomic_t swapfile_inode; /* # of swapfile inodes */ + atomic_t atomic_files; /* # of opened atomic file */ atomic_t max_aw_cnt; /* max # of atomic writes */ - atomic_t max_vw_cnt; /* max # of volatile writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* 
skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif spinlock_t stat_lock; /* lock for stat operations */ - /* For app/fs IO statistics */ - spinlock_t iostat_lock; - unsigned long long write_iostat[NR_IO_TYPE]; - bool iostat_enable; + /* to attach REQ_META|REQ_FUA flags */ + unsigned int data_io_flag; + unsigned int node_io_flag; - /* For sysfs suppport */ - struct kobject s_kobj; + /* For sysfs support */ + struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */ struct completion s_kobj_unregister; + struct kobject s_stat_kobj; /* /sys/fs/f2fs/<devname>/stat */ + struct completion s_stat_kobj_unregister; + + struct kobject s_feature_list_kobj; /* /sys/fs/f2fs/<devname>/feature_list */ + struct completion s_feature_list_kobj_unregister; + /* For shrinker support */ struct list_head s_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + /* For multi devices */ int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ unsigned int dirty_device; /* for checkpoint data flush */ spinlock_t dev_lock; /* protect dirty_device */ - struct mutex umount_mutex; - unsigned int shrinker_run_no; + bool aligned_blksize; /* all devices has the same logical blksize */ /* For write statistics */ u64 sectors_written_start; @@ -1470,13 +1814,58 @@ struct f2fs_sb_info { __u32 s_chksum_seed; struct workqueue_struct *post_read_wq; /* post read workqueue */ -}; -struct f2fs_private_dio { - struct inode *inode; - void *orig_private; - bio_end_io_t *orig_end_io; - bool write; + unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ + spinlock_t error_lock; /* protect errors array */ + bool error_dirty; /* errors of sb is dirty */ + + struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ + unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ + + /* For reclaimed segs statistics per each GC mode */ + unsigned int gc_segment_mode; /* GC state for reclaimed 
segments */ + unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ + + unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */ + + int max_fragment_chunk; /* max chunk size for block fragmentation mode */ + int max_fragment_hole; /* max hole size for block fragmentation mode */ + + /* For atomic write statistics */ + atomic64_t current_atomic_write; + s64 peak_atomic_write; + u64 committed_atomic_block; + u64 revoked_atomic_block; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct kmem_cache *page_array_slab; /* page array entry */ + unsigned int page_array_slab_size; /* default page array slab size */ + + /* For runtime compression statistics */ + u64 compr_written_block; + u64 compr_saved_block; + u32 compr_new_inode; + + /* For compressed block cache */ + struct inode *compress_inode; /* cache compressed blocks */ + unsigned int compress_percent; /* cache page percentage */ + unsigned int compress_watermark; /* cache page watermark */ + atomic_t compress_page_hit; /* cache hit count */ +#endif + +#ifdef CONFIG_F2FS_IOSTAT + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long rw_iostat[NR_IO_TYPE]; + unsigned long long prev_rw_iostat[NR_IO_TYPE]; + bool iostat_enable; + unsigned long iostat_next_period; + unsigned int iostat_period_ms; + + /* For io latency related statistics info in one iostat period */ + spinlock_t iostat_lat_lock; + struct iostat_lat_info *iostat_io_lat; +#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -1521,13 +1910,6 @@ static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) return sbi->s_ndevs > 1; } -/* For write statistics. Suppose sector size is 512 bytes, - * and the return value is in kbytes. s is of struct f2fs_sb_info. 
- */ -#define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \ - (s)->sectors_written_start) >> 1) - static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { unsigned long now = jiffies; @@ -1764,59 +2146,105 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock_irqrestore(&sbi->cp_lock, flags); } -static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +#define init_f2fs_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_f2fs_rwsem((sem), #sem, &__key); \ +} while (0) + +static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem, + const char *sem_name, struct lock_class_key *key) { - unsigned long flags; - unsigned char *nat_bits; + __init_rwsem(&sem->internal_rwsem, sem_name, key); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + init_waitqueue_head(&sem->read_waiters); +#endif +} - /* - * In order to re-enable nat_bits we need to call fsck.f2fs by - * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, - * so let's rely on regular fsck or unclean shutdown. 
- */ +static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem) +{ + return rwsem_is_locked(&sem->internal_rwsem); +} + +static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem) +{ + return rwsem_is_contended(&sem->internal_rwsem); +} + +static inline void f2fs_down_read(struct f2fs_rwsem *sem) +{ +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem)); +#else + down_read(&sem->internal_rwsem); +#endif +} - if (lock) - spin_lock_irqsave(&sbi->cp_lock, flags); - __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); - nat_bits = NM_I(sbi)->nat_bits; - NM_I(sbi)->nat_bits = NULL; - if (lock) - spin_unlock_irqrestore(&sbi->cp_lock, flags); +static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem) +{ + return down_read_trylock(&sem->internal_rwsem); +} - kvfree(nat_bits); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_read_nested(&sem->internal_rwsem, subclass); } +#else +#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) +#endif -static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, - struct cp_control *cpc) +static inline void f2fs_up_read(struct f2fs_rwsem *sem) { - bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + up_read(&sem->internal_rwsem); +} - return (cpc) ? 
(cpc->reason & CP_UMOUNT) && set : set; +static inline void f2fs_down_write(struct f2fs_rwsem *sem) +{ + down_write(&sem->internal_rwsem); +} + +static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) +{ + return down_write_trylock(&sem->internal_rwsem); +} + +static inline void f2fs_up_write(struct f2fs_rwsem *sem) +{ + up_write(&sem->internal_rwsem); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wake_up_all(&sem->read_waiters); +#endif } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - down_read(&sbi->cp_rwsem); + f2fs_down_read(&sbi->cp_rwsem); } static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) { - return down_read_trylock(&sbi->cp_rwsem); + if (time_to_inject(sbi, FAULT_LOCK_OP)) { + f2fs_show_injection_info(sbi, FAULT_LOCK_OP); + return 0; + } + return f2fs_down_read_trylock(&sbi->cp_rwsem); } static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - up_read(&sbi->cp_rwsem); + f2fs_up_read(&sbi->cp_rwsem); } static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - down_write(&sbi->cp_rwsem); + f2fs_down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { - up_write(&sbi->cp_rwsem); + f2fs_up_write(&sbi->cp_rwsem); } static inline int __get_cp_reason(struct f2fs_sb_info *sbi) @@ -1906,6 +2334,11 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + avail_user_block_count -= sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (avail_user_block_count > sbi->unusable_block_count) avail_user_block_count -= sbi->unusable_block_count; @@ -2018,6 +2451,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode) dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } +static inline void inc_atomic_write_cnt(struct inode *inode) +{ + struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + u64 current_write; + + fi->atomic_write_cnt++; + atomic64_inc(&sbi->current_atomic_write); + current_write = atomic64_read(&sbi->current_atomic_write); + if (current_write > sbi->peak_atomic_write) + sbi->peak_atomic_write = current_write; +} + +static inline void release_atomic_write_cnt(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + + atomic64_sub(fi->atomic_write_cnt, &sbi->current_atomic_write); + fi->atomic_write_cnt = 0; +} + static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { return atomic_read(&sbi->nr_pages[count_type]); @@ -2068,6 +2523,7 @@ static inline block_t __cp_payload(struct f2fs_sb_info *sbi) static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + void *tmp_ptr = &ckpt->sit_nat_version_bitmap; int offset; if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { @@ -2077,18 +2533,18 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) * if large_nat_bitmap feature is enabled, leave checksum * protection for all nat/sit bitmaps. */ - return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); + return tmp_ptr + offset + sizeof(__le32); } if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) - return &ckpt->sit_nat_version_bitmap; + return tmp_ptr; else return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { offset = (flag == NAT_BITMAP) ? 
le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; - return &ckpt->sit_nat_version_bitmap + offset; + return tmp_ptr + offset; } } @@ -2151,6 +2607,11 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + valid_block_count += sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + user_block_count = sbi->user_block_count; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) user_block_count -= sbi->unusable_block_count; @@ -2195,11 +2656,17 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_block_count); - f2fs_bug_on(sbi, !sbi->total_valid_node_count); + if (unlikely(!sbi->total_valid_block_count || + !sbi->total_valid_node_count)) { + f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u", + sbi->total_valid_block_count, + sbi->total_valid_node_count); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count--; + sbi->total_valid_node_count--; + } - sbi->total_valid_node_count--; - sbi->total_valid_block_count--; if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks++; @@ -2210,7 +2677,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, dquot_free_inode(inode); } else { if (unlikely(inode->i_blocks == 0)) { - f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu", + f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu", inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -2244,6 +2711,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { struct page *page; + unsigned int flags; if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { if (!for_write) 
@@ -2263,7 +2731,12 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, if (!for_write) return grab_cache_page(mapping, index); - return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); + + flags = memalloc_nofs_save(); + page = grab_cache_page_write_begin(mapping, index); + memalloc_nofs_restore(flags); + + return page; } static inline struct page *f2fs_pagecache_get_page( @@ -2278,16 +2751,6 @@ static inline struct page *f2fs_pagecache_get_page( return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); } -static inline void f2fs_copy_page(struct page *src, struct page *dst) -{ - char *src_kaddr = kmap(src); - char *dst_kaddr = kmap(dst); - - memcpy(dst_kaddr, src_kaddr, PAGE_SIZE); - kunmap(dst); - kunmap(src); -} - static inline void f2fs_put_page(struct page *page, int unlock) { if (!page) @@ -2316,7 +2779,7 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); } -static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, +static inline void *f2fs_kmem_cache_alloc_nofail(struct kmem_cache *cachep, gfp_t flags) { void *entry; @@ -2327,26 +2790,54 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } -static inline bool is_idle(struct f2fs_sb_info *sbi, int type) +static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags, bool nofail, struct f2fs_sb_info *sbi) { - if (sbi->gc_mode == GC_URGENT) - return true; + if (nofail) + return f2fs_kmem_cache_alloc_nofail(cachep, flags); + + if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) { + f2fs_show_injection_info(sbi, FAULT_SLAB_ALLOC); + return NULL; + } + return kmem_cache_alloc(cachep, flags); +} + +static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type) +{ if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || get_pages(sbi, 
F2FS_WB_CP_DATA) || get_pages(sbi, F2FS_DIO_READ) || get_pages(sbi, F2FS_DIO_WRITE)) - return false; + return true; if (type != DISCARD_TIME && SM_I(sbi) && SM_I(sbi)->dcc_info && atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) - return false; + return true; if (SM_I(sbi) && SM_I(sbi)->fcc_info && atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + return true; + return false; +} + +static inline bool is_idle(struct f2fs_sb_info *sbi, int type) +{ + if (sbi->gc_mode == GC_URGENT_HIGH) + return true; + + if (is_inflight_io(sbi, type)) return false; + if (sbi->gc_mode == GC_URGENT_MID) + return true; + + if (sbi->gc_mode == GC_URGENT_LOW && + (type == DISCARD_TIME || type == GC_TIME)) + return true; + return f2fs_time_over(sbi, type); } @@ -2378,7 +2869,7 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node) } static inline int f2fs_has_extra_attr(struct inode *inode); -static inline block_t datablock_addr(struct inode *inode, +static inline block_t data_blkaddr(struct inode *inode, struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; @@ -2388,9 +2879,9 @@ static inline block_t datablock_addr(struct inode *inode, raw_node = F2FS_NODE(node_page); - /* from GC path only */ if (is_inode) { if (!inode) + /* from GC path only */ base = offset_in_addr(&raw_node->i); else if (f2fs_has_extra_attr(inode)) base = get_extra_isize(inode); @@ -2400,6 +2891,11 @@ static inline block_t datablock_addr(struct inode *inode, return le32_to_cpu(addr_array[base + offset]); } +static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn) +{ + return data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node); +} + static inline int f2fs_test_bit(unsigned int nr, char *addr) { int mask; @@ -2497,43 +2993,6 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) return flags & F2FS_OTHER_FLMASK; } -/* used for f2fs_inode_info->flags */ -enum { - FI_NEW_INODE, /* indicate newly allocated inode */ - FI_DIRTY_INODE, /* indicate inode is dirty or not 
*/ - FI_AUTO_RECOVER, /* indicate inode is recoverable */ - FI_DIRTY_DIR, /* indicate directory has dirty pages */ - FI_INC_LINK, /* need to increment i_nlink */ - FI_ACL_MODE, /* indicate acl mode */ - FI_NO_ALLOC, /* should not allocate any blocks */ - FI_FREE_NID, /* free allocated nide */ - FI_NO_EXTENT, /* not to use the extent cache */ - FI_INLINE_XATTR, /* used for inline xattr */ - FI_INLINE_DATA, /* used for inline data*/ - FI_INLINE_DENTRY, /* used for inline dentry */ - FI_APPEND_WRITE, /* inode has appended data */ - FI_UPDATE_WRITE, /* inode has in-place-update data */ - FI_NEED_IPU, /* used for ipu per file */ - FI_ATOMIC_FILE, /* indicate atomic file */ - FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ - FI_VOLATILE_FILE, /* indicate volatile file */ - FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ - FI_DROP_CACHE, /* drop dirty page cache */ - FI_DATA_EXIST, /* indicate data exists */ - FI_INLINE_DOTS, /* indicate inline dot dentries */ - FI_DO_DEFRAG, /* indicate defragment is running */ - FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ - FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ - FI_HOT_DATA, /* indicate file is hot */ - FI_EXTRA_ATTR, /* indicate file has extra attribute */ - FI_PROJ_INHERIT, /* indicate file inherits projectid */ - FI_PIN_FILE, /* indicate file should not be gced */ - FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ - FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ - FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ - FI_MMAP_FILE, /* indicate file was mmapped */ -}; - static inline void __mark_inode_dirty_flag(struct inode *inode, int flag, bool set) { @@ -2544,31 +3003,29 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_NEW_INODE: if (set) return; - /* fall through */ + fallthrough; case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: - case FI_COMPRESSED_FILE: + case FI_COMPRESS_RELEASED: 
f2fs_mark_inode_dirty_sync(inode, true); } } static inline void set_inode_flag(struct inode *inode, int flag) { - if (!test_bit(flag, &F2FS_I(inode)->flags)) - set_bit(flag, &F2FS_I(inode)->flags); + set_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, true); } static inline int is_inode_flag_set(struct inode *inode, int flag) { - return test_bit(flag, &F2FS_I(inode)->flags); + return test_bit(flag, F2FS_I(inode)->flags); } static inline void clear_inode_flag(struct inode *inode, int flag) { - if (test_bit(flag, &F2FS_I(inode)->flags)) - clear_bit(flag, &F2FS_I(inode)->flags); + clear_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, false); } @@ -2659,19 +3116,21 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) struct f2fs_inode_info *fi = F2FS_I(inode); if (ri->i_inline & F2FS_INLINE_XATTR) - set_bit(FI_INLINE_XATTR, &fi->flags); + set_bit(FI_INLINE_XATTR, fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) - set_bit(FI_INLINE_DATA, &fi->flags); + set_bit(FI_INLINE_DATA, fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) - set_bit(FI_INLINE_DENTRY, &fi->flags); + set_bit(FI_INLINE_DENTRY, fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) - set_bit(FI_DATA_EXIST, &fi->flags); + set_bit(FI_DATA_EXIST, fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) - set_bit(FI_INLINE_DOTS, &fi->flags); + set_bit(FI_INLINE_DOTS, fi->flags); if (ri->i_inline & F2FS_EXTRA_ATTR) - set_bit(FI_EXTRA_ATTR, &fi->flags); + set_bit(FI_EXTRA_ATTR, fi->flags); if (ri->i_inline & F2FS_PIN_FILE) - set_bit(FI_PIN_FILE, &fi->flags); + set_bit(FI_PIN_FILE, fi->flags); + if (ri->i_inline & F2FS_COMPRESS_RELEASED) + set_bit(FI_COMPRESS_RELEASED, fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2692,6 +3151,8 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_EXTRA_ATTR; if (is_inode_flag_set(inode, FI_PIN_FILE)) ri->i_inline |= 
F2FS_PIN_FILE; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + ri->i_inline |= F2FS_COMPRESS_RELEASED; } static inline int f2fs_has_extra_attr(struct inode *inode) @@ -2710,6 +3171,22 @@ static inline int f2fs_compressed_file(struct inode *inode) is_inode_flag_set(inode, FI_COMPRESSED_FILE); } +static inline bool f2fs_need_compress_data(struct inode *inode) +{ + int compress_mode = F2FS_OPTION(F2FS_I_SB(inode)).compress_mode; + + if (!f2fs_compressed_file(inode)) + return false; + + if (compress_mode == COMPR_MODE_FS) + return true; + else if (compress_mode == COMPR_MODE_USER && + is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) + return true; + + return false; +} + static inline unsigned int addrs_per_inode(struct inode *inode) { unsigned int addrs = CUR_ADDRS_PER_INODE(inode) - @@ -2742,6 +3219,10 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +/* + * Notice: check inline_data flag without inode page lock is unsafe. + * It could change at any time by f2fs_convert_inline_page(). 
+ */ static inline int f2fs_has_inline_data(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DATA); @@ -2772,14 +3253,9 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } -static inline bool f2fs_is_commit_atomic_write(struct inode *inode) +static inline bool f2fs_is_cow_file(struct inode *inode) { - return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); -} - -static inline bool f2fs_is_volatile_file(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_VOLATILE_FILE); + return is_inode_flag_set(inode, FI_COW_FILE); } static inline bool f2fs_is_first_block_written(struct inode *inode) @@ -2812,12 +3288,16 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { + if (is_file(inode, type)) + return; F2FS_I(inode)->i_advise |= type; f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { + if (!is_file(inode, type)) + return; F2FS_I(inode)->i_advise &= ~type; f2fs_mark_inode_dirty_sync(inode, true); } @@ -2856,9 +3336,9 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) if (!f2fs_is_time_consistent(inode)) return false; - down_read(&F2FS_I(inode)->i_sem); + spin_lock(&F2FS_I(inode)->i_size_lock); ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); - up_read(&F2FS_I(inode)->i_sem); + spin_unlock(&F2FS_I(inode)->i_size_lock); return ret; } @@ -2873,51 +3353,26 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } -static inline bool is_dot_dotdot(const struct qstr *str) +static inline bool is_dot_dotdot(const u8 *name, size_t len) { - if (str->len == 1 && str->name[0] == '.') + if (len == 1 && name[0] == '.') return true; - if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + if (len == 2 && name[0] == '.' 
&& name[1] == '.') return true; return false; } -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. - */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { - void *ret; - if (time_to_inject(sbi, FAULT_KMALLOC)) { f2fs_show_injection_info(sbi, FAULT_KMALLOC); return NULL; } - ret = kmalloc(size, flags); - if (ret) - return ret; - - return kvmalloc(size, flags); + return kmalloc(size, flags); } static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, @@ -2967,31 +3422,6 @@ static inline int get_inline_xattr_addrs(struct inode *inode) sizeof((f2fs_inode)->field)) \ <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ -static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) -{ - int i; - - spin_lock(&sbi->iostat_lock); - for (i = 0; i < NR_IO_TYPE; i++) - sbi->write_iostat[i] = 0; - spin_unlock(&sbi->iostat_lock); -} - -static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, - enum iostat_type type, unsigned long long io_bytes) -{ - if (!sbi->iostat_enable) - return; - spin_lock(&sbi->iostat_lock); - sbi->write_iostat[type] += io_bytes; - - if (type == APP_WRITE_IO || type == APP_DIRECT_IO) - sbi->write_iostat[APP_BUFFERED_IO] = - sbi->write_iostat[APP_WRITE_IO] - - sbi->write_iostat[APP_DIRECT_IO]; - spin_unlock(&sbi->iostat_lock); -} - #define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) #define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META) @@ -3016,40 +3446,24 @@ static inline bool __is_valid_data_blkaddr(block_t blkaddr) return true; } -static inline void f2fs_set_page_private(struct page *page, - unsigned long 
data) -{ - if (PagePrivate(page)) - return; - - get_page(page); - SetPagePrivate(page); - set_page_private(page, data); -} - -static inline void f2fs_clear_page_private(struct page *page) -{ - if (!PagePrivate(page)) - return; - - set_page_private(page, 0); - ClearPagePrivate(page); - f2fs_put_page(page, 0); -} - /* * file.c */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void f2fs_truncate_data_blocks(struct dnode_of_data *dn); +int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); -int f2fs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); -int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); +int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int f2fs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid); @@ -3076,32 +3490,35 @@ void f2fs_handle_failed_inode(struct inode *inode); int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); - -extern int f2fs_ci_compare(const struct inode *parent, - const struct qstr *name, - const struct qstr *entry, - bool 
quick); +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode); /* * dir.c */ unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de); -struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, - f2fs_hash_t namehash, int *max_slots, - struct f2fs_dentry_ptr *d); +int f2fs_init_casefolded_name(const struct inode *dir, + struct f2fs_filename *fname); +int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct f2fs_filename *fname); +int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct f2fs_filename *fname); +void f2fs_free_filename(struct f2fs_filename *fname); +struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, + const struct f2fs_filename *fname, int *max_slots); int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr); void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d); struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, - const struct qstr *new_name, - const struct qstr *orig_name, struct page *dpage); + const struct f2fs_filename *fname, struct page *dpage); void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth); int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots); void f2fs_drop_nlink(struct inode *dir, struct inode *inode); struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, - struct fscrypt_name *fname, struct page **res_page); + const struct f2fs_filename *fname, + struct page **res_page); struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, const struct qstr *child, struct page **res_page); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); @@ -3110,14 +3527,13 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, void f2fs_set_link(struct inode 
*dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode); bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, - struct fscrypt_name *fname); + const struct f2fs_filename *fname); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, - const struct qstr *name, f2fs_hash_t name_hash, + const struct fscrypt_str *name, f2fs_hash_t name_hash, unsigned int bit_pos); -int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, - const struct qstr *orig_name, +int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode); -int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname, +int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode); int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode); @@ -3128,6 +3544,8 @@ bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { + if (fscrypt_is_nokey_name(dentry)) + return -ENOKEY; return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name, inode, inode->i_ino, inode->i_mode); } @@ -3137,9 +3555,13 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); +int f2fs_dquot_initialize(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); +loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int 
f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); @@ -3147,13 +3569,11 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct inode *dir, - const struct qstr *name_info, struct fscrypt_name *fname); +void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); /* * node.c */ -struct dnode_of_data; struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); @@ -3166,13 +3586,14 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni); + struct node_info *ni, bool checkpoint_context); pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); +bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); @@ -3180,6 +3601,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); int f2fs_move_node_page(struct page *node_page, int gc_type); +void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id); @@ -3191,11 +3613,12 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void 
f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); -void f2fs_recover_inline_xattr(struct inode *inode, struct page *page); +int f2fs_recover_inline_xattr(struct inode *inode, struct page *page); int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); +void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); @@ -3206,19 +3629,17 @@ void f2fs_destroy_node_manager_caches(void); * segment.c */ bool f2fs_need_SSR(struct f2fs_sb_info *sbi); -void f2fs_register_inmem_page(struct inode *inode, struct page *page); -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); -void f2fs_drop_inmem_pages(struct inode *inode); -void f2fs_drop_inmem_page(struct inode *inode, struct page *page); -int f2fs_commit_inmem_pages(struct inode *inode); +int f2fs_commit_atomic_write(struct inode *inode); +void f2fs_abort_atomic_write(struct inode *inode, bool clean); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); -void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +int 
f2fs_start_discard_thread(struct f2fs_sb_info *sbi); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); @@ -3229,9 +3650,16 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); -void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno); +void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_get_new_segment(struct f2fs_sb_info *sbi, + unsigned int *newseg, bool new_sec, int dir); +void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type); +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force); +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); @@ -3246,7 +3674,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, int f2fs_inplace_write_data(struct f2fs_io_info *fio); void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg, bool recover_newaddr); + bool recover_curseg, bool recover_newaddr, + bool from_gc); void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, @@ -3254,7 +3683,9 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct 
dnode_of_data *dn, void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, - struct f2fs_io_info *fio, bool add_list); + struct f2fs_io_info *fio); +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered, bool locked); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); @@ -3272,22 +3703,37 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(enum rw_hint hint); -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp); +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno); +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno); + +#define DEF_FRAGMENT_SIZE 4 +#define MIN_FRAGMENT_SIZE 1 +#define MAX_FRAGMENT_SIZE 512 + +static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG || + F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK; +} /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason); +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, 
block_t blkaddr, int type); int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); -void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks); long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type); void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); @@ -3305,25 +3751,31 @@ void f2fs_add_orphan_inode(struct inode *inode); void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); -void f2fs_update_dirty_page(struct inode *inode, struct page *page); +void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio); void f2fs_remove_dirty_inode(struct inode *inode); -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); -void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp); +void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); +u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); void f2fs_destroy_checkpoint_caches(void); +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi); +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); /* * data.c */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void 
f2fs_submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type); +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, @@ -3335,27 +3787,23 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_merge_page_bio(struct f2fs_io_info *fio); void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio); + block_t blk_addr, sector_t *sector); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_set_data_blkaddr(struct dnode_of_data *dn); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - int op_flags, bool for_write); + blk_opf_t op_flags, bool for_write); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); int f2fs_do_write_data_page(struct f2fs_io_info *fio); -void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock); +void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock); int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int 
create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -3367,20 +3815,17 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks); -void f2fs_invalidate_page(struct page *page, unsigned int offset, - unsigned int length); -int f2fs_release_page(struct page *page, gfp_t wait); -#ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode); -#endif + int compr_blocks, bool allow_balance); +void f2fs_write_failed(struct inode *inode, loff_t to); +void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length); +bool f2fs_release_folio(struct folio *folio, gfp_t wait); bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); void f2fs_clear_page_cache_dirty_tag(struct page *page); int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); +extern const struct iomap_ops f2fs_iomap_ops; /* * gc.c @@ -3388,16 +3833,19 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, - unsigned int segno); +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); +int __init f2fs_create_garbage_collection_cache(void); +void f2fs_destroy_garbage_collection_cache(void); /* * recovery.c */ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); bool 
f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); +int __init f2fs_create_recovery_cache(void); +void f2fs_destroy_recovery_cache(void); /* * debug.c @@ -3413,7 +3861,6 @@ struct f2fs_stat_info { int ext_tree, zombie_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - int inmem_pages; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; @@ -3426,23 +3873,29 @@ struct f2fs_stat_info { int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; + int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; + unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int compr_inode, compr_blocks; - int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; + int compr_inode, swapfile_inode; + unsigned long long compr_blocks; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; - int dirty_count, node_pages, meta_pages; + int dirty_count, node_pages, meta_pages, compress_pages; + int compress_page_hit; int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; - unsigned long long skipped_atomic_files[2]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; + unsigned int dirty_seg[NR_CURSEG_TYPE]; + unsigned int full_seg[NR_CURSEG_TYPE]; + unsigned int valid_blks[NR_CURSEG_TYPE]; unsigned int meta_count[META_MAX]; unsigned int segment_count[2]; @@ -3509,9 +3962,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \ } while (0) #define stat_add_compr_blocks(inode, 
blocks) \ - (atomic_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) + (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ - (atomic_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) + (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_inc_swapfile_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_dec_swapfile_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_inc_atomic_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->atomic_files)) +#define stat_dec_atomic_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->atomic_files)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -3531,22 +3992,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_inc(&(sbi)->inplace_count)) #define stat_update_max_atomic_write(inode) \ do { \ - int cur = F2FS_I_SB(inode)->atomic_files; \ + int cur = atomic_read(&F2FS_I_SB(inode)->atomic_files); \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) -#define stat_inc_volatile_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_dec_volatile_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_update_max_volatile_write(inode) \ - do { \ - int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ - int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ - if (cur > max) \ - atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ - } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -3607,12 +4057,11 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) -#define stat_inc_atomic_write(inode) do { } while (0) 
-#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_inc_swapfile_inode(inode) do { } while (0) +#define stat_dec_swapfile_inode(inode) do { } while (0) +#define stat_inc_atomic_inode(inode) do { } while (0) +#define stat_dec_atomic_inode(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) -#define stat_inc_volatile_write(inode) do { } while (0) -#define stat_dec_volatile_write(inode) do { } while (0) -#define stat_update_max_volatile_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) @@ -3626,13 +4075,10 @@ static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } -static inline void update_sit_info(struct f2fs_sb_info *sbi) {} +static inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; -#ifdef CONFIG_UNICODE -extern const struct dentry_operations f2fs_dentry_ops; -#endif extern const struct file_operations f2fs_file_operations; extern const struct inode_operations f2fs_file_inode_operations; extern const struct address_space_operations f2fs_dblock_aops; @@ -3648,6 +4094,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab; * inline.c */ bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_sanity_check_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct page *page, struct page *ipage); void f2fs_truncate_inline_inode(struct inode *inode, @@ -3657,13 +4104,13 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); int f2fs_try_convert_inline_dir(struct 
inode *dir, struct dentry *dentry); int f2fs_write_inline_data(struct inode *inode, struct page *page); -bool f2fs_recover_inline_data(struct inode *inode, struct page *npage); +int f2fs_recover_inline_data(struct inode *inode, struct page *npage); struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, - struct fscrypt_name *fname, struct page **res_page); + const struct f2fs_filename *fname, + struct page **res_page); int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage); -int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, - const struct qstr *orig_name, +int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode); void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, @@ -3690,6 +4137,10 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); */ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned long long key, bool *left_most); struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -3700,9 +4151,9 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_node ***insert_p, struct rb_node **insert_parent, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root); + struct rb_root_cached *root, bool check_key); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); +void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); void f2fs_drop_extent_tree(struct inode *inode); unsigned int f2fs_destroy_extent_node(struct 
inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); @@ -3718,6 +4169,9 @@ void f2fs_destroy_extent_cache(void); /* * sysfs.c */ +#define MIN_RA_MUL 2 +#define MAX_RA_MUL 256 + int __init f2fs_init_sysfs(void); void f2fs_exit_sysfs(void); int f2fs_register_sysfs(struct f2fs_sb_info *sbi); @@ -3762,27 +4216,64 @@ int f2fs_prepare_compress_overwrite(struct inode *inode, struct page **pagep, pgoff_t index, void **fsdata); bool f2fs_compress_write_end(struct inode *inode, void *fsdata, pgoff_t index, unsigned copied); +int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); +int f2fs_init_compress_mempool(void); +void f2fs_destroy_compress_mempool(void); +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); +void f2fs_end_read_compressed_page(struct page *page, bool failed, + block_t blkaddr, bool in_task); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, + int index, int nr_pages, bool uptodate); +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); int f2fs_write_multi_pages(struct compress_ctx *cc, int *submitted, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); +void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, - bool is_readahead); + bool is_readahead, bool for_write); struct 
decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_free_dic(struct decompress_io_ctx *dic); -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task); +void f2fs_put_page_dic(struct page *page, bool in_task); +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn); int f2fs_init_compress_ctx(struct compress_ctx *cc); -void f2fs_destroy_compress_ctx(struct compress_ctx *cc); +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi); +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi); +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); +int __init f2fs_init_compress_cache(void); +void f2fs_destroy_compress_cache(void); +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); +void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + nid_t ino, block_t blkaddr); +bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blkaddr); +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); +#define inc_compr_inode_stat(inode) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + sbi->compr_new_inode++; \ + } while (0) +#define add_compr_block_stat(inode, blocks) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + int diff = F2FS_I(inode)->i_cluster_size - blocks; \ + sbi->compr_written_block += blocks; \ + sbi->compr_saved_block += diff; \ + } while (0) #else static inline bool f2fs_is_compressed_page(struct page *page) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -3797,36 +4288,86 @@ 
static inline struct page *f2fs_compress_control_page(struct page *page) WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } +static inline int f2fs_init_compress_mempool(void) { return 0; } +static inline void f2fs_destroy_compress_mempool(void) { } +static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, + bool in_task) { } +static inline void f2fs_end_read_compressed_page(struct page *page, + bool failed, block_t blkaddr, bool in_task) +{ + WARN_ON_ONCE(1); +} +static inline void f2fs_put_page_dic(struct page *page, bool in_task) +{ + WARN_ON_ONCE(1); +} +static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; } +static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; } +static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } +static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } +static inline int __init f2fs_init_compress_cache(void) { return 0; } +static inline void f2fs_destroy_compress_cache(void) { } +static inline void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, + block_t blkaddr) { } +static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct page *page, nid_t ino, block_t blkaddr) { } +static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, + struct page *page, block_t blkaddr) { return false; } +static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, + nid_t ino) { } +#define inc_compr_inode_stat(inode) do { } while (0) +static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len) { } #endif -static inline void set_compress_context(struct inode *inode) +static inline int 
set_compress_context(struct inode *inode) { +#ifdef CONFIG_F2FS_FS_COMPRESSION struct f2fs_sb_info *sbi = F2FS_I_SB(inode); F2FS_I(inode)->i_compress_algorithm = F2FS_OPTION(sbi).compress_algorithm; F2FS_I(inode)->i_log_cluster_size = F2FS_OPTION(sbi).compress_log_size; + F2FS_I(inode)->i_compress_flag = + F2FS_OPTION(sbi).compress_chksum ? + 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; + if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || + F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_flag |= + F2FS_OPTION(sbi).compress_level << + COMPRESS_LEVEL_OFFSET; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); + inc_compr_inode_stat(inode); + f2fs_mark_inode_dirty_sync(inode, true); + return 0; +#else + return -EOPNOTSUPP; +#endif } -static inline u64 f2fs_disable_compressed_file(struct inode *inode) +static inline bool f2fs_disable_compressed_file(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_compressed_file(inode)) - return 0; - if (fi->i_compr_blocks) - return fi->i_compr_blocks; + return true; + if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) + return false; fi->i_flags &= ~F2FS_COMPR_FL; - clear_inode_flag(inode, FI_COMPRESSED_FILE); stat_dec_compr_inode(inode); - return 0; + clear_inode_flag(inode, FI_COMPRESSED_FILE); + f2fs_mark_inode_dirty_sync(inode, true); + return true; } #define F2FS_FEATURE_FUNCS(name, flagname) \ @@ -3848,6 +4389,27 @@ F2FS_FEATURE_FUNCS(verity, VERITY); F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); +F2FS_FEATURE_FUNCS(readonly, RO); + +static inline bool f2fs_may_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, EXTENT_CACHE) || + is_inode_flag_set(inode, FI_NO_EXTENT) || 
+ (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi))) + return false; + + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&sbi->s_list)) + return false; + + return S_ISREG(inode->i_mode); +} #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, @@ -3866,8 +4428,7 @@ static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) static inline bool f2fs_bdev_support_discard(struct block_device *bdev) { - return blk_queue_discard(bdev_get_queue(bdev)) || - bdev_is_zoned(bdev); + return bdev_max_discard_sectors(bdev) || bdev_is_zoned(bdev); } static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) @@ -3902,38 +4463,20 @@ static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi) return false; } - -static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) +static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) { - clear_opt(sbi, ADAPTIVE); - clear_opt(sbi, LFS); - - switch (mt) { - case F2FS_MOUNT_ADAPTIVE: - set_opt(sbi, ADAPTIVE); - break; - case F2FS_MOUNT_LFS: - set_opt(sbi, LFS); - break; - } + return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } -static inline bool f2fs_may_encrypt(struct inode *inode) +static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) { -#ifdef CONFIG_FS_ENCRYPTION - umode_t mode = inode->i_mode; - - return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); -#else - return false; -#endif + return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW; } static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } @@ -3941,68 +4484,37 @@ static inline bool f2fs_may_compress(struct inode *inode) static inline 
void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { - int diff = F2FS_I(inode)->i_cluster_size - blocks; + struct f2fs_inode_info *fi = F2FS_I(inode); + int diff = fi->i_cluster_size - blocks; + + /* don't update i_compr_blocks if saved blocks were released */ + if (!add && !atomic_read(&fi->i_compr_blocks)) + return; if (add) { - F2FS_I(inode)->i_compr_blocks += diff; + atomic_add(diff, &fi->i_compr_blocks); stat_add_compr_blocks(inode, diff); } else { - F2FS_I(inode)->i_compr_blocks -= diff; + atomic_sub(diff, &fi->i_compr_blocks); stat_sub_compr_blocks(inode, diff); } f2fs_mark_inode_dirty_sync(inode, true); } -static inline int block_unaligned_IO(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) +static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, + int flag) { - unsigned int i_blkbits = READ_ONCE(inode->i_blkbits); - unsigned int blocksize_mask = (1 << i_blkbits) - 1; - loff_t offset = iocb->ki_pos; - unsigned long align = offset | iov_iter_alignment(iter); - - return align & blocksize_mask; -} - -static inline int allow_outplace_dio(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int rw = iov_iter_rw(iter); - - return (test_opt(sbi, LFS) && (rw == WRITE) && - !block_unaligned_IO(inode, iocb, iter)); + if (!f2fs_is_multi_device(sbi)) + return false; + if (flag != F2FS_GET_BLOCK_DIO) + return false; + return sbi->aligned_blksize; } -static inline bool f2fs_force_buffered_io(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int rw = iov_iter_rw(iter); - - if (f2fs_post_read_required(inode)) - return true; - if (f2fs_is_multi_device(sbi)) - return true; - if (f2fs_compressed_file(inode)) - return true; - /* - * for blkzoned device, fallback direct IO to buffered IO, so - * all IOs can be 
serialized by log-structured write. - */ - if (f2fs_sb_has_blkzoned(sbi)) - return true; - if (test_opt(sbi, LFS) && (rw == WRITE)) { - if (block_unaligned_IO(inode, iocb, iter)) - return true; - if (F2FS_IO_ALIGNED(sbi)) - return true; - } - if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED) && - !IS_SWAPFILE(inode)) - return true; - - return false; + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -4025,6 +4537,32 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) return false; } +static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; +} + +static inline void f2fs_io_schedule_timeout(long timeout) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(timeout); +} + +static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, + enum page_type type) +{ + if (unlikely(f2fs_cp_error(sbi))) + return; + + if (ofs == sbi->page_eio_ofs[type]) { + if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->page_eio_ofs[type] = ofs; + sbi->page_eio_cnt[type] = 0; + } +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0d4da644df3b..82cda1258227 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,10 @@ #include <linux/uuid.h> #include <linux/file.h> #include <linux/nls.h> +#include <linux/sched/signal.h> +#include <linux/fileattr.h> +#include <linux/fadvise.h> +#include <linux/iomap.h> #include "f2fs.h" #include "node.h" @@ -28,17 +32,19 @@ #include "xattr.h" #include "acl.h" #include "gc.h" -#include "trace.h" +#include "iostat.h" #include <trace/events/f2fs.h> +#include <uapi/linux/f2fs.h> static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); 
vm_fault_t ret; - down_read(&F2FS_I(inode)->i_mmap_sem); ret = filemap_fault(vmf); - up_read(&F2FS_I(inode)->i_mmap_sem); + if (!ret) + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_MAPPED_READ_IO, F2FS_BLKSIZE); trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret); @@ -54,6 +60,12 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) bool need_alloc = true; int err = 0; + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return VM_FAULT_SIGBUS; + if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto err; @@ -64,6 +76,10 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto err; } + err = f2fs_convert_inline_inode(inode); + if (err) + goto err; + #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret = f2fs_is_compressed_cluster(inode, page->index); @@ -72,10 +88,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) err = ret; goto err; } else if (ret) { - if (ret < F2FS_I(inode)->i_cluster_size) { - err = -EAGAIN; - goto err; - } need_alloc = false; } } @@ -89,7 +101,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); file_update_time(vmf->vma->vm_file); - down_read(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || @@ -101,18 +113,24 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (need_alloc) { /* block allocation */ - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_block(&dn, page->index); + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!need_alloc) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, 
page->index, LOOKUP_NODE); f2fs_put_dnode(&dn); - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); - if (err) { - unlock_page(page); - goto out_sem; - } + } +#endif + if (err) { + unlock_page(page); + goto out_sem; } - /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false, true); /* wait for GCed page writeback via META_MAPPING */ @@ -136,12 +154,12 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (!PageUptodate(page)) SetPageUptodate(page); - f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); f2fs_update_time(sbi, REQ_TIME); trace_f2fs_vm_page_mkwrite(page, DATA); out_sem: - up_read(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); err: @@ -158,9 +176,11 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) { struct dentry *dentry; - inode = igrab(inode); - dentry = d_find_any_alias(inode); - iput(inode); + /* + * Make sure to get the non-deleted alias. The alias associated with + * the open file descriptor being fsync()'ed may be deleted already. 
+ */ + dentry = d_find_alias(inode); if (!dentry) return 0; @@ -217,13 +237,13 @@ static void try_to_fix_pino(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); nid_t pino; - down_write(&fi->i_sem); + f2fs_down_write(&fi->i_sem); if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { f2fs_i_pino_write(inode, pino); file_got_pino(inode); } - up_write(&fi->i_sem); + f2fs_up_write(&fi->i_sem); } static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, @@ -241,8 +261,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, }; unsigned int seq_id = 0; - if (unlikely(f2fs_readonly(inode->i_sb) || - is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + if (unlikely(f2fs_readonly(inode->i_sb))) return 0; trace_f2fs_sync_file_enter(inode); @@ -256,7 +275,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, ret = file_write_and_wait_range(file, start, end); clear_inode_flag(inode, FI_NEED_IPU); - if (ret) { + if (ret || is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); return ret; } @@ -281,15 +300,27 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, f2fs_exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; + } else { + /* + * for OPU case, during fsync(), node can be persisted before + * data when lower device doesn't support write barrier, result + * in data corruption after SPO. + * So for strict fsync mode, force to use atomic write sematics + * to keep write order in between data/node and last node to + * avoid potential data corruption. + */ + if (F2FS_OPTION(sbi).fsync_mode == + FSYNC_MODE_STRICT && !atomic) + atomic = true; } go_write: /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. 
*/ - down_read(&F2FS_I(inode)->i_sem); + f2fs_down_read(&F2FS_I(inode)->i_sem); cp_reason = need_do_checkpoint(inode); - up_read(&F2FS_I(inode)->i_sem); + f2fs_up_read(&F2FS_I(inode)->i_sem); if (cp_reason) { /* all the dirty node pages should be flushed for POR */ @@ -341,7 +372,8 @@ sync_nodes: f2fs_remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) + if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) || + (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi))) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); @@ -351,7 +383,6 @@ flush_out: f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); - f2fs_trace_ios(NULL, 1); return ret; } @@ -362,32 +393,15 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return f2fs_do_sync_file(file, start, end, datasync, false); } -static pgoff_t __get_first_dirty_index(struct address_space *mapping, - pgoff_t pgofs, int whence) -{ - struct page *page; - int nr_pages; - - if (whence != SEEK_DATA) - return 0; - - /* find first dirty page index */ - nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY, - 1, &page); - if (!nr_pages) - return ULONG_MAX; - pgofs = page->index; - put_page(page); - return pgofs; -} - -static bool __found_offset(struct f2fs_sb_info *sbi, block_t blkaddr, - pgoff_t dirty, pgoff_t pgofs, int whence) +static bool __found_offset(struct address_space *mapping, block_t blkaddr, + pgoff_t index, int whence) { switch (whence) { case SEEK_DATA: - if ((blkaddr == NEW_ADDR && dirty == pgofs) || - __is_valid_data_blkaddr(blkaddr)) + if (__is_valid_data_blkaddr(blkaddr)) + return true; + if (blkaddr == NEW_ADDR && + xa_get_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY)) return true; break; case SEEK_HOLE: @@ -403,7 +417,7 @@ static 
loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; struct dnode_of_data dn; - pgoff_t pgofs, end_offset, dirty; + pgoff_t pgofs, end_offset; loff_t data_ofs = offset; loff_t isize; int err = 0; @@ -415,16 +429,18 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto fail; /* handle inline data case */ - if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { - if (whence == SEEK_HOLE) + if (f2fs_has_inline_data(inode)) { + if (whence == SEEK_HOLE) { data_ofs = isize; - goto found; + goto found; + } else if (whence == SEEK_DATA) { + data_ofs = offset; + goto found; + } } pgofs = (pgoff_t)(offset >> PAGE_SHIFT); - dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); @@ -448,8 +464,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, - dn.node_page, dn.ofs_in_node); + blkaddr = f2fs_data_blkaddr(&dn); if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), @@ -458,7 +473,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto fail; } - if (__found_offset(F2FS_I_SB(inode), blkaddr, dirty, + if (__found_offset(file->f_mapping, blkaddr, pgofs, whence)) { f2fs_put_dnode(&dn); goto found; @@ -484,6 +499,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; + if (f2fs_compressed_file(inode)) + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + switch (whence) { case SEEK_SET: case SEEK_CUR: @@ -503,7 +521,6 @@ static loff_t f2fs_llseek(struct file *file, loff_t 
offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - int err; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -511,11 +528,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - /* we don't need to use inline_data strictly */ - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; set_inode_flag(inode, FI_MMAP_FILE); @@ -551,6 +563,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) bool compressed_cluster = false; int cluster_index = 0, valid_blocks = 0; int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks); if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) base = get_extra_isize(dn->inode); @@ -589,7 +602,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); f2fs_invalidate_blocks(sbi, blkaddr); - nr_free++; + + if (!released || blkaddr != COMPRESS_ADDR) + nr_free++; } if (compressed_cluster) @@ -637,9 +652,6 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; } - if (f2fs_compressed_file(inode)) - return 0; - page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return PTR_ERR(page) == -ENOENT ? 
0 : PTR_ERR(page); @@ -655,7 +667,7 @@ truncate_out: return 0; } -static int do_truncate_blocks(struct inode *inode, u64 from, bool lock) +int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -668,7 +680,7 @@ static int do_truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BLK_ALIGN(from); - if (free_from >= sbi->max_file_blocks) + if (free_from >= max_file_blocks(inode)) goto free_partial; if (lock) @@ -723,23 +735,39 @@ free_partial: int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { u64 free_from = from; + int err; +#ifdef CONFIG_F2FS_FS_COMPRESSION /* * for compressed file, only support cluster size * aligned truncation. */ - if (f2fs_compressed_file(inode)) { - size_t cluster_shift = PAGE_SHIFT + - F2FS_I(inode)->i_log_cluster_size; - size_t cluster_mask = (1 << cluster_shift) - 1; + if (f2fs_compressed_file(inode)) + free_from = round_up(from, + F2FS_I(inode)->i_cluster_size << PAGE_SHIFT); +#endif - free_from = from >> cluster_shift; - if (from & cluster_mask) - free_from++; - free_from <<= cluster_shift; + err = f2fs_do_truncate_blocks(inode, free_from, lock); + if (err) + return err; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * For compressed file, after release compress blocks, don't allow write + * direct, but we should allow write direct after truncate to zero. 
+ */ + if (f2fs_compressed_file(inode) && !free_from + && is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + clear_inode_flag(inode, FI_COMPRESS_RELEASED); + + if (from != free_from) { + err = f2fs_truncate_partial_cluster(inode, from, lock); + if (err) + return err; } +#endif - return do_truncate_blocks(inode, free_from, lock); + return 0; } int f2fs_truncate(struct inode *inode) @@ -760,6 +788,10 @@ int f2fs_truncate(struct inode *inode) return -EIO; } + err = f2fs_dquot_initialize(inode); + if (err) + return err; + /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -776,12 +808,40 @@ int f2fs_truncate(struct inode *inode) return 0; } -int f2fs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +static bool f2fs_force_buffered_io(struct inode *inode, int rw) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!fscrypt_dio_supported(inode)) + return true; + if (fsverity_active(inode)) + return true; + if (f2fs_compressed_file(inode)) + return true; + + /* disallow direct IO if any of devices has unaligned blksize */ + if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) + return true; + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. 
+ */ + if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE)) + return true; + if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi)) + return true; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + return true; + + return false; +} + +int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_inode *ri; + struct f2fs_inode *ri = NULL; unsigned int flags; if (f2fs_has_extra_attr(inode) && @@ -792,7 +852,27 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, stat->btime.tv_nsec = fi->i_crtime.tv_nsec; } + /* + * Return the DIO alignment restrictions if requested. We only return + * this information when requested, since on encrypted files it might + * take a fair bit of work to get if the file wasn't opened recently. + * + * f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN + * cannot represent that, so in that case we report no DIO support. 
+ */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + unsigned int bsize = i_blocksize(inode); + + stat->result_mask |= STATX_DIOALIGN; + if (!f2fs_force_buffered_io(inode, WRITE)) { + stat->dio_mem_align = bsize; + stat->dio_offset_align = bsize; + } + } + flags = fi->i_flags; + if (flags & F2FS_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; if (flags & F2FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (IS_ENCRYPTED(inode)) @@ -804,13 +884,14 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, if (IS_VERITY(inode)) stat->attributes |= STATX_ATTR_VERITY; - stat->attributes_mask |= (STATX_ATTR_APPEND | + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | + STATX_ATTR_APPEND | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(inode, stat); + generic_fillattr(mnt_userns, inode, stat); /* we need to show initial sectors used for inline_data/dentries */ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || @@ -821,14 +902,13 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, } #ifdef CONFIG_F2FS_FS_POSIX_ACL -static void __setattr_copy(struct inode *inode, const struct iattr *attr) +static void __setattr_copy(struct user_namespace *mnt_userns, + struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -837,8 +917,10 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + if (!vfsgid_in_group_p(vfsgid) && + 
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } @@ -847,7 +929,8 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) #define __setattr_copy setattr_copy #endif -int f2fs_setattr(struct dentry *dentry, struct iattr *attr) +int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; @@ -855,11 +938,19 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (unlikely(IS_APPEND(inode) && + (attr->ia_valid & (ATTR_MODE | ATTR_UID | + ATTR_GID | ATTR_TIMES_SET)))) + return -EPERM; + if ((attr->ia_valid & ATTR_SIZE) && !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(mnt_userns, dentry, attr); if (err) return err; @@ -871,17 +962,15 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - if (is_quota_modification(inode, attr)) { - err = dquot_initialize(inode); + if (is_quota_modification(mnt_userns, inode, attr)) { + err = f2fs_dquot_initialize(inode); if (err) return err; } - if ((attr->ia_valid & ATTR_UID && - !uid_eq(attr->ia_uid, inode->i_uid)) || - (attr->ia_valid & ATTR_GID && - !gid_eq(attr->ia_gid, inode->i_gid))) { + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { f2fs_lock_op(F2FS_I_SB(inode)); - err = dquot_transfer(inode, attr); + err = dquot_transfer(mnt_userns, inode, attr); if (err) { set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); @@ -892,10 +981,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * update uid/gid under lock_op(), so that dquot and inode can * be updated atomically. 
*/ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_unlock_op(F2FS_I_SB(inode)); } @@ -913,8 +1000,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) return err; } - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); truncate_setsize(inode, attr->ia_size); @@ -924,23 +1011,25 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger than i_size. */ - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) return err; - down_write(&F2FS_I(inode)->i_sem); + spin_lock(&F2FS_I(inode)->i_size_lock); inode->i_mtime = inode->i_ctime = current_time(inode); F2FS_I(inode)->last_disk_size = i_size_read(inode); - up_write(&F2FS_I(inode)->i_sem); + spin_unlock(&F2FS_I(inode)->i_size_lock); } - __setattr_copy(inode, attr); + __setattr_copy(mnt_userns, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); - if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; + err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode)); + + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + if (!err) + inode->i_mode = F2FS_I(inode)->i_acl_mode; clear_inode_flag(inode, FI_ACL_MODE); } } @@ -959,10 +1048,10 @@ const struct inode_operations f2fs_file_inode_operations = { .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, -#ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, -#endif .fiemap = f2fs_fiemap, + 
.fileattr_get = f2fs_fileattr_get, + .fileattr_set = f2fs_fileattr_set, }; static int fill_zero(struct inode *inode, pgoff_t index, @@ -1057,7 +1146,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) } if (pg_start < pg_end) { - struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1066,18 +1154,17 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); - truncate_inode_pages_range(mapping, blk_start, - blk_end - 1); + truncate_pagecache_range(inode, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1109,19 +1196,19 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.inode, - dn.node_page, dn.ofs_in_node); + *blkaddr = f2fs_data_blkaddr(&dn); if (__is_valid_data_blkaddr(*blkaddr) && !f2fs_is_valid_blkaddr(sbi, *blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) { - if (test_opt(sbi, LFS)) { + if (f2fs_lfs_mode(sbi)) { f2fs_put_dnode(&dn); return -EOPNOTSUPP; } @@ -1189,7 +1276,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (ret) return ret; - ret = f2fs_get_node_info(sbi, dn.nid, &ni); + ret = f2fs_get_node_info(sbi, dn.nid, 
&ni, false); if (ret) { f2fs_put_dnode(&dn); return ret; @@ -1199,8 +1286,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.inode, - dn.node_page, dn.ofs_in_node); + dn.data_blkaddr = f2fs_data_blkaddr(&dn); f2fs_truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { @@ -1234,7 +1320,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, f2fs_put_page(psrc, 1); return PTR_ERR(pdst); } - f2fs_copy_page(psrc, pdst); + memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE); set_page_dirty(pdst); f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); @@ -1312,8 +1398,8 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1321,8 +1407,8 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1352,15 +1438,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); new_size = i_size_read(inode) - len; - truncate_pagecache(inode, new_size); - ret = f2fs_truncate_blocks(inode, new_size, true); - 
up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (!ret) f2fs_i_size_write(inode, new_size); return ret; @@ -1376,8 +1460,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->inode, dn->node_page, - dn->ofs_in_node) == NULL_ADDR) + if (f2fs_data_blkaddr(dn) == NULL_ADDR) count++; } @@ -1388,8 +1471,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = f2fs_data_blkaddr(dn); /* * f2fs_reserve_new_blocks will not guarantee entire block * allocation. @@ -1398,11 +1480,20 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, ret = -ENOSPC; break; } - if (dn->data_blkaddr != NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn->data_blkaddr); - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr(dn); + + if (dn->data_blkaddr == NEW_ADDR) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + break; } + + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); } f2fs_update_extent_cache_range(dn, start, 0, index - start); @@ -1461,8 +1552,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, unsigned int end_offset; pgoff_t end; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(mapping); truncate_pagecache_range(inode, (loff_t)index << PAGE_SHIFT, @@ -1474,8 +1565,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); 
if (ret) { f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1486,8 +1577,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1521,6 +1612,7 @@ out: static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1543,14 +1635,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) return ret; /* write out all dirty pages from offset */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX); if (ret) return ret; @@ -1560,8 +1652,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(mapping); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1577,14 +1669,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); 
f2fs_unlock_op(sbi); } - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); - filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + filemap_invalidate_lock(mapping); + filemap_write_and_wait_range(mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (!ret) f2fs_i_size_write(inode, new_size); @@ -1598,9 +1690,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; - pgoff_t pg_end; + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 0 }; + pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; + block_t expanded = 0; int err; err = inode_newsize_ok(inode, (len + offset)); @@ -1613,11 +1711,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, f2fs_balance_fs(sbi, true); + pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; off_end = (offset + len) & (PAGE_SIZE - 1); - map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; - map.m_len = pg_end - map.m_lblk; + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; if (off_end) map.m_len++; @@ -1625,47 +1724,50 @@ static int expand_inode_data(struct inode *inode, loff_t offset, return 0; if (f2fs_is_pinned_file(inode)) { - block_t len = (map.m_len >> sbi->log_blocks_per_seg) << - sbi->log_blocks_per_seg; - block_t done = 0; - - if (map.m_len % sbi->blocks_per_seg) - len += sbi->blocks_per_seg; + block_t sec_blks = CAP_BLKS_PER_SEC(sbi); + 
block_t sec_len = roundup(map.m_len, sec_blks); - map.m_len = sbi->blocks_per_seg; + map.m_len = sec_blks; next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { - down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) + f2fs_down_write(&sbi->gc_lock); + err = f2fs_gc(sbi, &gc_control); + if (err && err != -ENODATA) goto out_err; } - down_write(&sbi->pin_sem); + f2fs_down_write(&sbi->pin_sem); + + f2fs_lock_op(sbi); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); + f2fs_unlock_op(sbi); + map.m_seg_type = CURSEG_COLD_DATA_PINNED; - f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA); err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); - up_write(&sbi->pin_sem); + file_dont_truncate(inode); + + f2fs_up_write(&sbi->pin_sem); - done += map.m_len; - len -= map.m_len; + expanded += map.m_len; + sec_len -= map.m_len; map.m_lblk += map.m_len; - if (!err && len) + if (!err && sec_len) goto next_alloc; - map.m_len = done; + map.m_len = expanded; } else { err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + expanded = map.m_len; } out_err: if (err) { pgoff_t last_off; - if (!map.m_len) + if (!expanded) return err; - last_off = map.m_lblk + map.m_len - 1; + last_off = pg_start + expanded - 1; /* update new size to the failed position */ new_size = (last_off == pg_end) ? offset + len : @@ -1705,7 +1807,11 @@ static long f2fs_fallocate(struct file *file, int mode, (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; - if (f2fs_compressed_file(inode) && + /* + * Pinned file should not support partial trucation since the block + * can be used by applications. 
+ */ + if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; @@ -1717,6 +1823,10 @@ static long f2fs_fallocate(struct file *file, int mode, inode_lock(inode); + ret = file_modified(file); + if (ret) + goto out; + if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) goto out; @@ -1755,16 +1865,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) atomic_read(&inode->i_writecount) != 1) return 0; - /* some remained atomic pages should discarded */ - if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); - if (f2fs_is_volatile_file(inode)) { - set_inode_flag(inode, FI_DROP_CACHE); - filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(inode, FI_DROP_CACHE); - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - } + f2fs_abort_atomic_write(inode, true); return 0; } @@ -1778,21 +1879,24 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) * until all the writers close its file. Since this should be done * before dropping file lock, it needs to do in ->flush. */ - if (f2fs_is_atomic_file(inode) && - F2FS_I(inode)->inmem_task == current) - f2fs_drop_inmem_pages(inode); + if (F2FS_I(inode)->atomic_write_task == current) + f2fs_abort_atomic_write(inode, true); return 0; } static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) { struct f2fs_inode_info *fi = F2FS_I(inode); + u32 masked_flags = fi->i_flags & mask; + + /* mask can be shrunk by flags_valid selector */ + iflags &= mask; /* Is it quota file? 
Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) return -EPERM; - if ((iflags ^ fi->i_flags) & F2FS_CASEFOLD_FL) { + if ((iflags ^ masked_flags) & F2FS_CASEFOLD_FL) { if (!f2fs_sb_has_casefold(F2FS_I_SB(inode))) return -EOPNOTSUPP; if (!f2fs_empty_dir(inode)) @@ -1806,29 +1910,19 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) return -EINVAL; } - if ((iflags ^ fi->i_flags) & F2FS_COMPR_FL) { - if (S_ISREG(inode->i_mode) && - (fi->i_flags & F2FS_COMPR_FL || i_size_read(inode) || - F2FS_HAS_BLOCKS(inode))) - return -EINVAL; - if (iflags & F2FS_NOCOMP_FL) - return -EINVAL; - if (iflags & F2FS_COMPR_FL) { - int err = f2fs_convert_inline_inode(inode); - - if (err) - return err; - + if ((iflags ^ masked_flags) & F2FS_COMPR_FL) { + if (masked_flags & F2FS_COMPR_FL) { + if (!f2fs_disable_compressed_file(inode)) + return -EINVAL; + } else { if (!f2fs_may_compress(inode)) return -EINVAL; - - set_compress_context(inode); + if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) + return -EINVAL; + if (set_compress_context(inode)) + return -EOPNOTSUPP; } } - if ((iflags ^ fi->i_flags) & F2FS_NOCOMP_FL) { - if (fi->i_flags & F2FS_COMPR_FL) - return -EINVAL; - } fi->i_flags = iflags | (fi->i_flags & ~mask); f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & F2FS_COMPR_FL) && @@ -1845,13 +1939,16 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) return 0; } -/* FS_IOC_GETFLAGS and FS_IOC_SETFLAGS support */ +/* FS_IOC_[GS]ETFLAGS and FS_IOC_FS[GS]ETXATTR support */ /* * To make a new on-disk f2fs i_flag gettable via FS_IOC_GETFLAGS, add an entry * for it to f2fs_fsflags_map[], and add its FS_*_FL equivalent to * F2FS_GETTABLE_FS_FL. To also make it settable via FS_IOC_SETFLAGS, also add * its FS_*_FL equivalent to F2FS_SETTABLE_FS_FL. + * + * Translating flags to fsx_flags value used by FS_IOC_FSGETXATTR and + * FS_IOC_FSSETXATTR is done by the VFS. 
*/ static const struct { @@ -1926,67 +2023,6 @@ static inline u32 f2fs_fsflags_to_iflags(u32 fsflags) return iflags; } -static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) -{ - struct inode *inode = file_inode(filp); - struct f2fs_inode_info *fi = F2FS_I(inode); - u32 fsflags = f2fs_iflags_to_fsflags(fi->i_flags); - - if (IS_ENCRYPTED(inode)) - fsflags |= FS_ENCRYPT_FL; - if (IS_VERITY(inode)) - fsflags |= FS_VERITY_FL; - if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) - fsflags |= FS_INLINE_DATA_FL; - if (is_inode_flag_set(inode, FI_PIN_FILE)) - fsflags |= FS_NOCOW_FL; - - fsflags &= F2FS_GETTABLE_FS_FL; - - return put_user(fsflags, (int __user *)arg); -} - -static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) -{ - struct inode *inode = file_inode(filp); - struct f2fs_inode_info *fi = F2FS_I(inode); - u32 fsflags, old_fsflags; - u32 iflags; - int ret; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - if (get_user(fsflags, (int __user *)arg)) - return -EFAULT; - - if (fsflags & ~F2FS_GETTABLE_FS_FL) - return -EOPNOTSUPP; - fsflags &= F2FS_SETTABLE_FS_FL; - - iflags = f2fs_fsflags_to_iflags(fsflags); - if (f2fs_mask_flags(inode->i_mode, iflags) != iflags) - return -EOPNOTSUPP; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - old_fsflags = f2fs_iflags_to_fsflags(fi->i_flags); - ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags); - if (ret) - goto out; - - ret = f2fs_setflags_common(inode, iflags, - f2fs_fsflags_to_iflags(F2FS_SETTABLE_FS_FL)); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1997,11 +2033,13 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) static int f2fs_ioc_start_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = 
file_mnt_user_ns(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode *pinode; int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; if (!S_ISREG(inode->i_mode)) @@ -2016,47 +2054,60 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - f2fs_disable_compressed_file(inode); - - if (f2fs_is_atomic_file(inode)) { - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) - ret = -EINVAL; + if (!f2fs_disable_compressed_file(inode)) { + ret = -EINVAL; goto out; } + if (f2fs_is_atomic_file(inode)) + goto out; + ret = f2fs_convert_inline_inode(inode); if (ret) goto out; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); /* * Should wait end_io to count F2FS_WB_CP_DATA correctly by * f2fs_is_atomic_file. */ if (get_dirty_pages(inode)) - f2fs_warn(F2FS_I_SB(inode), "Unexpected flush for atomic writes: ino=%lu, npages=%u", + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(&fi->inmem_ilist)) - list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); - sbi->atomic_files++; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + /* Create a COW inode for atomic write */ + pinode = f2fs_iget(inode->i_sb, fi->i_pino); + if (IS_ERR(pinode)) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + ret = PTR_ERR(pinode); + goto out; + } + + ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); + iput(pinode); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } + f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); + + stat_inc_atomic_inode(inode); - /* add inode in inmem_list first and set 
atomic_file */ set_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + set_inode_flag(fi->cow_inode, FI_COW_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - F2FS_I(inode)->inmem_task = current; + f2fs_update_time(sbi, REQ_TIME); + fi->atomic_write_task = current; stat_update_max_atomic_write(inode); + fi->atomic_write_cnt = 0; out: inode_unlock(inode); mnt_drop_write_file(filp); @@ -2066,9 +2117,10 @@ out: static int f2fs_ioc_commit_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2079,119 +2131,39 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); - if (f2fs_is_volatile_file(inode)) { - ret = -EINVAL; - goto err_out; - } - if (f2fs_is_atomic_file(inode)) { - ret = f2fs_commit_inmem_pages(inode); + ret = f2fs_commit_atomic_write(inode); if (ret) - goto err_out; + goto unlock_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, false); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -err_out: - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - ret = -EINVAL; - } +unlock_out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; } -static int f2fs_ioc_start_volatile_write(struct file *filp) +static int f2fs_ioc_abort_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return 
-EACCES; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - ret = mnt_want_write_file(filp); if (ret) return ret; inode_lock(inode); - if (f2fs_is_volatile_file(inode)) - goto out; - - ret = f2fs_convert_inline_inode(inode); - if (ret) - goto out; - - stat_inc_volatile_write(inode); - stat_update_max_volatile_write(inode); - - set_inode_flag(inode, FI_VOLATILE_FILE); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_release_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - int ret; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (!f2fs_is_volatile_file(inode)) - goto out; - - if (!f2fs_is_first_block_written(inode)) { - ret = truncate_partial_data_page(inode, 0, true); - goto out; - } - - ret = punch_hole(inode, 0, F2FS_BLKSIZE); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_abort_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - int ret; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); - if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - } - - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + f2fs_abort_atomic_write(inode, true); inode_unlock(inode); @@ -2216,38 +2188,42 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (in != F2FS_GOING_DOWN_FULLSYNC) { ret = mnt_want_write_file(filp); - if (ret) + if (ret) { + if (ret == -EROFS) { + ret = 0; + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_SHUTDOWN); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + 
trace_f2fs_shutdown(sbi, in, ret); + } return ret; + } } switch (in) { case F2FS_GOING_DOWN_FULLSYNC: - sb = freeze_bdev(sb->s_bdev); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); + ret = freeze_bdev(sb->s_bdev); + if (ret) goto out; - } - if (sb) { - f2fs_stop_checkpoint(sbi, false); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); - thaw_bdev(sb->s_bdev, sb); - } + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + thaw_bdev(sb->s_bdev); break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ ret = f2fs_sync_fs(sb, 1); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NEED_FSCK: @@ -2282,7 +2258,6 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret; @@ -2301,7 +2276,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) return ret; range.minlen = max((unsigned int)range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(sb->s_bdev)); ret = f2fs_trim_fs(F2FS_SB(sb), &range); mnt_drop_write_file(filp); if (ret < 0) @@ -2356,7 +2331,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (err) return err; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) goto got_it; @@ -2375,7 +2350,7 @@ got_it: 
16)) err = -EFAULT; out_err: - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); mnt_drop_write_file(filp); return err; } @@ -2423,10 +2398,22 @@ static int f2fs_ioc_get_encryption_key_status(struct file *filp, return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); } +static int f2fs_ioc_get_encryption_nonce(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_nonce(filp, (void __user *)arg); +} + static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .no_bg_gc = false, + .should_migrate_blocks = false, + .nr_free_secs = 0 }; __u32 sync; int ret; @@ -2444,40 +2431,41 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return ret; if (!sync) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); + gc_control.init_gc_type = sync ? FG_GC : BG_GC; + gc_control.err_gc_skipped = sync; + ret = f2fs_gc(sbi, &gc_control); out: mnt_drop_write_file(filp); return ret; } -static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) { - struct inode *inode = file_inode(filp); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_gc_range range; + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + struct f2fs_gc_control gc_control = { + .init_gc_type = range->sync ? 
FG_GC : BG_GC, + .no_bg_gc = false, + .should_migrate_blocks = false, + .err_gc_skipped = range->sync, + .nr_free_secs = 0 }; u64 end; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - - if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, - sizeof(range))) - return -EFAULT; - if (f2fs_readonly(sbi->sb)) return -EROFS; - end = range.start + range.len; - if (end < range.start || range.start < MAIN_BLKADDR(sbi) || + end = range->start + range->len; + if (end < range->start || range->start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) return -EINVAL; @@ -2486,24 +2474,40 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return ret; do_more: - if (!range.sync) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!range->sync) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); - range.start += BLKS_PER_SEC(sbi); - if (range.start <= end) + gc_control.victim_segno = GET_SEGNO(sbi, range->start); + ret = f2fs_gc(sbi, &gc_control); + if (ret) { + if (ret == -EBUSY) + ret = -EAGAIN; + goto out; + } + range->start += CAP_BLKS_PER_SEC(sbi); + if (range->start <= end) goto do_more; out: mnt_drop_write_file(filp); return ret; } +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct f2fs_gc_range range; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + return __f2fs_ioc_gc_range(filp, &range); +} + static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -2537,7 +2541,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_extent = NULL, - .m_seg_type = NO_CHECK_TYPE , + .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; struct extent_info ei = 
{0, 0, 0}; pgoff_t pg_start, pg_end, next_pgofs; @@ -2547,10 +2551,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, bool fragmented = false; int err; - /* if in-place-update policy is enabled, don't waste time here */ - if (f2fs_should_update_inplace(inode, NULL)) - return -EINVAL; - pg_start = range->start >> PAGE_SHIFT; pg_end = (range->start + range->len) >> PAGE_SHIFT; @@ -2558,6 +2558,13 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, inode_lock(inode); + /* if in-place-update policy is enabled, don't waste time here */ + set_inode_flag(inode, FI_OPU_WRITE); + if (f2fs_should_update_inplace(inode, NULL)) { + err = -EINVAL; + goto out; + } + /* writeback all dirty pages in the range */ err = filemap_write_and_wait_range(inode->i_mapping, range->start, range->start + range->len - 1); @@ -2608,7 +2615,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto out; } - sec_num = DIV_ROUND_UP(total, BLKS_PER_SEC(sbi)); + sec_num = DIV_ROUND_UP(total, CAP_BLKS_PER_SEC(sbi)); /* * make sure there are enough free section for LFS allocation, this can @@ -2639,7 +2646,7 @@ do_map: goto check; } - set_inode_flag(inode, FI_DO_DEFRAG); + set_inode_flag(inode, FI_SKIP_WRITES); idx = map.m_lblk; while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { @@ -2652,6 +2659,7 @@ do_map: } set_page_dirty(page); + set_page_private_gcing(page); f2fs_put_page(page, 1); idx++; @@ -2664,15 +2672,16 @@ check: if (map.m_lblk < pg_end && cnt < blk_per_seg) goto do_map; - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); err = filemap_fdatawrite(inode->i_mapping); if (err) goto out; } clear_out: - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); out: + clear_inode_flag(inode, FI_OPU_WRITE); inode_unlock(inode); if (!err) range->len = (u64)total << PAGE_SHIFT; @@ -2704,7 +2713,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) return -EINVAL; if 
(unlikely((range.start + range.len) >> PAGE_SHIFT > - sbi->max_file_blocks)) + max_file_blocks(inode))) return -EINVAL; err = mnt_want_write_file(filp); @@ -2748,6 +2757,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst)) return -EOPNOTSUPP; + if (pos_out < 0 || pos_in < 0) + return -EINVAL; + if (src == dst) { if (pos_in == pos_out) return 0; @@ -2805,10 +2817,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_balance_fs(sbi, true); - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + if (!f2fs_down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) goto out_src; } @@ -2826,9 +2838,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_unlock_op(sbi); if (src != dst) - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); out_src: - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); out_unlock: if (src != dst) inode_unlock(dst); @@ -2837,9 +2849,9 @@ out: return ret; } -static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +static int __f2fs_ioc_move_range(struct file *filp, + struct f2fs_move_range *range) { - struct f2fs_move_range range; struct fd dst; int err; @@ -2847,11 +2859,7 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) !(filp->f_mode & FMODE_WRITE)) return -EBADF; - if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, - sizeof(range))) - return -EFAULT; - - dst = fdget(range.dst_fd); + dst = fdget(range->dst_fd); if (!dst.file) return -EBADF; @@ -2864,21 +2872,25 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) if (err) goto err_out; - err = f2fs_move_file_range(filp, range.pos_in, dst.file, - range.pos_out, range.len); + err = 
f2fs_move_file_range(filp, range->pos_in, dst.file, + range->pos_out, range->len); mnt_drop_write_file(filp); - if (err) - goto err_out; - - if (copy_to_user((struct f2fs_move_range __user *)arg, - &range, sizeof(range))) - err = -EFAULT; err_out: fdput(dst); return err; } +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + return __f2fs_ioc_move_range(filp, &range); +} + static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -2887,6 +2899,11 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) unsigned int start_segno = 0, end_segno = 0; unsigned int dev_start_segno = 0, dev_end_segno = 0; struct f2fs_flush_device range; + struct f2fs_gc_control gc_control = { + .init_gc_type = FG_GC, + .should_migrate_blocks = true, + .err_gc_skipped = true, + .nr_free_secs = 0 }; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2923,14 +2940,16 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) end_segno = min(start_segno + range.segments, dev_end_segno); while (start_segno < end_segno) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, start_segno); + + gc_control.victim_segno = start_segno; + ret = f2fs_gc(sbi, &gc_control); if (ret == -EAGAIN) ret = 0; else if (ret < 0) @@ -2971,12 +2990,11 @@ int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) return err; } -static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) { - struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); 
struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *ipage; + struct f2fs_inode *ri = NULL; kprojid_t kprojid; int err; @@ -2992,7 +3010,7 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) kprojid = make_kprojid(&init_user_ns, (projid_t)projid); - if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + if (projid_eq(kprojid, fi->i_projid)) return 0; err = -EPERM; @@ -3000,19 +3018,10 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) if (IS_NOQUOTA(inode)) return err; - ipage = f2fs_get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); + if (!F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + return -EOVERFLOW; - if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize, - i_projid)) { - err = -EOVERFLOW; - f2fs_put_page(ipage, 1); - return err; - } - f2fs_put_page(ipage, 1); - - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; @@ -3021,7 +3030,7 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) if (err) goto out_unlock; - F2FS_I(inode)->i_projid = kprojid; + fi->i_projid = kprojid; inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: @@ -3034,7 +3043,7 @@ int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) return 0; } -static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) { if (projid != F2FS_DEF_PROJID) return -EOPNOTSUPP; @@ -3042,123 +3051,55 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) } #endif -/* FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR support */ - -/* - * To make a new on-disk f2fs i_flag gettable via FS_IOC_FSGETXATTR and settable - * via FS_IOC_FSSETXATTR, add an entry for it to f2fs_xflags_map[], and add its - * FS_XFLAG_* equivalent to F2FS_SUPPORTED_XFLAGS. 
- */ - -static const struct { - u32 iflag; - u32 xflag; -} f2fs_xflags_map[] = { - { F2FS_SYNC_FL, FS_XFLAG_SYNC }, - { F2FS_IMMUTABLE_FL, FS_XFLAG_IMMUTABLE }, - { F2FS_APPEND_FL, FS_XFLAG_APPEND }, - { F2FS_NODUMP_FL, FS_XFLAG_NODUMP }, - { F2FS_NOATIME_FL, FS_XFLAG_NOATIME }, - { F2FS_PROJINHERIT_FL, FS_XFLAG_PROJINHERIT }, -}; - -#define F2FS_SUPPORTED_XFLAGS ( \ - FS_XFLAG_SYNC | \ - FS_XFLAG_IMMUTABLE | \ - FS_XFLAG_APPEND | \ - FS_XFLAG_NODUMP | \ - FS_XFLAG_NOATIME | \ - FS_XFLAG_PROJINHERIT) - -/* Convert f2fs on-disk i_flags to FS_IOC_FS{GET,SET}XATTR flags */ -static inline u32 f2fs_iflags_to_xflags(u32 iflags) -{ - u32 xflags = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(f2fs_xflags_map); i++) - if (iflags & f2fs_xflags_map[i].iflag) - xflags |= f2fs_xflags_map[i].xflag; - - return xflags; -} - -/* Convert FS_IOC_FS{GET,SET}XATTR flags to f2fs on-disk i_flags */ -static inline u32 f2fs_xflags_to_iflags(u32 xflags) -{ - u32 iflags = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(f2fs_xflags_map); i++) - if (xflags & f2fs_xflags_map[i].xflag) - iflags |= f2fs_xflags_map[i].iflag; - - return iflags; -} - -static void f2fs_fill_fsxattr(struct inode *inode, struct fsxattr *fa) +int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { + struct inode *inode = d_inode(dentry); struct f2fs_inode_info *fi = F2FS_I(inode); + u32 fsflags = f2fs_iflags_to_fsflags(fi->i_flags); + + if (IS_ENCRYPTED(inode)) + fsflags |= FS_ENCRYPT_FL; + if (IS_VERITY(inode)) + fsflags |= FS_VERITY_FL; + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) + fsflags |= FS_INLINE_DATA_FL; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + fsflags |= FS_NOCOW_FL; - simple_fill_fsxattr(fa, f2fs_iflags_to_xflags(fi->i_flags)); + fileattr_fill_flags(fa, fsflags & F2FS_GETTABLE_FS_FL); if (f2fs_sb_has_project_quota(F2FS_I_SB(inode))) fa->fsx_projid = from_kprojid(&init_user_ns, fi->i_projid); -} -static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg) -{ - struct 
inode *inode = file_inode(filp); - struct fsxattr fa; - - f2fs_fill_fsxattr(inode, &fa); - - if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa))) - return -EFAULT; return 0; } -static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) +int f2fs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = file_inode(filp); - struct fsxattr fa, old_fa; + struct inode *inode = d_inode(dentry); + u32 fsflags = fa->flags, mask = F2FS_SETTABLE_FS_FL; u32 iflags; int err; - if (copy_from_user(&fa, (struct fsxattr __user *)arg, sizeof(fa))) - return -EFAULT; - - /* Make sure caller has proper permission */ - if (!inode_owner_or_capable(inode)) - return -EACCES; - - if (fa.fsx_xflags & ~F2FS_SUPPORTED_XFLAGS) + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) + return -ENOSPC; + if (fsflags & ~F2FS_GETTABLE_FS_FL) return -EOPNOTSUPP; + fsflags &= F2FS_SETTABLE_FS_FL; + if (!fa->flags_valid) + mask &= FS_COMMON_FL; - iflags = f2fs_xflags_to_iflags(fa.fsx_xflags); + iflags = f2fs_fsflags_to_iflags(fsflags); if (f2fs_mask_flags(inode->i_mode, iflags) != iflags) return -EOPNOTSUPP; - err = mnt_want_write_file(filp); - if (err) - return err; - - inode_lock(inode); - - f2fs_fill_fsxattr(inode, &old_fa); - err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa); - if (err) - goto out; - - err = f2fs_setflags_common(inode, iflags, - f2fs_xflags_to_iflags(F2FS_SUPPORTED_XFLAGS)); - if (err) - goto out; + err = f2fs_setflags_common(inode, iflags, f2fs_fsflags_to_iflags(mask)); + if (!err) + err = f2fs_ioc_setproject(inode, fa->fsx_projid); - err = f2fs_ioc_setproject(filp, fa.fsx_projid); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); return err; } @@ -3203,17 +3144,17 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) inode_lock(inode); - if (f2fs_should_update_outplace(inode, NULL)) { - ret = -EINVAL; - goto 
out; - } - if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); f2fs_i_gc_failures_write(inode, 0); goto done; } + if (f2fs_should_update_outplace(inode, NULL)) { + ret = -EINVAL; + goto out; + } + if (f2fs_pin_file_control(inode, false)) { ret = -EAGAIN; goto out; @@ -3223,7 +3164,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (ret) goto out; - if (f2fs_disable_compressed_file(inode)) { + if (!f2fs_disable_compressed_file(inode)) { ret = -EOPNOTSUPP; goto out; } @@ -3264,21 +3205,21 @@ int f2fs_precache_extents(struct inode *inode) map.m_next_extent = &m_next_extent; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - end = F2FS_I_SB(inode)->max_file_blocks; + end = max_file_blocks(inode); while (map.m_lblk < end) { map.m_len = end - map.m_lblk; - down_write(&fi->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); - up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); if (err) return err; map.m_lblk = m_next_extent; } - return err; + return 0; } static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg) @@ -3290,7 +3231,6 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) { struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); __u64 block_count; - int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3302,9 +3242,7 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) sizeof(block_count))) return -EFAULT; - ret = f2fs_resize_fs(sbi, block_count); - - return ret; + return f2fs_resize_fs(sbi, block_count); } static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) @@ -3315,7 +3253,7 @@ static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) { f2fs_warn(F2FS_I_SB(inode), - "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem.\n", + "Can't enable fs-verity on inode %lu: 
the verity feature is not enabled on this filesystem", inode->i_ino); return -EOPNOTSUPP; } @@ -3331,7 +3269,15 @@ static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg) return fsverity_ioctl_measure(filp, (void __user *)arg); } -static int f2fs_get_volume_name(struct file *filp, unsigned long arg) +static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); +} + +static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -3343,21 +3289,21 @@ static int f2fs_get_volume_name(struct file *filp, unsigned long arg) if (!vbuf) return -ENOMEM; - down_read(&sbi->sb_lock); + f2fs_down_read(&sbi->sb_lock); count = utf16s_to_utf8s(sbi->raw_super->volume_name, ARRAY_SIZE(sbi->raw_super->volume_name), UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME); - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); if (copy_to_user((char __user *)arg, vbuf, min(FSLABEL_MAX, count))) err = -EFAULT; - kvfree(vbuf); + kfree(vbuf); return err; } -static int f2fs_set_volume_name(struct file *filp, unsigned long arg) +static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -3375,7 +3321,7 @@ static int f2fs_set_volume_name(struct file *filp, unsigned long arg) if (err) goto out; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); memset(sbi->raw_super->volume_name, 0, sizeof(sbi->raw_super->volume_name)); @@ -3385,7 +3331,7 @@ static int f2fs_set_volume_name(struct file *filp, unsigned long arg) err = f2fs_commit_super(sbi, false); - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); mnt_drop_write_file(filp); out: @@ -3393,39 +3339,815 @@ out: return err; } -long f2fs_ioctl(struct file *filp, 
unsigned int cmd, unsigned long arg) +static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg) { - if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) - return -EIO; - if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp)))) - return -ENOSPC; + struct inode *inode = file_inode(filp); + __u64 blocks; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + blocks = atomic_read(&F2FS_I(inode)->i_compr_blocks); + return put_user(blocks, (u64 __user *)arg); +} + +static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int released_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + block_t blkaddr; + int i; + + for (i = 0; i < count; i++) { + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + return -EFSCORRUPTED; + } + } + + while (count) { + int compr_blocks = 0; + + for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { + blkaddr = f2fs_data_blkaddr(dn); + + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + continue; + dn->ofs_in_node += cluster_size; + goto next; + } + + if (__is_valid_data_blkaddr(blkaddr)) + compr_blocks++; + + if (blkaddr != NEW_ADDR) + continue; + + dn->data_blkaddr = NULL_ADDR; + f2fs_set_data_blkaddr(dn); + } + + f2fs_i_compr_blocks_update(dn->inode, compr_blocks, false); + dec_valid_block_count(sbi, dn->inode, + cluster_size - compr_blocks); + + released_blocks += cluster_size - compr_blocks; +next: + count -= cluster_size; + } + + return released_blocks; +} + +static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + 
pgoff_t page_idx = 0, last_idx; + unsigned int released_blocks = 0; + int ret; + int writecount; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + inode_lock(inode); + + writecount = atomic_read(&inode->i_writecount); + if ((filp->f_mode & FMODE_WRITE && writecount != 1) || + (!(filp->f_mode & FMODE_WRITE) && writecount)) { + ret = -EBUSY; + goto out; + } + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + set_inode_flag(inode, FI_COMPRESS_RELEASED); + inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, true); + + if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) + goto out; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + while (page_idx < last_idx) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) { + page_idx = f2fs_get_next_page_offset(&dn, + page_idx); + ret = 0; + continue; + } + break; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); + count = round_up(count, F2FS_I(inode)->i_cluster_size); + + ret = release_compress_blocks(&dn, count); + + f2fs_put_dnode(&dn); + + if (ret < 0) + break; + + page_idx += count; + released_blocks += ret; + } + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); +out: + inode_unlock(inode); + + mnt_drop_write_file(filp); + + if (ret >= 0) { + ret = 
put_user(released_blocks, (u64 __user *)arg); + } else if (released_blocks && + atomic_read(&F2FS_I(inode)->i_compr_blocks)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " + "iblocks=%llu, released=%u, compr_blocks=%u, " + "run fsck to fix.", + __func__, inode->i_ino, inode->i_blocks, + released_blocks, + atomic_read(&F2FS_I(inode)->i_compr_blocks)); + } + + return ret; +} + +static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int reserved_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + block_t blkaddr; + int i; + + for (i = 0; i < count; i++) { + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + return -EFSCORRUPTED; + } + } + + while (count) { + int compr_blocks = 0; + blkcnt_t reserved; + int ret; + + for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { + blkaddr = f2fs_data_blkaddr(dn); + + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + continue; + dn->ofs_in_node += cluster_size; + goto next; + } + + if (__is_valid_data_blkaddr(blkaddr)) { + compr_blocks++; + continue; + } + + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); + } + + reserved = cluster_size - compr_blocks; + ret = inc_valid_block_count(sbi, dn->inode, &reserved); + if (ret) + return ret; + + if (reserved != cluster_size - compr_blocks) + return -ENOSPC; + + f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true); + + reserved_blocks += reserved; +next: + count -= cluster_size; + } + + return reserved_blocks; +} +static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t page_idx = 0, last_idx; 
+ unsigned int reserved_blocks = 0; + int ret; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (atomic_read(&F2FS_I(inode)->i_compr_blocks)) + goto out; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + inode_lock(inode); + + if (!is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto unlock_inode; + } + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + while (page_idx < last_idx) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) { + page_idx = f2fs_get_next_page_offset(&dn, + page_idx); + ret = 0; + continue; + } + break; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); + count = round_up(count, F2FS_I(inode)->i_cluster_size); + + ret = reserve_compress_blocks(&dn, count); + + f2fs_put_dnode(&dn); + + if (ret < 0) + break; + + page_idx += count; + reserved_blocks += ret; + } + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + if (ret >= 0) { + clear_inode_flag(inode, FI_COMPRESS_RELEASED); + inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, true); + } +unlock_inode: + inode_unlock(inode); +out: + mnt_drop_write_file(filp); + + if (ret >= 0) { + ret = put_user(reserved_blocks, (u64 __user *)arg); + } else if (reserved_blocks && + atomic_read(&F2FS_I(inode)->i_compr_blocks)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " + "iblocks=%llu, reserved=%u, compr_blocks=%u, " + "run fsck to fix.", + 
__func__, inode->i_ino, inode->i_blocks, + reserved_blocks, + atomic_read(&F2FS_I(inode)->i_compr_blocks)); + } + + return ret; +} + +static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, + pgoff_t off, block_t block, block_t len, u32 flags) +{ + sector_t sector = SECTOR_FROM_BLOCK(block); + sector_t nr_sects = SECTOR_FROM_BLOCK(len); + int ret = 0; + + if (flags & F2FS_TRIM_FILE_DISCARD) { + if (bdev_max_secure_erase_sectors(bdev)) + ret = blkdev_issue_secure_erase(bdev, sector, nr_sects, + GFP_NOFS); + else + ret = blkdev_issue_discard(bdev, sector, nr_sects, + GFP_NOFS); + } + + if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) { + if (IS_ENCRYPTED(inode)) + ret = fscrypt_zeroout_range(inode, off, block, len); + else + ret = blkdev_issue_zeroout(bdev, sector, nr_sects, + GFP_NOFS, 0); + } + + return ret; +} + +static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct block_device *prev_bdev = NULL; + struct f2fs_sectrim_range range; + pgoff_t index, pg_end, prev_index = 0; + block_t prev_block = 0, len = 0; + loff_t end_addr; + bool to_end = false; + int ret = 0; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_sectrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (range.flags == 0 || (range.flags & ~F2FS_TRIM_FILE_MASK) || + !S_ISREG(inode->i_mode)) + return -EINVAL; + + if (((range.flags & F2FS_TRIM_FILE_DISCARD) && + !f2fs_hw_support_discard(sbi)) || + ((range.flags & F2FS_TRIM_FILE_ZEROOUT) && + IS_ENCRYPTED(inode) && f2fs_is_multi_device(sbi))) + return -EOPNOTSUPP; + + file_start_write(filp); + inode_lock(inode); + + if (f2fs_is_atomic_file(inode) || f2fs_compressed_file(inode) || + range.start >= inode->i_size) { + ret = -EINVAL; + goto err; + } + + if (range.len == 0) + goto err; + + if (inode->i_size - 
range.start > range.len) { + end_addr = range.start + range.len; + } else { + end_addr = range.len == (u64)-1 ? + sbi->sb->s_maxbytes : inode->i_size; + to_end = true; + } + + if (!IS_ALIGNED(range.start, F2FS_BLKSIZE) || + (!to_end && !IS_ALIGNED(end_addr, F2FS_BLKSIZE))) { + ret = -EINVAL; + goto err; + } + + index = F2FS_BYTES_TO_BLK(range.start); + pg_end = DIV_ROUND_UP(end_addr, F2FS_BLKSIZE); + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto err; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(mapping); + + ret = filemap_write_and_wait_range(mapping, range.start, + to_end ? LLONG_MAX : end_addr - 1); + if (ret) + goto out; + + truncate_inode_pages_range(mapping, range.start, + to_end ? -1 : end_addr - 1); + + while (index < pg_end) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + int i; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) { + index = f2fs_get_next_page_offset(&dn, index); + continue; + } + goto out; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + count = min(end_offset - dn.ofs_in_node, pg_end - index); + for (i = 0; i < count; i++, index++, dn.ofs_in_node++) { + struct block_device *cur_bdev; + block_t blkaddr = f2fs_data_blkaddr(&dn); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); + goto out; + } + + cur_bdev = f2fs_target_device(sbi, blkaddr, NULL); + if (f2fs_is_multi_device(sbi)) { + int di = f2fs_target_device_index(sbi, blkaddr); + + blkaddr -= FDEV(di).start_blk; + } + + if (len) { + if (prev_bdev == cur_bdev && + index == prev_index + len && + blkaddr == prev_block + len) { + len++; + } else { + ret = f2fs_secure_erase(prev_bdev, + inode, prev_index, prev_block, + len, range.flags); + if (ret) { + 
f2fs_put_dnode(&dn); + goto out; + } + + len = 0; + } + } + + if (!len) { + prev_bdev = cur_bdev; + prev_index = index; + prev_block = blkaddr; + len = 1; + } + } + + f2fs_put_dnode(&dn); + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + cond_resched(); + } + + if (len) + ret = f2fs_secure_erase(prev_bdev, inode, prev_index, + prev_block, len, range.flags); +out: + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); +err: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static int f2fs_ioc_get_compress_option(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_comp_option option; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + inode_lock_shared(inode); + + if (!f2fs_compressed_file(inode)) { + inode_unlock_shared(inode); + return -ENODATA; + } + + option.algorithm = F2FS_I(inode)->i_compress_algorithm; + option.log_cluster_size = F2FS_I(inode)->i_log_cluster_size; + + inode_unlock_shared(inode); + + if (copy_to_user((struct f2fs_comp_option __user *)arg, &option, + sizeof(option))) + return -EFAULT; + + return 0; +} + +static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_comp_option option; + int ret = 0; + + if (!f2fs_sb_has_compression(sbi)) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&option, (struct f2fs_comp_option __user *)arg, + sizeof(option))) + return -EFAULT; + + if (!f2fs_compressed_file(inode) || + option.log_cluster_size < MIN_COMPRESS_LOG_SIZE || + option.log_cluster_size > MAX_COMPRESS_LOG_SIZE || + option.algorithm >= COMPRESS_MAX) + return -EINVAL; + + file_start_write(filp); + inode_lock(inode); + + if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) { + ret = -EBUSY; + goto out; + } + + if (inode->i_size 
!= 0) { + ret = -EFBIG; + goto out; + } + + F2FS_I(inode)->i_compress_algorithm = option.algorithm; + F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; + F2FS_I(inode)->i_cluster_size = 1 << option.log_cluster_size; + f2fs_mark_inode_dirty_sync(inode, true); + + if (!f2fs_is_compress_backend_ready(inode)) + f2fs_warn(sbi, "compression algorithm is successfully set, " + "but current kernel doesn't support this algorithm."); +out: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) +{ + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, page_idx); + struct address_space *mapping = inode->i_mapping; + struct page *page; + pgoff_t redirty_idx = page_idx; + int i, page_len = 0, ret = 0; + + page_cache_ra_unbounded(&ractl, len, 0); + + for (i = 0; i < len; i++, page_idx++) { + page = read_cache_page(mapping, page_idx, NULL, NULL); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + break; + } + page_len++; + } + + for (i = 0; i < page_len; i++, redirty_idx++) { + page = find_lock_page(mapping, redirty_idx); + + /* It will never fail, when page has pinned above */ + f2fs_bug_on(F2FS_I_SB(inode), !page); + + set_page_dirty(page); + f2fs_put_page(page, 1); + f2fs_put_page(page, 0); + } + + return ret; +} + +static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int blk_per_seg = sbi->blocks_per_seg; + int cluster_size = fi->i_cluster_size; + int count, ret; + + if (!f2fs_sb_has_compression(sbi) || + F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + file_start_write(filp); + inode_lock(inode); + + 
if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + if (!atomic_read(&fi->i_compr_blocks)) + goto out; + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + count = last_idx - page_idx; + while (count) { + int len = min(cluster_size, count); + + ret = redirty_blocks(inode, page_idx, len); + if (ret < 0) + break; + + if (get_dirty_pages(inode) >= blk_per_seg) + filemap_fdatawrite(inode->i_mapping); + + count -= len; + page_idx += len; + } + + if (!ret) + ret = filemap_write_and_wait_range(inode->i_mapping, 0, + LLONG_MAX); + + if (ret) + f2fs_warn(sbi, "%s: The file might be partially decompressed (errno=%d). Please delete the file.", + __func__, ret); +out: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int blk_per_seg = sbi->blocks_per_seg; + int cluster_size = F2FS_I(inode)->i_cluster_size; + int count, ret; + + if (!f2fs_sb_has_compression(sbi) || + F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + file_start_write(filp); + inode_lock(inode); + + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + set_inode_flag(inode, FI_ENABLE_COMPRESS); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + count = 
last_idx - page_idx; + while (count) { + int len = min(cluster_size, count); + + ret = redirty_blocks(inode, page_idx, len); + if (ret < 0) + break; + + if (get_dirty_pages(inode) >= blk_per_seg) + filemap_fdatawrite(inode->i_mapping); + + count -= len; + page_idx += len; + } + + if (!ret) + ret = filemap_write_and_wait_range(inode->i_mapping, 0, + LLONG_MAX); + + clear_inode_flag(inode, FI_ENABLE_COMPRESS); + + if (ret) + f2fs_warn(sbi, "%s: The file might be partially compressed (errno=%d). Please delete the file.", + __func__, ret); +out: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ switch (cmd) { - case F2FS_IOC_GETFLAGS: - return f2fs_ioc_getflags(filp, arg); - case F2FS_IOC_SETFLAGS: - return f2fs_ioc_setflags(filp, arg); - case F2FS_IOC_GETVERSION: + case FS_IOC_GETVERSION: return f2fs_ioc_getversion(filp, arg); case F2FS_IOC_START_ATOMIC_WRITE: return f2fs_ioc_start_atomic_write(filp); case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); + case F2FS_IOC_ABORT_ATOMIC_WRITE: + return f2fs_ioc_abort_atomic_write(filp); case F2FS_IOC_START_VOLATILE_WRITE: - return f2fs_ioc_start_volatile_write(filp); case F2FS_IOC_RELEASE_VOLATILE_WRITE: - return f2fs_ioc_release_volatile_write(filp); - case F2FS_IOC_ABORT_VOLATILE_WRITE: - return f2fs_ioc_abort_volatile_write(filp); + return -EOPNOTSUPP; case F2FS_IOC_SHUTDOWN: return f2fs_ioc_shutdown(filp, arg); case FITRIM: return f2fs_ioc_fitrim(filp, arg); - case F2FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_SET_ENCRYPTION_POLICY: return f2fs_ioc_set_encryption_policy(filp, arg); - case F2FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY: return f2fs_ioc_get_encryption_policy(filp, arg); - case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case FS_IOC_GET_ENCRYPTION_PWSALT: return f2fs_ioc_get_encryption_pwsalt(filp, arg); case FS_IOC_GET_ENCRYPTION_POLICY_EX: return 
f2fs_ioc_get_encryption_policy_ex(filp, arg); @@ -3437,6 +4159,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_remove_encryption_key_all_users(filp, arg); case FS_IOC_GET_ENCRYPTION_KEY_STATUS: return f2fs_ioc_get_encryption_key_status(filp, arg); + case FS_IOC_GET_ENCRYPTION_NONCE: + return f2fs_ioc_get_encryption_nonce(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); case F2FS_IOC_GARBAGE_COLLECT_RANGE: @@ -3451,10 +4175,6 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_flush_device(filp, arg); case F2FS_IOC_GET_FEATURES: return f2fs_ioc_get_features(filp, arg); - case F2FS_IOC_FSGETXATTR: - return f2fs_ioc_fsgetxattr(filp, arg); - case F2FS_IOC_FSSETXATTR: - return f2fs_ioc_fssetxattr(filp, arg); case F2FS_IOC_GET_PIN_FILE: return f2fs_ioc_get_pin_file(filp, arg); case F2FS_IOC_SET_PIN_FILE: @@ -3467,172 +4187,666 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_enable_verity(filp, arg); case FS_IOC_MEASURE_VERITY: return f2fs_ioc_measure_verity(filp, arg); - case F2FS_IOC_GET_VOLUME_NAME: - return f2fs_get_volume_name(filp, arg); - case F2FS_IOC_SET_VOLUME_NAME: - return f2fs_set_volume_name(filp, arg); + case FS_IOC_READ_VERITY_METADATA: + return f2fs_ioc_read_verity_metadata(filp, arg); + case FS_IOC_GETFSLABEL: + return f2fs_ioc_getfslabel(filp, arg); + case FS_IOC_SETFSLABEL: + return f2fs_ioc_setfslabel(filp, arg); + case F2FS_IOC_GET_COMPRESS_BLOCKS: + return f2fs_get_compress_blocks(filp, arg); + case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: + return f2fs_release_compress_blocks(filp, arg); + case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: + return f2fs_reserve_compress_blocks(filp, arg); + case F2FS_IOC_SEC_TRIM_FILE: + return f2fs_sec_trim_file(filp, arg); + case F2FS_IOC_GET_COMPRESS_OPTION: + return f2fs_ioc_get_compress_option(filp, arg); + case F2FS_IOC_SET_COMPRESS_OPTION: + return 
f2fs_ioc_set_compress_option(filp, arg); + case F2FS_IOC_DECOMPRESS_FILE: + return f2fs_ioc_decompress_file(filp, arg); + case F2FS_IOC_COMPRESS_FILE: + return f2fs_ioc_compress_file(filp, arg); default: return -ENOTTY; } } -static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp)))) + return -ENOSPC; + + return __f2fs_ioctl(filp, cmd, arg); +} + +/* + * Return %true if the given read or write request should use direct I/O, or + * %false if it should use buffered I/O. + */ +static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb, + struct iov_iter *iter) +{ + unsigned int align; + + if (!(iocb->ki_flags & IOCB_DIRECT)) + return false; + + if (f2fs_force_buffered_io(inode, iov_iter_rw(iter))) + return false; + + /* + * Direct I/O not aligned to the disk's logical_block_size will be + * attempted, but will fail with -EINVAL. + * + * f2fs additionally requires that direct I/O be aligned to the + * filesystem block size, which is often a stricter requirement. + * However, f2fs traditionally falls back to buffered I/O on requests + * that are logical_block_size-aligned but not fs-block aligned. + * + * The below logic implements this behavior. 
+ */ + align = iocb->ki_pos | iov_iter_alignment(iter); + if (!IS_ALIGNED(align, i_blocksize(inode)) && + IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev))) + return false; + + return true; +} + +static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_READ); + if (error) + return error; + f2fs_update_iostat(sbi, NULL, APP_DIRECT_READ_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_read_ops = { + .end_io = f2fs_dio_read_end_io, +}; + +static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(to); + struct iomap_dio *dio; + ssize_t ret; + + if (count == 0) + return 0; /* skip atime update */ + + trace_f2fs_direct_IO_enter(inode, iocb, count, READ); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { + ret = -EAGAIN; + goto out; + } + } else { + f2fs_down_read(&fi->i_gc_rwsem[READ]); + } + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_READ counter will be decremented correctly in all cases. 
+ */ + inc_page_count(sbi, F2FS_DIO_READ); + dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops, + &f2fs_iomap_dio_read_ops, 0, NULL, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_READ); + } else { + ret = iomap_dio_complete(dio); + } + + f2fs_up_read(&fi->i_gc_rwsem[READ]); + + file_accessed(file); +out: + trace_f2fs_direct_IO_exit(inode, pos, count, READ, ret); + return ret; +} + +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + const loff_t pos = iocb->ki_pos; + ssize_t ret; if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - return generic_file_read_iter(iocb, iter); + if (trace_f2fs_dataread_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_read_trace; + + path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_read_trace; + } + + trace_f2fs_dataread_start(inode, pos, iov_iter_count(to), + current->pid, path, current->comm); + kfree(p); + } +skip_read_trace: + if (f2fs_should_use_dio(inode, iocb, to)) { + ret = f2fs_dio_read_iter(iocb, to); + } else { + ret = filemap_read(iocb, to, 0); + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_READ_IO, ret); + } + if (trace_f2fs_dataread_end_enabled()) + trace_f2fs_dataread_end(inode, pos, ret); + return ret; } -static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - ssize_t ret; + ssize_t count; + int err; - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { - ret = -EIO; - goto out; + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return -EPERM; + + count = 
generic_write_checks(iocb, from); + if (count <= 0) + return count; + + err = file_modified(file); + if (err) + return err; + return count; +} + +/* + * Preallocate blocks for a write request, if it is possible and helpful to do + * so. Returns a positive number if blocks may have been preallocated, 0 if no + * blocks were preallocated, or a negative errno value if something went + * seriously wrong. Also sets FI_PREALLOCATED_ALL on the inode if *all* the + * requested blocks (not just some of them) have been allocated. + */ +static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, + bool dio) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(iter); + struct f2fs_map_blocks map = {}; + int flag; + int ret; + + /* If it will be an out-of-place direct write, don't bother. */ + if (dio && f2fs_lfs_mode(sbi)) + return 0; + /* + * Don't preallocate holes aligned to DIO_SKIP_HOLES which turns into + * buffered IO, if DIO meets any holes. + */ + if (dio && i_size_read(inode) && + (F2FS_BYTES_TO_BLK(pos) < F2FS_BLK_ALIGN(i_size_read(inode)))) + return 0; + + /* No-wait I/O can't allocate blocks. */ + if (iocb->ki_flags & IOCB_NOWAIT) + return 0; + + /* If it will be a short write, don't bother. */ + if (fault_in_iov_iter_readable(iter, count)) + return 0; + + if (f2fs_has_inline_data(inode)) { + /* If the data will fit inline, don't bother. */ + if (pos + count <= MAX_INLINE_DATA(inode)) + return 0; + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; } - if (!f2fs_is_compress_backend_ready(inode)) + /* Do not preallocate blocks that will be written partially in 4KB. 
*/ + map.m_lblk = F2FS_BLK_ALIGN(pos); + map.m_len = F2FS_BYTES_TO_BLK(pos + count); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; + map.m_may_create = true; + if (dio) { + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + flag = F2FS_GET_BLOCK_PRE_DIO; + } else { + map.m_seg_type = NO_CHECK_TYPE; + flag = F2FS_GET_BLOCK_PRE_AIO; + } + + ret = f2fs_map_blocks(inode, &map, 1, flag); + /* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. */ + if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0)) + return ret; + if (ret == 0) + set_inode_flag(inode, FI_PREALLOCATED_ALL); + return map.m_len; +} + +static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; + current->backing_dev_info = inode_to_bdi(inode); + ret = generic_perform_write(iocb, from); + current->backing_dev_info = NULL; + + if (ret > 0) { + iocb->ki_pos += ret; + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_IO, ret); + } + return ret; +} + +static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_WRITE); + if (error) + return error; + f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { + .end_io = f2fs_dio_write_end_io, +}; + +static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, + bool *may_need_sync) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const bool do_opu = f2fs_lfs_mode(sbi); + const loff_t pos = iocb->ki_pos; + const ssize_t count = 
iov_iter_count(from); + unsigned int dio_flags; + struct iomap_dio *dio; + ssize_t ret; + + trace_f2fs_direct_IO_enter(inode, iocb, count, WRITE); + if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) { + /* f2fs_convert_inline_inode() and block allocation can block */ + if (f2fs_has_inline_data(inode) || + !f2fs_overwrite_io(inode, pos, count)) { + ret = -EAGAIN; + goto out; + } + + if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[WRITE])) { + ret = -EAGAIN; + goto out; + } + if (do_opu && !f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); ret = -EAGAIN; goto out; } } else { - inode_lock(inode); + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + f2fs_down_read(&fi->i_gc_rwsem[WRITE]); + if (do_opu) + f2fs_down_read(&fi->i_gc_rwsem[READ]); } - ret = generic_write_checks(iocb, from); - if (ret > 0) { - bool preallocated = false; - size_t target_size = 0; - int err; - - if (iov_iter_fault_in_readable(from, iov_iter_count(from))) - set_inode_flag(inode, FI_NO_PREALLOC); - - if ((iocb->ki_flags & IOCB_NOWAIT)) { - if (!f2fs_overwrite_io(inode, iocb->ki_pos, - iov_iter_count(from)) || - f2fs_has_inline_data(inode) || - f2fs_force_buffered_io(inode, iocb, from)) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = -EAGAIN; + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_WRITE counter will be decremented correctly in all cases. 
+ */ + inc_page_count(sbi, F2FS_DIO_WRITE); + dio_flags = 0; + if (pos + count > inode->i_size) + dio_flags |= IOMAP_DIO_FORCE_WAIT; + dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops, + &f2fs_iomap_dio_write_ops, dio_flags, NULL, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_WRITE); + } else { + ret = iomap_dio_complete(dio); + } + + if (do_opu) + f2fs_up_read(&fi->i_gc_rwsem[READ]); + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); + + if (ret < 0) + goto out; + if (pos + ret > inode->i_size) + f2fs_i_size_write(inode, pos + ret); + if (!do_opu) + set_inode_flag(inode, FI_UPDATE_WRITE); + + if (iov_iter_count(from)) { + ssize_t ret2; + loff_t bufio_start_pos = iocb->ki_pos; + + /* + * The direct write was partial, so we need to fall back to a + * buffered write for the remainder. + */ + + ret2 = f2fs_buffered_write_iter(iocb, from); + if (iov_iter_count(from)) + f2fs_write_failed(inode, iocb->ki_pos); + if (ret2 < 0) + goto out; + + /* + * Ensure that the pagecache pages are written to disk and + * invalidated to preserve the expected O_DIRECT semantics. + */ + if (ret2 > 0) { + loff_t bufio_end_pos = bufio_start_pos + ret2 - 1; + + ret += ret2; + + ret2 = filemap_write_and_wait_range(file->f_mapping, + bufio_start_pos, + bufio_end_pos); + if (ret2 < 0) goto out; - } - goto write; + invalidate_mapping_pages(file->f_mapping, + bufio_start_pos >> PAGE_SHIFT, + bufio_end_pos >> PAGE_SHIFT); } + } else { + /* iomap_dio_rw() already handled the generic_write_sync(). 
*/ + *may_need_sync = false; + } +out: + trace_f2fs_direct_IO_exit(inode, pos, count, WRITE, ret); + return ret; +} - if (is_inode_flag_set(inode, FI_NO_PREALLOC)) - goto write; +static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + const loff_t orig_pos = iocb->ki_pos; + const size_t orig_count = iov_iter_count(from); + loff_t target_size; + bool dio; + bool may_need_sync = true; + int preallocated; + ssize_t ret; - if (iocb->ki_flags & IOCB_DIRECT) { - /* - * Convert inline data for Direct I/O before entering - * f2fs_direct_IO(). - */ - err = f2fs_convert_inline_inode(inode); - if (err) - goto out_err; - /* - * If force_buffere_io() is true, we have to allocate - * blocks all the time, since f2fs_direct_IO will fall - * back to buffered IO. - */ - if (!f2fs_force_buffered_io(inode, iocb, from) && - allow_outplace_dio(inode, iocb, from)) - goto write; - } - preallocated = true; - target_size = iocb->ki_pos + iov_iter_count(from); + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + ret = -EIO; + goto out; + } - err = f2fs_preallocate_blocks(iocb, from); - if (err) { -out_err: - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = err; + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) { + ret = -EAGAIN; goto out; } -write: - ret = __generic_file_write_iter(iocb, from); - clear_inode_flag(inode, FI_NO_PREALLOC); + } else { + inode_lock(inode); + } + + ret = f2fs_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; - /* if we couldn't write data, we should deallocate blocks. */ - if (preallocated && i_size_read(inode) < target_size) - f2fs_truncate(inode); + /* Determine whether we will do a direct write or a buffered write. 
*/ + dio = f2fs_should_use_dio(inode, iocb, from); - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); + /* Possibly preallocate the blocks for the write. */ + target_size = iocb->ki_pos + iov_iter_count(from); + preallocated = f2fs_preallocate_blocks(iocb, from, dio); + if (preallocated < 0) { + ret = preallocated; + } else { + if (trace_f2fs_datawrite_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), + PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_write_trace; + path = dentry_path_raw(file_dentry(iocb->ki_filp), + p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_write_trace; + } + trace_f2fs_datawrite_start(inode, orig_pos, orig_count, + current->pid, path, current->comm); + kfree(p); + } +skip_write_trace: + /* Do the actual write. */ + ret = dio ? + f2fs_dio_write_iter(iocb, from, &may_need_sync) : + f2fs_buffered_write_iter(iocb, from); + + if (trace_f2fs_datawrite_end_enabled()) + trace_f2fs_datawrite_end(inode, orig_pos, ret); } + + /* Don't leave any preallocated blocks around past i_size. 
*/ + if (preallocated && i_size_read(inode) < target_size) { + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + if (!f2fs_truncate(inode)) + file_dont_truncate(inode); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } else { + file_dont_truncate(inode); + } + + clear_inode_flag(inode, FI_PREALLOCATED_ALL); +out_unlock: inode_unlock(inode); out: - trace_f2fs_file_write_iter(inode, iocb->ki_pos, - iov_iter_count(from), ret); - if (ret > 0) + trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret); + if (ret > 0 && may_need_sync) ret = generic_write_sync(iocb, ret); return ret; } +static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, + int advice) +{ + struct address_space *mapping; + struct backing_dev_info *bdi; + struct inode *inode = file_inode(filp); + int err; + + if (advice == POSIX_FADV_SEQUENTIAL) { + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; + + mapping = filp->f_mapping; + if (!mapping || len < 0) + return -EINVAL; + + bdi = inode_to_bdi(mapping->host); + filp->f_ra.ra_pages = bdi->ra_pages * + F2FS_I_SB(inode)->seq_file_ra_mul; + spin_lock(&filp->f_lock); + filp->f_mode &= ~FMODE_RANDOM; + spin_unlock(&filp->f_lock); + return 0; + } + + err = generic_fadvise(filp, offset, len, advice); + if (!err && advice == POSIX_FADV_DONTNEED && + test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && + f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + + return err; +} + #ifdef CONFIG_COMPAT +struct compat_f2fs_gc_range { + u32 sync; + compat_u64 start; + compat_u64 len; +}; +#define F2FS_IOC32_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11,\ + struct compat_f2fs_gc_range) + +static int f2fs_compat_ioc_gc_range(struct file *file, unsigned long arg) +{ + struct compat_f2fs_gc_range __user *urange; + struct f2fs_gc_range range; + int err; + + urange = compat_ptr(arg); + err = get_user(range.sync, 
&urange->sync); + err |= get_user(range.start, &urange->start); + err |= get_user(range.len, &urange->len); + if (err) + return -EFAULT; + + return __f2fs_ioc_gc_range(file, &range); +} + +struct compat_f2fs_move_range { + u32 dst_fd; + compat_u64 pos_in; + compat_u64 pos_out; + compat_u64 len; +}; +#define F2FS_IOC32_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct compat_f2fs_move_range) + +static int f2fs_compat_ioc_move_range(struct file *file, unsigned long arg) +{ + struct compat_f2fs_move_range __user *urange; + struct f2fs_move_range range; + int err; + + urange = compat_ptr(arg); + err = get_user(range.dst_fd, &urange->dst_fd); + err |= get_user(range.pos_in, &urange->pos_in); + err |= get_user(range.pos_out, &urange->pos_out); + err |= get_user(range.len, &urange->len); + if (err) + return -EFAULT; + + return __f2fs_ioc_move_range(file, &range); +} + long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(file)))) + return -ENOSPC; + switch (cmd) { - case F2FS_IOC32_GETFLAGS: - cmd = F2FS_IOC_GETFLAGS; - break; - case F2FS_IOC32_SETFLAGS: - cmd = F2FS_IOC_SETFLAGS; - break; - case F2FS_IOC32_GETVERSION: - cmd = F2FS_IOC_GETVERSION; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; break; + case F2FS_IOC32_GARBAGE_COLLECT_RANGE: + return f2fs_compat_ioc_gc_range(file, arg); + case F2FS_IOC32_MOVE_RANGE: + return f2fs_compat_ioc_move_range(file, arg); case F2FS_IOC_START_ATOMIC_WRITE: case F2FS_IOC_COMMIT_ATOMIC_WRITE: case F2FS_IOC_START_VOLATILE_WRITE: case F2FS_IOC_RELEASE_VOLATILE_WRITE: - case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_ABORT_ATOMIC_WRITE: case F2FS_IOC_SHUTDOWN: case FITRIM: - case F2FS_IOC_SET_ENCRYPTION_POLICY: - case F2FS_IOC_GET_ENCRYPTION_PWSALT: - case F2FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_PWSALT: + case 
FS_IOC_GET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_POLICY_EX: case FS_IOC_ADD_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + case FS_IOC_GET_ENCRYPTION_NONCE: case F2FS_IOC_GARBAGE_COLLECT: - case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - case F2FS_IOC_MOVE_RANGE: case F2FS_IOC_FLUSH_DEVICE: case F2FS_IOC_GET_FEATURES: - case F2FS_IOC_FSGETXATTR: - case F2FS_IOC_FSSETXATTR: case F2FS_IOC_GET_PIN_FILE: case F2FS_IOC_SET_PIN_FILE: case F2FS_IOC_PRECACHE_EXTENTS: case F2FS_IOC_RESIZE_FS: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: - case F2FS_IOC_GET_VOLUME_NAME: - case F2FS_IOC_SET_VOLUME_NAME: + case FS_IOC_READ_VERITY_METADATA: + case FS_IOC_GETFSLABEL: + case FS_IOC_SETFSLABEL: + case F2FS_IOC_GET_COMPRESS_BLOCKS: + case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: + case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: + case F2FS_IOC_SEC_TRIM_FILE: + case F2FS_IOC_GET_COMPRESS_OPTION: + case F2FS_IOC_SET_COMPRESS_OPTION: + case F2FS_IOC_DECOMPRESS_FILE: + case F2FS_IOC_COMPRESS_FILE: break; default: return -ENOIOCTLCMD; } - return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); + return __f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif @@ -3652,4 +4866,5 @@ const struct file_operations f2fs_file_operations = { #endif .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .fadvise = f2fs_file_fadvise, }; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index db8725d473b5..4546e01b2ee0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -7,35 +7,54 @@ */ #include <linux/fs.h> #include <linux/module.h> -#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/f2fs_fs.h> #include <linux/kthread.h> #include <linux/delay.h> #include <linux/freezer.h> +#include <linux/sched/signal.h> +#include <linux/random.h> +#include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" #include 
"segment.h" #include "gc.h" +#include "iostat.h" #include <trace/events/f2fs.h> +static struct kmem_cache *victim_entry_slab; + +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len); + static int gc_thread_func(void *data) { struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; + wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .should_migrate_blocks = false, + .err_gc_skipped = false }; wait_ms = gc_th->min_sleep_time; set_freezable(); do { + bool sync_mode, foreground = false; + wait_event_interruptible_timeout(*wq, kthread_should_stop() || freezing(current) || + waitqueue_active(fggc_wq) || gc_th->gc_wake, msecs_to_jiffies(wait_ms)); + if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) + foreground = true; + /* give it a try one time */ if (gc_th->gc_wake) gc_th->gc_wake = 0; @@ -55,7 +74,8 @@ static int gc_thread_func(void *data) if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FAULT_INJECT); } if (!sb_start_write_trylock(sbi->sb)) { @@ -76,20 +96,34 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
*/ - if (sbi->gc_mode == GC_URGENT) { + if (sbi->gc_mode == GC_URGENT_HIGH) { + spin_lock(&sbi->gc_urgent_high_lock); + if (sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_remaining--; + if (!sbi->gc_urgent_high_remaining) + sbi->gc_mode = GC_NORMAL; + } + spin_unlock(&sbi->gc_urgent_high_lock); + } + + if (sbi->gc_mode == GC_URGENT_HIGH || + sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); goto do_gc; } - if (!down_write_trylock(&sbi->gc_lock)) { + if (foreground) { + f2fs_down_write(&sbi->gc_lock); + goto do_gc; + } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); stat_io_skip_bggc_count(sbi); goto next; } @@ -99,17 +133,34 @@ static int gc_thread_func(void *data) else increase_sleep_time(gc_th, &wait_ms); do_gc: - stat_inc_bggc_count(sbi->stat_info); + if (!foreground) + stat_inc_bggc_count(sbi->stat_info); + + sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; + + /* foreground GC was been triggered via f2fs_balance_fs() */ + if (foreground) + sync_mode = false; + + gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; + gc_control.no_bg_gc = foreground; + gc_control.nr_free_secs = foreground ? 
1 : 0; /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) - wait_ms = gc_th->no_gc_sleep_time; + if (f2fs_gc(sbi, &gc_control)) { + /* don't bother wait_ms by foreground gc */ + if (!foreground) + wait_ms = gc_th->no_gc_sleep_time; + } + + if (foreground) + wake_up_all(&gc_th->fggc_wq); trace_f2fs_background_gc(sbi->sb, wait_ms, prefree_segments(sbi), free_segments(sbi)); /* balancing f2fs's metadata periodically */ - f2fs_balance_fs_bg(sbi); + f2fs_balance_fs_bg(sbi, true); next: sb_end_write(sbi->sb); @@ -134,15 +185,16 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; - gc_th->gc_wake= 0; + gc_th->gc_wake = 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + init_waitqueue_head(&sbi->gc_thread->fggc_wq); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { err = PTR_ERR(gc_th->f2fs_gc_task); - kvfree(gc_th); + kfree(gc_th); sbi->gc_thread = NULL; } out: @@ -152,26 +204,41 @@ out: void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); - kvfree(gc_th); + wake_up_all(&gc_th->fggc_wq); + kfree(gc_th); sbi->gc_thread = NULL; } static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) { - int gc_mode = (gc_type == BG_GC) ? 
GC_CB : GC_GREEDY; + int gc_mode; + + if (gc_type == BG_GC) { + if (sbi->am.atgc_enabled) + gc_mode = GC_AT; + else + gc_mode = GC_CB; + } else { + gc_mode = GC_GREEDY; + } switch (sbi->gc_mode) { case GC_IDLE_CB: gc_mode = GC_CB; break; case GC_IDLE_GREEDY: - case GC_URGENT: + case GC_URGENT_HIGH: gc_mode = GC_GREEDY; break; + case GC_IDLE_AT: + gc_mode = GC_AT; + break; } + return gc_mode; } @@ -182,24 +249,41 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (p->alloc_mode == SSR) { p->gc_mode = GC_GREEDY; - p->dirty_segmap = dirty_i->dirty_segmap[type]; + p->dirty_bitmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; + p->ofs_unit = 1; + } else if (p->alloc_mode == AT_SSR) { + p->gc_mode = GC_GREEDY; + p->dirty_bitmap = dirty_i->dirty_segmap[type]; p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; } else { p->gc_mode = select_gc_type(sbi, gc_type); - p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; - p->max_search = dirty_i->nr_dirty[DIRTY]; p->ofs_unit = sbi->segs_per_sec; + if (__is_large_section(sbi)) { + p->dirty_bitmap = dirty_i->dirty_secmap; + p->max_search = count_bits(p->dirty_bitmap, + 0, MAIN_SECS(sbi)); + } else { + p->dirty_bitmap = dirty_i->dirty_segmap[DIRTY]; + p->max_search = dirty_i->nr_dirty[DIRTY]; + } } - /* we need to check every dirty segments in the FG_GC case */ + /* + * adjust candidates range, should select all dirty segments for + * foreground GC and urgent GC cases. 
+ */ if (gc_type != FG_GC && - (sbi->gc_mode != GC_URGENT) && + (sbi->gc_mode != GC_URGENT_HIGH) && + (p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; /* let's select beginning hot/small space first in no_heap mode*/ - if (test_opt(sbi, NOHEAP) && + if (f2fs_need_rand_seg(sbi)) + p->offset = prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; else @@ -212,10 +296,16 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) return sbi->blocks_per_seg; + else if (p->alloc_mode == AT_SSR) + return UINT_MAX; + + /* LFS */ if (p->gc_mode == GC_GREEDY) return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; + else if (p->gc_mode == GC_AT) + return UINT_MAX; else /* No other gc_mode */ return 0; } @@ -249,13 +339,14 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) unsigned char age = 0; unsigned char u; unsigned int i; + unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi, segno); - for (i = 0; i < sbi->segs_per_sec; i++) + for (i = 0; i < usable_segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; vblocks = get_valid_blocks(sbi, segno, true); - mtime = div_u64(mtime, sbi->segs_per_sec); - vblocks = div_u64(vblocks, sbi->segs_per_sec); + mtime = div_u64(mtime, usable_segs_per_sec); + vblocks = div_u64(vblocks, usable_segs_per_sec); u = (vblocks * 100) >> sbi->log_blocks_per_seg; @@ -280,8 +371,11 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) return get_valid_blocks(sbi, segno, true); - else + else if (p->gc_mode == GC_CB) return get_cb_cost(sbi, segno); + + f2fs_bug_on(sbi, 1); + return 0; } static unsigned int count_bits(const unsigned long *addr, @@ -296,6 
+390,318 @@ static unsigned int count_bits(const unsigned long *addr, return sum; } +static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno, + struct rb_node *parent, struct rb_node **p, + bool left_most) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve; + + ve = f2fs_kmem_cache_alloc(victim_entry_slab, + GFP_NOFS, true, NULL); + + ve->mtime = mtime; + ve->segno = segno; + + rb_link_node(&ve->rb_node, parent, p); + rb_insert_color_cached(&ve->rb_node, &am->root, left_most); + + list_add_tail(&ve->list, &am->victim_list); + + am->victim_count++; + + return ve; +} + +static void insert_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno) +{ + struct atgc_management *am = &sbi->am; + struct rb_node **p; + struct rb_node *parent = NULL; + bool left_most = true; + + p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, mtime, &left_most); + attach_victim_entry(sbi, mtime, segno, parent, p, left_most); +} + +static void add_victim_entry(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); + unsigned long long mtime = 0; + unsigned int i; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p->gc_mode == GC_AT && + get_valid_blocks(sbi, segno, true) == 0) + return; + } + + for (i = 0; i < sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, start + i)->mtime; + mtime = div_u64(mtime, sbi->segs_per_sec); + + /* Handle if the system time has changed by the user */ + if (mtime < sit_i->min_mtime) + sit_i->min_mtime = mtime; + if (mtime > sit_i->max_mtime) + sit_i->max_mtime = mtime; + if (mtime < sit_i->dirty_min_mtime) + sit_i->dirty_min_mtime = mtime; + if (mtime > sit_i->dirty_max_mtime) + sit_i->dirty_max_mtime = mtime; + + /* don't choose young section as candidate */ + 
if (sit_i->dirty_max_mtime - mtime < p->age_threshold) + return; + + insert_victim_entry(sbi, mtime, segno); +} + +static struct rb_node *lookup_central_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct atgc_management *am = &sbi->am; + struct rb_node *parent = NULL; + bool left_most; + + f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, p->age, &left_most); + + return parent; +} + +static void atgc_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = &sbi->am; + struct rb_root_cached *root = &am->root; + struct rb_node *node; + struct rb_entry *re; + struct victim_entry *ve; + unsigned long long total_time; + unsigned long long age, u, accu; + unsigned long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int sec_blocks = CAP_BLKS_PER_SEC(sbi); + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int age_weight = am->age_weight; + unsigned int cost; + unsigned int iter = 0; + + if (max_mtime < min_mtime) + return; + + max_mtime += 1; + total_time = max_mtime - min_mtime; + + accu = div64_u64(ULLONG_MAX, total_time); + accu = min_t(unsigned long long, div_u64(accu, 100), + DEFAULT_ACCURACY_CLASS); + + node = rb_first_cached(root); +next: + re = rb_entry_safe(node, struct rb_entry, rb_node); + if (!re) + return; + + ve = (struct victim_entry *)re; + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip; + + /* age = 10000 * x% * 60 */ + age = div64_u64(accu * (max_mtime - ve->mtime), total_time) * + age_weight; + + vblocks = get_valid_blocks(sbi, ve->segno, true); + f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks); + + /* u = 10000 * x% * 40 */ + u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) * + (100 - age_weight); + + f2fs_bug_on(sbi, age + u >= UINT_MAX); + + cost = 
UINT_MAX - (age + u); + iter++; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip: + if (iter < dirty_threshold) { + node = rb_next(node); + goto next; + } +} + +/* + * select candidates around source section in range of + * [target - dirty_threshold, target + dirty_threshold] + */ +static void atssr_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = &sbi->am; + struct rb_node *node; + struct rb_entry *re; + struct victim_entry *ve; + unsigned long long age; + unsigned long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int seg_blocks = sbi->blocks_per_seg; + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int cost; + unsigned int iter = 0; + int stage = 0; + + if (max_mtime < min_mtime) + return; + max_mtime += 1; +next_stage: + node = lookup_central_victim(sbi, p); +next_node: + re = rb_entry_safe(node, struct rb_entry, rb_node); + if (!re) { + if (stage == 0) + goto skip_stage; + return; + } + + ve = (struct victim_entry *)re; + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip_node; + + age = max_mtime - ve->mtime; + + vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks; + f2fs_bug_on(sbi, !vblocks); + + /* rare case */ + if (vblocks == seg_blocks) + goto skip_node; + + iter++; + + age = max_mtime - abs(p->age - age); + cost = UINT_MAX - vblocks; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip_node: + if (iter < dirty_threshold) { + if (stage == 0) + node = rb_prev(node); + else if (stage == 1) + node = rb_next(node); + goto next_node; + } +skip_stage: + if (stage < 
1) { + stage++; + iter = 0; + goto next_stage; + } +} +static void lookup_victim_by_age(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &sbi->am.root, true)); + + if (p->gc_mode == GC_AT) + atgc_lookup_victim(sbi, p); + else if (p->alloc_mode == AT_SSR) + atssr_lookup_victim(sbi, p); + else + f2fs_bug_on(sbi, 1); +} + +static void release_victim_entry(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve, *tmp; + + list_for_each_entry_safe(ve, tmp, &am->victim_list, list) { + list_del(&ve->list); + kmem_cache_free(victim_entry_slab, ve); + am->victim_count--; + } + + am->root = RB_ROOT_CACHED; + + f2fs_bug_on(sbi, am->victim_count); + f2fs_bug_on(sbi, !list_empty(&am->victim_list)); +} + +static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!dirty_i->enable_pin_section) + return false; + if (!test_and_set_bit(secno, dirty_i->pinned_secmap)) + dirty_i->pinned_secmap_cnt++; + return true; +} + +static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i) +{ + return dirty_i->pinned_secmap_cnt; +} + +static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i, + unsigned int secno) +{ + return dirty_i->enable_pin_section && + f2fs_pinned_section_exists(dirty_i) && + test_bit(secno, dirty_i->pinned_secmap); +} + +static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable) +{ + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + if (f2fs_pinned_section_exists(DIRTY_I(sbi))) { + memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size); + DIRTY_I(sbi)->pinned_secmap_cnt = 0; + } + DIRTY_I(sbi)->enable_pin_section = enable; +} + +static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, + unsigned int segno) +{ + if (!f2fs_is_pinned_file(inode)) + return 0; + if (gc_type 
!= FG_GC) + return -EBUSY; + if (!f2fs_pin_section(F2FS_I_SB(inode), segno)) + f2fs_pin_file_control(inode, true); + return -EAGAIN; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. @@ -305,31 +711,51 @@ static unsigned int count_bits(const unsigned long *addr, * which has minimum valid blocks and removes it from dirty seglist. */ static int get_victim_by_default(struct f2fs_sb_info *sbi, - unsigned int *result, int gc_type, int type, char alloc_mode) + unsigned int *result, int gc_type, int type, + char alloc_mode, unsigned long long age) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment; - unsigned int nsearched = 0; + unsigned int nsearched; + bool is_atgc; + int ret = 0; mutex_lock(&dirty_i->seglist_lock); last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec; p.alloc_mode = alloc_mode; - select_policy(sbi, gc_type, type, &p); + p.age = age; + p.age_threshold = sbi->am.age_threshold; +retry: + select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; + p.oldest_age = 0; p.min_cost = get_max_cost(sbi, &p); + is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR); + nsearched = 0; + + if (is_atgc) + SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX; + if (*result != NULL_SEGNO) { - if (get_valid_blocks(sbi, *result, false) && - !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + if (!get_valid_blocks(sbi, *result, false)) { + ret = -ENODATA; + goto out; + } + + if (sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + ret = -EBUSY; + else p.min_segno = *result; goto out; } + ret = -ENODATA; if (p.max_search == 0) goto out; @@ -357,10 +783,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } while (1) { - unsigned long cost; - unsigned int segno; - - segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); + unsigned long cost, *dirty_bitmap; + 
unsigned int unit_no, segno; + + dirty_bitmap = p.dirty_bitmap; + unit_no = find_next_bit(dirty_bitmap, + last_segment / p.ofs_unit, + p.offset / p.ofs_unit); + segno = unit_no * p.ofs_unit; if (segno >= last_segment) { if (sm->last_victim[p.gc_mode]) { last_segment = @@ -373,14 +803,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } p.offset = segno + p.ofs_unit; - if (p.ofs_unit > 1) { - p.offset -= segno % p.ofs_unit; - nsearched += count_bits(p.dirty_segmap, - p.offset - p.ofs_unit, - p.ofs_unit); - } else { - nsearched++; - } + nsearched++; #ifdef CONFIG_F2FS_CHECK_FS /* @@ -396,14 +819,38 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (sec_usage_check(sbi, secno)) goto next; + /* Don't touch checkpointed data */ - if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && - get_ckpt_valid_blocks(sbi, segno) && - p.alloc_mode != SSR)) - goto next; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p.alloc_mode == LFS) { + /* + * LFS is set to find source section during GC. + * The victim should have no checkpointed data. + */ + if (get_ckpt_valid_blocks(sbi, segno, true)) + goto next; + } else { + /* + * SSR | AT_SSR are set to find target segment + * for writes which can be full by checkpointed + * and newly written blocks. 
+ */ + if (!f2fs_segment_has_free_slot(sbi, segno)) + goto next; + } + } + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno)) + goto next; + + if (is_atgc) { + add_victim_entry(sbi, &p, segno); + goto next; + } + cost = get_gc_cost(sbi, segno, &p); if (p.min_cost > cost) { @@ -413,14 +860,28 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, next: if (nsearched >= p.max_search) { if (!sm->last_victim[p.gc_mode] && segno <= last_victim) - sm->last_victim[p.gc_mode] = last_victim + 1; + sm->last_victim[p.gc_mode] = + last_victim + p.ofs_unit; else - sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] = segno + p.ofs_unit; sm->last_victim[p.gc_mode] %= (MAIN_SECS(sbi) * sbi->segs_per_sec); break; } } + + /* get victim for GC_AT/AT_SSR */ + if (is_atgc) { + lookup_victim_by_age(sbi, &p); + release_victim_entry(sbi); + } + + if (is_atgc && p.min_segno == NULL_SEGNO && + sm->elapsed_time < p.age_threshold) { + p.age_threshold = 0; + goto retry; + } + if (p.min_segno != NULL_SEGNO) { got_it: *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; @@ -432,6 +893,7 @@ got_result: else set_bit(secno, dirty_i->victim_secmap); } + ret = 0; } out: @@ -441,7 +903,7 @@ out: prefree_segments(sbi), free_segments(sbi)); mutex_unlock(&dirty_i->seglist_lock); - return (p.min_segno == NULL_SEGNO) ? 
0 : 1; + return ret; } static const struct victim_selection default_v_ops = { @@ -466,7 +928,8 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) iput(inode); return; } - new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS); + new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, + GFP_NOFS, true, NULL); new_ie->inode = inode; f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); @@ -476,6 +939,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) static void put_gc_inode(struct gc_inode_list *gc_list) { struct inode_entry *ie, *next_ie; + list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) { radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); iput(ie->inode); @@ -512,6 +976,7 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, int phase = 0; bool fggc = (gc_type == FG_GC); int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); start_addr = START_BLOCK(sbi, segno); @@ -521,7 +986,7 @@ next_step: if (fggc && phase == 2) atomic_inc(&sbi->wb_sync_req[NODE]); - for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + for (off = 0; off < usable_blks_in_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; struct node_info ni; @@ -556,7 +1021,7 @@ next_step: continue; } - if (f2fs_get_node_info(sbi, nid, &ni)) { + if (f2fs_get_node_info(sbi, nid, &ni, false)) { f2fs_put_page(node_page, 1); continue; } @@ -599,9 +1064,11 @@ block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) bidx = node_ofs - 1; } else if (node_ofs <= indirect_blks) { int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 2 - dec; } else { int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 5 - dec; } return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode); @@ -612,7 +1079,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page 
*node_page; nid_t nid; - unsigned int ofs_in_node; + unsigned int ofs_in_node, max_addrs; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -622,7 +1089,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (IS_ERR(node_page)) return false; - if (f2fs_get_node_info(sbi, nid, dni)) { + if (f2fs_get_node_info(sbi, nid, dni, false)) { f2fs_put_page(node_page, 1); return false; } @@ -633,8 +1100,21 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, set_sbi_flag(sbi, SBI_NEED_FSCK); } + if (f2fs_check_nid_range(sbi, dni->ino)) { + f2fs_put_page(node_page, 1); + return false; + } + + max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE : + DEF_ADDRS_PER_BLOCK; + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", + ofs_in_node, dni->ino, dni->nid, max_addrs); + return false; + } + *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); + source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) { @@ -644,9 +1124,9 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (unlikely(check_valid_map(sbi, segno, offset))) { if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { - f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u\n", - blkaddr, source_blkaddr, segno); - f2fs_bug_on(sbi, 1); + f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u", + blkaddr, source_blkaddr, segno); + set_sbi_flag(sbi, SBI_NEED_FSCK); } } #endif @@ -684,6 +1164,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } goto got_it; @@ -702,6 +1183,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if 
(unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } got_it: @@ -730,6 +1212,10 @@ got_it: goto put_encrypted_page; f2fs_put_page(fio.encrypted_page, 0); f2fs_put_page(page, 1); + + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); + return 0; put_encrypted_page: f2fs_put_page(fio.encrypted_page, 1); @@ -762,7 +1248,10 @@ static int move_data_block(struct inode *inode, block_t bidx, struct page *page, *mpage; block_t newaddr; int err = 0; - bool lfs_mode = test_opt(fio.sbi, LFS); + bool lfs_mode = f2fs_lfs_mode(fio.sbi); + int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) && + (fio.sbi->gc_mode != GC_URGENT_HIGH) ? + CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; /* do not read out */ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); @@ -774,18 +1263,9 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } - - if (f2fs_is_pinned_file(inode)) { - f2fs_pin_file_control(inode, true); - err = -EAGAIN; - goto out; - } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); @@ -806,23 +1286,23 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto put_out; - set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); - /* read page */ fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; if (lfs_mode) - down_write(&fio.sbi->io_order_lock); + f2fs_down_write(&fio.sbi->io_order_lock); 
mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi), fio.old_blkaddr, false); - if (!mpage) + if (!mpage) { + err = -ENOMEM; goto up_out; + } fio.encrypted_page = mpage; @@ -833,6 +1313,12 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_put_page(mpage, 1); goto up_out; } + + f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO, + F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO, + F2FS_BLKSIZE); + lock_page(mpage); if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) || !PageUptodate(mpage))) { @@ -842,8 +1328,11 @@ static int move_data_block(struct inode *inode, block_t bidx, } } + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* allocate block address */ f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA, NULL, false); + &sum, type, NULL); fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -860,6 +1349,7 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_put_page(mpage, 1); invalidate_mapping_pages(META_MAPPING(fio.sbi), fio.old_blkaddr, fio.old_blkaddr); + f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr); set_page_dirty(fio.encrypted_page); if (clear_page_dirty_for_io(fio.encrypted_page)) @@ -868,9 +1358,6 @@ static int move_data_block(struct inode *inode, block_t bidx, set_page_writeback(fio.encrypted_page); ClearPageError(page); - /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; @@ -882,7 +1369,7 @@ static int move_data_block(struct inode *inode, block_t bidx, goto put_page_out; } - f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -893,10 +1380,10 @@ put_page_out: recover_block: if (err) 
f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, - true, true); + true, true, true); up_out: if (lfs_mode) - up_write(&fio.sbi->io_order_lock); + f2fs_up_write(&fio.sbi->io_order_lock); put_out: f2fs_put_dnode(&dn); out: @@ -919,18 +1406,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; - goto out; - } - if (f2fs_is_pinned_file(inode)) { - if (gc_type == FG_GC) - f2fs_pin_file_control(inode, true); - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } if (gc_type == BG_GC) { if (PageWriteback(page)) { @@ -938,7 +1416,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } set_page_dirty(page); - set_cold_data(page); + set_page_private_gcing(page); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -964,13 +1442,13 @@ retry: f2fs_remove_dirty_inode(inode); } - set_cold_data(page); + set_page_private_gcing(page); err = f2fs_do_write_data_page(&fio); if (err) { - clear_cold_data(page); + clear_page_private_gcing(page); if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_NOFS); goto retry; } if (is_dirty) @@ -990,7 +1468,8 @@ out: * the victim data block is ignored. 
*/ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, - struct gc_inode_list *gc_list, unsigned int segno, int gc_type) + struct gc_inode_list *gc_list, unsigned int segno, int gc_type, + bool force_migrate) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; @@ -998,13 +1477,14 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, int off; int phase = 0; int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); start_addr = START_BLOCK(sbi, segno); next_step: entry = sum; - for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + for (off = 0; off < usable_blks_in_seg; off++, entry++) { struct page *data_page; struct inode *inode; struct node_info dni; /* dnode info for the data */ @@ -1018,8 +1498,8 @@ next_step: * race condition along with SSR block allocation. */ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || - get_valid_blocks(sbi, segno, false) == - sbi->blocks_per_seg) + (!force_migrate && get_valid_blocks(sbi, segno, true) == + CAP_BLKS_PER_SEC(sbi))) return submitted; if (check_valid_map(sbi, segno, off) == 0) @@ -1048,13 +1528,20 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { + int err; + inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); + if (IS_ERR(inode) || is_bad_inode(inode) || + special_file(inode->i_mode)) continue; + + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err == -EAGAIN) { + iput(inode); + return submitted; } - if (!down_write_trylock( + if (!f2fs_down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); sbi->skipped_gc_rwsem++; @@ -1067,7 +1554,7 @@ next_step: if (f2fs_post_read_required(inode)) { int err = ra_data_block(inode, start_bidx); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) { iput(inode); continue; @@ -1078,7 +1565,7 @@ 
next_step: data_page = f2fs_get_read_data_page(inode, start_bidx, REQ_RAHEAD, true); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; @@ -1097,12 +1584,14 @@ next_step: int err; if (S_ISREG(inode->i_mode)) { - if (!down_write_trylock(&fi->i_gc_rwsem[READ])) + if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) { + sbi->skipped_gc_rwsem++; continue; - if (!down_write_trylock( + } + if (!f2fs_down_write_trylock( &fi->i_gc_rwsem[WRITE])) { sbi->skipped_gc_rwsem++; - up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[READ]); continue; } locked = true; @@ -1125,8 +1614,8 @@ next_step: submitted++; if (locked) { - up_write(&fi->i_gc_rwsem[WRITE]); - up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[READ]); } stat_inc_data_blk_count(sbi, 1, gc_type); @@ -1147,14 +1636,15 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, down_write(&sit_i->sentry_lock); ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, - NO_CHECK_TYPE, LFS); + NO_CHECK_TYPE, LFS, 0); up_write(&sit_i->sentry_lock); return ret; } static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int start_segno, - struct gc_inode_list *gc_list, int gc_type) + struct gc_inode_list *gc_list, int gc_type, + bool force_migrate) { struct page *sum_page; struct f2fs_summary_block *sum; @@ -1169,6 +1659,17 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, if (__is_large_section(sbi)) end_segno = rounddown(end_segno, sbi->segs_per_sec); + /* + * zone-capacity can be less than zone-size in zoned devices, + * resulting in less than expected usable segments in the zone, + * calculate the end segno in the zone which can be garbage collected + */ + if (f2fs_sb_has_blkzoned(sbi)) + end_segno -= sbi->segs_per_sec - + f2fs_usable_segs_in_sec(sbi, segno); + + sanity_check_seg_type(sbi, get_seg_entry(sbi, 
segno)->type); + /* readahead multi ssa blocks those have contiguous address */ if (__is_large_section(sbi)) f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), @@ -1203,7 +1704,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, if (get_valid_blocks(sbi, segno, false) == 0) goto freed; - if (__is_large_section(sbi) && + if (gc_type == BG_GC && __is_large_section(sbi) && migrated >= sbi->migration_granularity) goto skip; if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) @@ -1214,7 +1715,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_SUMMARY); goto skip; } @@ -1230,15 +1732,17 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); else submitted += gc_data_segment(sbi, sum->entries, gc_list, - segno, gc_type); + segno, gc_type, + force_migrate); stat_inc_seg_count(sbi, type, gc_type); + sbi->gc_reclaimed_segs[sbi->gc_mode]++; + migrated++; freed: if (gc_type == FG_GC && get_valid_blocks(sbi, segno, false) == 0) seg_freed++; - migrated++; if (__is_large_section(sbi) && segno + 1 < end_segno) sbi->next_victim_seg[gc_type] = segno + 1; @@ -1257,23 +1761,21 @@ skip: return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool background, unsigned int segno) +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { - int gc_type = sync ? 
FG_GC : BG_GC; + int gc_type = gc_control->init_gc_type; + unsigned int segno = gc_control->victim_segno; int sec_freed = 0, seg_freed = 0, total_freed = 0; int ret = 0; struct cp_control cpc; - unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; - unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; - trace_f2fs_gc_begin(sbi->sb, sync, background, + trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, + gc_control->nr_free_secs, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1284,7 +1786,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); sbi->skipped_gc_rwsem = 0; - first_skipped = last_skipped; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1301,8 +1802,7 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - if (prefree_segments(sbi) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + if (prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1312,54 +1812,69 @@ gc_more: } /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - if (gc_type == BG_GC && !background) { + if (gc_type == BG_GC && gc_control->no_bg_gc) { ret = -EINVAL; goto stop; } - if (!__get_victim(sbi, &segno, gc_type)) { - ret = -ENODATA; +retry: + ret = __get_victim(sbi, &segno, gc_type); + if (ret) { + /* allow to search victim from sections has pinned data */ + if (ret == -ENODATA && gc_type == FG_GC && + f2fs_pinned_section_exists(DIRTY_I(sbi))) { + f2fs_unpin_all_sections(sbi, false); + goto retry; + } goto stop; } - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); - if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) - sec_freed++; + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, + gc_control->should_migrate_blocks); total_freed += seg_freed; - if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped || - sbi->skipped_gc_rwsem) - skipped_round++; - last_skipped = sbi->skipped_atomic_files[FG_GC]; - round++; - } + if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) + sec_freed++; - if (gc_type == FG_GC && seg_freed) + if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (sync) + if (gc_control->init_gc_type == FG_GC || + !has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? 
sec_freed : 0, 0)) { + if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs) + goto go_gc_more; goto stop; + } - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round <= MAX_SKIP_GC_COUNT || - skipped_round * 2 < round) { - segno = NULL_SEGNO; - goto gc_more; + /* FG_GC stops GC by skip_count */ + if (gc_type == FG_GC) { + if (sbi->skipped_gc_rwsem) + skipped_round++; + round++; + if (skipped_round > MAX_SKIP_GC_COUNT && + skipped_round * 2 >= round) { + ret = f2fs_write_checkpoint(sbi, &cpc); + goto stop; } + } - if (first_skipped < last_skipped && - (last_skipped - first_skipped) > - sbi->skipped_gc_rwsem) { - f2fs_drop_inmem_pages_all(sbi, true); - segno = NULL_SEGNO; - goto gc_more; - } - if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) - ret = f2fs_write_checkpoint(sbi, &cpc); + /* Write checkpoint to reclaim prefree segments */ + if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + prefree_segments(sbi)) { + ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; } +go_gc_more: + segno = NULL_SEGNO; + goto gc_more; + stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; - SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno; + + if (gc_type == FG_GC) + f2fs_unpin_all_sections(sbi, true); trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, get_pages(sbi, F2FS_DIRTY_NODES), @@ -1370,15 +1885,47 @@ stop: reserved_segments(sbi), prefree_segments(sbi)); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); put_gc_inode(&gc_list); - if (sync && !ret) + if (gc_control->err_gc_skipped && !ret) ret = sec_freed ? 
0 : -EAGAIN; return ret; } +int __init f2fs_create_garbage_collection_cache(void) +{ + victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", + sizeof(struct victim_entry)); + if (!victim_entry_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_garbage_collection_cache(void) +{ + kmem_cache_destroy(victim_entry_slab); +} + +static void init_atgc_management(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + + if (test_opt(sbi, ATGC) && + SIT_I(sbi)->elapsed_time >= DEF_GC_THREAD_AGE_THRESHOLD) + am->atgc_enabled = true; + + am->root = RB_ROOT_CACHED; + INIT_LIST_HEAD(&am->victim_list); + am->victim_count = 0; + + am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO; + am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT; + am->age_weight = DEF_GC_THREAD_AGE_WEIGHT; + am->age_threshold = DEF_GC_THREAD_AGE_THRESHOLD; +} + void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) { DIRTY_I(sbi)->v_ops = &default_v_ops; @@ -1389,18 +1936,37 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi)) SIT_I(sbi)->last_victim[ALLOC_NEXT] = GET_SEGNO(sbi, FDEV(0).end_blk) + 1; + + init_atgc_management(sbi); } -static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start, - unsigned int end) +static int free_segment_range(struct f2fs_sb_info *sbi, + unsigned int secs, bool gc_only) { - int type; - unsigned int segno, next_inuse; + unsigned int segno, next_inuse, start, end; + struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; + int gc_mode, gc_type; int err = 0; + int type; + + /* Force block allocation for GC */ + MAIN_SECS(sbi) -= secs; + start = MAIN_SECS(sbi) * sbi->segs_per_sec; + end = MAIN_SEGS(sbi) - 1; + + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++) + if (SIT_I(sbi)->last_victim[gc_mode] >= start) + SIT_I(sbi)->last_victim[gc_mode] = 0; + + for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++) + if 
(sbi->next_victim_seg[gc_type] >= start) + sbi->next_victim_seg[gc_type] = NULL_SEGNO; + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); /* Move out cursegs from the target range */ - for (type = CURSEG_HOT_DATA; type < NR_CURSEG_TYPE; type++) - allocate_segment_for_resize(sbi, type, start, end); + for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) + f2fs_allocate_segment_for_resize(sbi, type, start, end); /* do GC to move out valid blocks in the range */ for (segno = start; segno <= end; segno += sbi->segs_per_sec) { @@ -1409,18 +1975,24 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - down_write(&sbi->gc_lock); - do_garbage_collect(sbi, segno, &gc_list, FG_GC); - up_write(&sbi->gc_lock); + do_garbage_collect(sbi, segno, &gc_list, FG_GC, true); put_gc_inode(&gc_list); - if (get_valid_blocks(sbi, segno, true)) - return -EAGAIN; + if (!gc_only && get_valid_blocks(sbi, segno, true)) { + err = -EAGAIN; + goto out; + } + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out; + } } + if (gc_only) + goto out; - err = f2fs_sync_fs(sbi->sb, 1); + err = f2fs_write_checkpoint(sbi, &cpc); if (err) - return err; + goto out; next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start); if (next_inuse <= end) { @@ -1428,18 +2000,27 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start, next_inuse); f2fs_bug_on(sbi, 1); } +out: + MAIN_SECS(sbi) += secs; return err; } static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) { struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi); - int section_count = le32_to_cpu(raw_sb->section_count); - int segment_count = le32_to_cpu(raw_sb->segment_count); - int segment_count_main = le32_to_cpu(raw_sb->segment_count_main); - long long block_count = le64_to_cpu(raw_sb->block_count); + int section_count; + int segment_count; + int segment_count_main; + long long block_count; int segs = secs * 
sbi->segs_per_sec; + f2fs_down_write(&sbi->sb_lock); + + section_count = le32_to_cpu(raw_sb->section_count); + segment_count = le32_to_cpu(raw_sb->segment_count); + segment_count_main = le32_to_cpu(raw_sb->segment_count_main); + block_count = le64_to_cpu(raw_sb->block_count); + raw_sb->section_count = cpu_to_le32(section_count + secs); raw_sb->segment_count = cpu_to_le32(segment_count + segs); raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs); @@ -1453,6 +2034,8 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) raw_sb->devs[last_dev].total_segments = cpu_to_le32(dev_segs + segs); } + + f2fs_up_write(&sbi->sb_lock); } static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) @@ -1464,6 +2047,7 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs; MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs; + MAIN_SECS(sbi) += secs; FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs; FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs; F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks); @@ -1485,8 +2069,8 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) { __u64 old_block_count, shrunk_blocks; + struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; unsigned int secs; - int gc_mode, gc_type; int err = 0; __u32 rem; @@ -1521,75 +2105,86 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) return -EINVAL; } - freeze_bdev(sbi->sb->s_bdev); - shrunk_blocks = old_block_count - block_count; secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); + + /* stop other GC */ + if (!f2fs_down_write_trylock(&sbi->gc_lock)) + return -EAGAIN; + + /* stop CP to protect MAIN_SEC in free_segment_range */ + f2fs_lock_op(sbi); + spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + sbi->current_reserved_blocks + 
sbi->unusable_block_count + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) err = -ENOSPC; - else - sbi->user_block_count -= shrunk_blocks; spin_unlock(&sbi->stat_lock); - if (err) { - thaw_bdev(sbi->sb->s_bdev, sbi->sb); - return err; - } - mutex_lock(&sbi->resize_mutex); - set_sbi_flag(sbi, SBI_IS_RESIZEFS); + if (err) + goto out_unlock; - mutex_lock(&DIRTY_I(sbi)->seglist_lock); + err = free_segment_range(sbi, secs, true); - MAIN_SECS(sbi) -= secs; +out_unlock: + f2fs_unlock_op(sbi); + f2fs_up_write(&sbi->gc_lock); + if (err) + return err; - for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++) - if (SIT_I(sbi)->last_victim[gc_mode] >= - MAIN_SECS(sbi) * sbi->segs_per_sec) - SIT_I(sbi)->last_victim[gc_mode] = 0; + set_sbi_flag(sbi, SBI_IS_RESIZEFS); - for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++) - if (sbi->next_victim_seg[gc_type] >= - MAIN_SECS(sbi) * sbi->segs_per_sec) - sbi->next_victim_seg[gc_type] = NULL_SEGNO; + freeze_super(sbi->sb); + f2fs_down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->cp_global_sem); - mutex_unlock(&DIRTY_I(sbi)->seglist_lock); + spin_lock(&sbi->stat_lock); + if (shrunk_blocks + valid_user_blocks(sbi) + + sbi->current_reserved_blocks + sbi->unusable_block_count + + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) + err = -ENOSPC; + else + sbi->user_block_count -= shrunk_blocks; + spin_unlock(&sbi->stat_lock); + if (err) + goto out_err; - err = free_segment_range(sbi, MAIN_SECS(sbi) * sbi->segs_per_sec, - MAIN_SEGS(sbi) - 1); + err = free_segment_range(sbi, secs, false); if (err) - goto out; + goto recover_out; update_sb_metadata(sbi, -secs); err = f2fs_commit_super(sbi, false); if (err) { update_sb_metadata(sbi, secs); - goto out; + goto recover_out; } update_fs_metadata(sbi, -secs); clear_sbi_flag(sbi, SBI_IS_RESIZEFS); - err = f2fs_sync_fs(sbi->sb, 1); + set_sbi_flag(sbi, SBI_IS_DIRTY); + + err = f2fs_write_checkpoint(sbi, &cpc); if (err) { update_fs_metadata(sbi, secs); update_sb_metadata(sbi, 
secs); f2fs_commit_super(sbi, false); } -out: +recover_out: if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err(sbi, "resize_fs failed, should run fsck to repair!"); - MAIN_SECS(sbi) += secs; spin_lock(&sbi->stat_lock); sbi->user_block_count += shrunk_blocks; spin_unlock(&sbi->stat_lock); } +out_err: + f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->gc_lock); + thaw_super(sbi->sb); clear_sbi_flag(sbi, SBI_IS_RESIZEFS); - mutex_unlock(&sbi->resize_mutex); - thaw_bdev(sbi->sb->s_bdev, sbi->sb); return err; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index bbac9d3787bd..19b956c2d697 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/gc.h * @@ -14,6 +14,14 @@ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ + +/* choose candidates from sections which has age of more than 7 days */ +#define DEF_GC_THREAD_AGE_THRESHOLD (60 * 60 * 24 * 7) +#define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */ +#define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */ +#define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */ +#define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */ + #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ @@ -34,6 +42,12 @@ struct f2fs_gc_kthread { /* for changing gc mode */ unsigned int gc_wake; + + /* for GC_MERGE mount option */ + wait_queue_head_t fggc_wq; /* + * caller of f2fs_balance_fs() + * will wait on this wait queue. + */ }; struct gc_inode_list { @@ -41,27 +55,78 @@ struct gc_inode_list { struct radix_tree_root iroot; }; +struct victim_info { + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* section No. 
*/ +}; + +struct victim_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* segment No. */ + }; + struct victim_info vi; /* victim info */ + }; + struct list_head list; +}; + /* * inline functions */ + +/* + * On a Zoned device zone-capacity can be less than zone-size and if + * zone-capacity is not aligned to f2fs segment size(2MB), then the segment + * starting just before zone-capacity has some blocks spanning across the + * zone-capacity, these blocks are not usable. + * Such spanning segments can be in free list so calculate the sum of usable + * blocks in currently free segments including normal and spanning segments. + */ +static inline block_t free_segs_blk_count_zoned(struct f2fs_sb_info *sbi) +{ + block_t free_seg_blks = 0; + struct free_segmap_info *free_i = FREE_I(sbi); + int j; + + spin_lock(&free_i->segmap_lock); + for (j = 0; j < MAIN_SEGS(sbi); j++) + if (!test_bit(j, free_i->free_segmap)) + free_seg_blks += f2fs_usable_blks_in_seg(sbi, j); + spin_unlock(&free_i->segmap_lock); + + return free_seg_blks; +} + +static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return free_segs_blk_count_zoned(sbi); + + return free_segments(sbi) << sbi->log_blocks_per_seg; +} + static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) { - if (free_segments(sbi) < overprovision_segments(sbi)) + block_t free_blks, ovp_blks; + + free_blks = free_segs_blk_count(sbi); + ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg; + + if (free_blks < ovp_blks) return 0; - else - return (free_segments(sbi) - overprovision_segments(sbi)) - << sbi->log_blocks_per_seg; + + return free_blks - ovp_blks; } -static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) +static inline block_t limit_invalid_user_blocks(block_t user_block_count) { - return (long)(sbi->user_block_count * 
LIMIT_INVALID_BLOCK) / 100; + return (long)(user_block_count * LIMIT_INVALID_BLOCK) / 100; } -static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) +static inline block_t limit_free_user_blocks(block_t reclaimable_user_blocks) { - block_t reclaimable_user_blocks = sbi->user_block_count - - written_block_count(sbi); return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; } @@ -96,15 +161,16 @@ static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) { - block_t invalid_user_blocks = sbi->user_block_count - - written_block_count(sbi); + block_t user_block_count = sbi->user_block_count; + block_t invalid_user_blocks = user_block_count - + written_block_count(sbi); /* * Background GC is triggered with the following conditions. * 1. There are a number of invalid blocks. * 2. There is not enough free space. */ - if (invalid_user_blocks > limit_invalid_user_blocks(sbi) && - free_user_blocks(sbi) < limit_free_user_blocks(sbi)) - return true; - return false; + return (invalid_user_blocks > + limit_invalid_user_blocks(user_block_count) && + free_user_blocks(sbi) < + limit_free_user_blocks(invalid_user_blocks)); } diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 5bc4dcd8fc03..049ce50cec9b 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -12,7 +12,6 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> -#include <linux/cryptohash.h> #include <linux/pagemap.h> #include <linux/unicode.h> @@ -68,22 +67,9 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -static f2fs_hash_t __f2fs_dentry_hash(const struct qstr *name_info, - struct fscrypt_name *fname) +static u32 TEA_hash_name(const u8 *p, size_t len) { - __u32 hash; - f2fs_hash_t f2fs_hash; - const unsigned char *p; __u32 in[8], buf[4]; - const unsigned char *name = name_info->name; - size_t len = name_info->len; - - /* encrypted bigname case */ - if (fname && 
!fname->disk_name.name) - return cpu_to_le32(fname->hash); - - if (is_dot_dotdot(name_info)) - return 0; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; @@ -91,7 +77,6 @@ static f2fs_hash_t __f2fs_dentry_hash(const struct qstr *name_info, buf[2] = 0x98badcfe; buf[3] = 0x10325476; - p = name; while (1) { str2hashbuf(p, len, in, 4); TEA_transform(buf, in); @@ -100,41 +85,53 @@ static f2fs_hash_t __f2fs_dentry_hash(const struct qstr *name_info, break; len -= 16; } - hash = buf[0]; - f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); - return f2fs_hash; + return buf[0] & ~F2FS_HASH_COL_BIT; } -f2fs_hash_t f2fs_dentry_hash(const struct inode *dir, - const struct qstr *name_info, struct fscrypt_name *fname) +/* + * Compute @fname->hash. For all directories, @fname->disk_name must be set. + * For casefolded directories, @fname->usr_fname must be set, and also + * @fname->cf_name if the filename is valid Unicode and is not "." or "..". + */ +void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) { -#ifdef CONFIG_UNICODE - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - const struct unicode_map *um = sbi->s_encoding; - int r, dlen; - unsigned char *buff; - struct qstr folded; - - if (!name_info->len || !IS_CASEFOLDED(dir)) - goto opaque_seq; - - buff = f2fs_kzalloc(sbi, sizeof(char) * PATH_MAX, GFP_KERNEL); - if (!buff) - return -ENOMEM; - - dlen = utf8_casefold(um, name_info, buff, PATH_MAX); - if (dlen < 0) { - kvfree(buff); - goto opaque_seq; + const u8 *name = fname->disk_name.name; + size_t len = fname->disk_name.len; + + WARN_ON_ONCE(!name); + + if (is_dot_dotdot(name, len)) { + fname->hash = 0; + return; } - folded.name = buff; - folded.len = dlen; - r = __f2fs_dentry_hash(&folded, fname); - kvfree(buff); - return r; +#if IS_ENABLED(CONFIG_UNICODE) + if (IS_CASEFOLDED(dir)) { + /* + * If the casefolded name is provided, hash it instead of the + * on-disk name. 
If the casefolded name is *not* provided, that + * should only be because the name wasn't valid Unicode or was + * "." or "..", so fall back to treating the name as an opaque + * byte sequence. Note that to handle encrypted directories, + * the fallback must use usr_fname (plaintext) rather than + * disk_name (ciphertext). + */ + WARN_ON_ONCE(!fname->usr_fname->name); + if (fname->cf_name.name) { + name = fname->cf_name.name; + len = fname->cf_name.len; + } else { + name = fname->usr_fname->name; + len = fname->usr_fname->len; + } + if (IS_ENCRYPTED(dir)) { + struct qstr tmp = QSTR_INIT(name, len); -opaque_seq: + fname->hash = + cpu_to_le32(fscrypt_fname_siphash(dir, &tmp)); + return; + } + } #endif - return __f2fs_dentry_hash(name_info, fname); + fname->hash = cpu_to_le32(TEA_hash_name(name, len)); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4167e5408151..21a495234ffd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -8,25 +8,46 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/fiemap.h> #include "f2fs.h" #include "node.h" +#include <trace/events/f2fs.h> -bool f2fs_may_inline_data(struct inode *inode) +static bool support_inline_data(struct inode *inode) { if (f2fs_is_atomic_file(inode)) return false; - if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; + return true; +} - if (f2fs_post_read_required(inode)) +bool f2fs_may_inline_data(struct inode *inode) +{ + if (!support_inline_data(inode)) return false; - return true; + return !f2fs_post_read_required(inode); +} + +bool f2fs_sanity_check_inline_data(struct inode *inode) +{ + if (!f2fs_has_inline_data(inode)) + return false; + + if (!support_inline_data(inode)) + return true; + + /* + * used by sanity_check_inode(), when disk layout fields has not + * been synchronized to inmem fields. 
+ */ + return (S_ISREG(inode->i_mode) && + (file_is_encrypt(inode) || file_is_verity(inode) || + (F2FS_I(inode)->i_flags & F2FS_COMPR_FL))); } bool f2fs_may_inline_dentry(struct inode *inode) @@ -43,7 +64,6 @@ bool f2fs_may_inline_dentry(struct inode *inode) void f2fs_do_read_inline_data(struct page *page, struct page *ipage) { struct inode *inode = page->mapping->host; - void *src_addr, *dst_addr; if (PageUptodate(page)) return; @@ -53,11 +73,8 @@ void f2fs_do_read_inline_data(struct page *page, struct page *ipage) zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(inode, ipage); - dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); - flush_dcache_page(page); - kunmap_atomic(dst_addr); + memcpy_to_page(page, 0, inline_data_addr(inode, ipage), + MAX_INLINE_DATA(inode)); if (!PageUptodate(page)) SetPageUptodate(page); } @@ -129,7 +146,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; - err = f2fs_get_node_info(fio.sbi, dn->nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni, false); if (err) { f2fs_truncate_data_blocks_range(dn, 1); f2fs_put_dnode(dn); @@ -143,6 +160,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_sbi_flag(fio.sbi, SBI_NEED_FSCK); f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dn->inode->i_ino, dn->data_blkaddr); + f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -171,7 +189,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* clear inline data and flag after data writeback */ f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0); - clear_inline_node(dn->inode_page); + clear_page_private_inline(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); clear_inode_flag(dn->inode, FI_INLINE_DATA); @@ -186,9 +204,14 @@ int 
f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; - if (!f2fs_has_inline_data(inode)) + if (!f2fs_has_inline_data(inode) || + f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; + err = f2fs_dquot_initialize(inode); + if (err) + return err; + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; @@ -212,14 +235,14 @@ out: f2fs_put_page(page, 1); - f2fs_balance_fs(sbi, dn.node_changed); + if (!err) + f2fs_balance_fs(sbi, dn.node_changed); return err; } int f2fs_write_inline_data(struct inode *inode, struct page *page) { - void *src_addr, *dst_addr; struct dnode_of_data dn; int err; @@ -236,10 +259,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true); - src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(inode, dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); - kunmap_atomic(src_addr); + memcpy_from_page(inline_data_addr(inode, dn.inode_page), + page, 0, MAX_INLINE_DATA(inode)); set_page_dirty(dn.inode_page); f2fs_clear_page_cache_dirty_tag(page); @@ -247,12 +268,12 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); - clear_inline_node(dn.inode_page); + clear_page_private_inline(dn.inode_page); f2fs_put_dnode(&dn); return 0; } -bool f2fs_recover_inline_data(struct inode *inode, struct page *npage) +int f2fs_recover_inline_data(struct inode *inode, struct page *npage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode *ri = NULL; @@ -264,7 +285,7 @@ bool f2fs_recover_inline_data(struct inode *inode, struct page *npage) * [prev.] 
[next] of inline_data flag * o o -> recover inline_data * o x -> remove inline_data, and then recover data blocks - * x o -> remove inline_data, and then recover inline_data + * x o -> remove data blocks, and then recover inline_data * x x -> recover data blocks */ if (IS_INODE(npage)) @@ -274,7 +295,8 @@ bool f2fs_recover_inline_data(struct inode *inode, struct page *npage) ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: ipage = f2fs_get_node_page(sbi, inode->i_ino); - f2fs_bug_on(sbi, IS_ERR(ipage)); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); f2fs_wait_on_page_writeback(ipage, NODE, true, true); @@ -287,33 +309,38 @@ process_inline: set_page_dirty(ipage); f2fs_put_page(ipage, 1); - return true; + return 1; } if (f2fs_has_inline_data(inode)) { ipage = f2fs_get_node_page(sbi, inode->i_ino); - f2fs_bug_on(sbi, IS_ERR(ipage)); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); f2fs_truncate_inline_inode(inode, ipage, 0); + stat_dec_inline_inode(inode); clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (f2fs_truncate_blocks(inode, 0, false)) - return false; + int ret; + + ret = f2fs_truncate_blocks(inode, 0, false); + if (ret) + return ret; + stat_inc_inline_inode(inode); goto process_inline; } - return false; + return 0; } struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, - struct fscrypt_name *fname, struct page **res_page) + const struct f2fs_filename *fname, + struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; void *inline_dentry; - f2fs_hash_t namehash; ipage = f2fs_get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { @@ -321,13 +348,15 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(dir, &name, fname); - inline_dentry = inline_data_addr(dir, ipage); 
make_dentry_ptr_inline(dir, &d, inline_dentry); - de = f2fs_find_target_dentry(fname, namehash, NULL, &d); + de = f2fs_find_target_dentry(&d, fname, NULL); unlock_page(ipage); + if (IS_ERR(de)) { + *res_page = ERR_CAST(de); + de = NULL; + } if (de) *res_page = ipage; else @@ -384,6 +413,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dir->i_ino, dn.data_blkaddr); + f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR); err = -EFSCORRUPTED; goto out; } @@ -443,7 +473,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) while (bit_pos < d.max) { struct f2fs_dir_entry *de; - struct qstr new_name; + struct f2fs_filename fname; nid_t ino; umode_t fake_mode; @@ -459,14 +489,19 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) continue; } - new_name.name = d.filename[bit_pos]; - new_name.len = le16_to_cpu(de->name_len); + /* + * We only need the disk_name and hash to move the dentry. + * We don't need the original or casefolded filenames. 
+ */ + memset(&fname, 0, sizeof(fname)); + fname.disk_name.name = d.filename[bit_pos]; + fname.disk_name.len = le16_to_cpu(de->name_len); + fname.hash = de->hash_code; ino = le32_to_cpu(de->ino); fake_mode = f2fs_get_de_type(de) << S_SHIFT; - err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, - ino, fake_mode); + err = f2fs_add_regular_entry(dir, &fname, NULL, ino, fake_mode); if (err) goto punch_dentry_pages; @@ -515,7 +550,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, !f2fs_has_inline_xattr(dir)) F2FS_I(dir)->i_inline_xattr_size = 0; - kvfree(backup_dentry); + kfree(backup_dentry); return 0; recover: lock_page(ipage); @@ -526,7 +561,7 @@ recover: set_page_dirty(ipage); f2fs_put_page(ipage, 1); - kvfree(backup_dentry); + kfree(backup_dentry); return err; } @@ -543,7 +578,7 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; - struct fscrypt_name fname; + struct f2fs_filename fname; void *inline_dentry = NULL; int err = 0; @@ -552,19 +587,19 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) f2fs_lock_op(sbi); - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + err = f2fs_setup_filename(dir, &dentry->d_name, 0, &fname); if (err) goto out; ipage = f2fs_get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); - goto out; + goto out_fname; } if (f2fs_has_enough_room(dir, ipage, &fname)) { f2fs_put_page(ipage, 1); - goto out; + goto out_fname; } inline_dentry = inline_data_addr(dir, ipage); @@ -572,22 +607,22 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) err = do_convert_inline_dir(dir, ipage, inline_dentry); if (!err) f2fs_put_page(ipage, 1); +out_fname: + f2fs_free_filename(&fname); out: f2fs_unlock_op(sbi); return err; } -int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, - const struct qstr *orig_name, - struct inode *inode, 
nid_t ino, umode_t mode) +int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos; - f2fs_hash_t name_hash; void *inline_dentry = NULL; struct f2fs_dentry_ptr d; - int slots = GET_DENTRY_SLOTS(new_name->len); + int slots = GET_DENTRY_SLOTS(fname->disk_name.len); struct page *page = NULL; int err = 0; @@ -608,9 +643,8 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, } if (inode) { - down_write(&F2FS_I(inode)->i_sem); - page = f2fs_init_inode_metadata(inode, dir, new_name, - orig_name, ipage); + f2fs_down_write(&F2FS_I(inode)->i_sem); + page = f2fs_init_inode_metadata(inode, dir, fname, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -619,8 +653,8 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true, true); - name_hash = f2fs_dentry_hash(dir, new_name, NULL); - f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash, + bit_pos); set_page_dirty(ipage); @@ -638,7 +672,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_update_parent_metadata(dir, inode, 0); fail: if (inode) - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); out: f2fs_put_page(ipage, 1); return err; @@ -766,7 +800,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; ilen -= start; - err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni, false); if (err) goto out; @@ -774,6 +808,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, byteaddr += (char *)inline_data_addr(inode, ipage) - (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); + trace_f2fs_fiemap(inode, start, 
byteaddr, ilen, flags, err); out: f2fs_put_page(ipage, 1); return err; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 78c3f1d70f1d..9f0d3864d9f1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -8,8 +8,8 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> -#include <linux/backing-dev.h> #include <linux/writeback.h> +#include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" @@ -18,6 +18,10 @@ #include <trace/events/f2fs.h> +#ifdef CONFIG_F2FS_FS_COMPRESSION +extern const struct address_space_operations f2fs_compress_aops; +#endif + void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { if (is_inode_flag_set(inode, FI_NEW_INODE)) @@ -77,8 +81,10 @@ static int __written_first_block(struct f2fs_sb_info *sbi, if (!__is_valid_data_blkaddr(addr)) return 1; - if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) + if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } return 0; } @@ -256,8 +262,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (F2FS_I(inode)->extent_tree) { - struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + if (fi->extent_tree) { + struct extent_info *ei = &fi->extent_tree->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -272,8 +278,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } } - if (f2fs_has_inline_data(inode) && - (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) { + if (f2fs_sanity_check_inline_data(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); @@ -287,22 +292,59 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) { + set_sbi_flag(sbi, 
SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off", + __func__, inode->i_ino); + return false; + } + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && fi->i_flags & F2FS_COMPR_FL && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { - if (ri->i_compress_algorithm >= COMPRESS_MAX) + if (ri->i_compress_algorithm >= COMPRESS_MAX) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " + "compress algorithm: %u, run fsck to fix", + __func__, inode->i_ino, + ri->i_compress_algorithm); return false; - if (le64_to_cpu(ri->i_compr_blocks) > inode->i_blocks) + } + if (le64_to_cpu(ri->i_compr_blocks) > + SECTOR_TO_BLOCK(inode->i_blocks)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent " + "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", + __func__, inode->i_ino, + le64_to_cpu(ri->i_compr_blocks), + SECTOR_TO_BLOCK(inode->i_blocks)); return false; + } if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || - ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) + ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " + "log cluster size: %u, run fsck to fix", + __func__, inode->i_ino, + ri->i_log_cluster_size); return false; + } } return true; } +static void init_idisk_time(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -345,13 +387,12 @@ static int do_read_inode(struct inode *inode) fi->i_flags = le32_to_cpu(ri->i_flags); if (S_ISREG(inode->i_mode)) fi->i_flags &= ~F2FS_PROJINHERIT_FL; - fi->flags = 0; + bitmap_zero(fi->flags, FI_MAX); fi->i_advise = 
ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - if (f2fs_init_extent_tree(inode, &ri->i_ext)) - set_page_dirty(node_page); + f2fs_init_extent_tree(inode, node_page); get_inline_info(inode, ri); @@ -376,6 +417,7 @@ static int do_read_inode(struct inode *inode) if (!sanity_check_inode(inode, node_page)) { f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } @@ -385,6 +427,7 @@ static int do_read_inode(struct inode *inode) /* try to recover cold bit for non-dir inode */ if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) { + f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_cold_node(node_page, false); set_page_dirty(node_page); } @@ -425,29 +468,34 @@ static int do_read_inode(struct inode *inode) (fi->i_flags & F2FS_COMPR_FL)) { if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { - fi->i_compr_blocks = le64_to_cpu(ri->i_compr_blocks); + atomic_set(&fi->i_compr_blocks, + le64_to_cpu(ri->i_compr_blocks)); fi->i_compress_algorithm = ri->i_compress_algorithm; fi->i_log_cluster_size = ri->i_log_cluster_size; + fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag); fi->i_cluster_size = 1 << fi->i_log_cluster_size; set_inode_flag(inode, FI_COMPRESSED_FILE); } } - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + init_idisk_time(inode); f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); stat_inc_compr_inode(inode); - stat_add_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); + stat_add_compr_blocks(inode, atomic_read(&fi->i_compr_blocks)); return 0; } +static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino) +{ + return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) || + ino == F2FS_COMPRESS_INO(sbi); +} + 
struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -459,10 +507,21 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { + if (is_meta_ino(sbi, ino)) { + f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + ret = -EFSCORRUPTED; + trace_f2fs_iget_exit(inode, ret); + iput(inode); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return ERR_PTR(ret); + } + trace_f2fs_iget(inode); return inode; } - if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) + + if (is_meta_ino(sbi, ino)) goto make_now; ret = do_read_inode(inode); @@ -475,6 +534,17 @@ make_now: } else if (ino == F2FS_META_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_meta_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + } else if (ino == F2FS_COMPRESS_INO(sbi)) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + inode->i_mapping->a_ops = &f2fs_compress_aops; + /* + * generic_error_remove_page only truncates pages of regular + * inode + */ + inode->i_mode |= S_IFREG; +#endif + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); } else if (S_ISREG(inode->i_mode)) { inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; @@ -483,7 +553,7 @@ make_now: inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (S_ISLNK(inode->i_mode)) { if (file_is_encrypt(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; @@ -500,6 +570,15 @@ make_now: goto bad_inode; } f2fs_set_inode_flags(inode); + + if (file_should_truncate(inode) && + !is_sbi_flag_set(sbi, SBI_POR_DOING)) { + ret = f2fs_truncate(inode); + if (ret) + goto bad_inode; + file_dont_truncate(inode); + } + unlock_new_inode(inode); trace_f2fs_iget(inode); return 
inode; @@ -518,7 +597,7 @@ retry: inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_NOFS); goto retry; } } @@ -602,9 +681,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_log_cluster_size)) { ri->i_compr_blocks = - cpu_to_le64(F2FS_I(inode)->i_compr_blocks); + cpu_to_le64(atomic_read( + &F2FS_I(inode)->i_compr_blocks)); ri->i_compress_algorithm = F2FS_I(inode)->i_compress_algorithm; + ri->i_compress_flag = + cpu_to_le16(F2FS_I(inode)->i_compress_flag); ri->i_log_cluster_size = F2FS_I(inode)->i_log_cluster_size; } @@ -614,13 +696,9 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) /* deleted inode */ if (inode->i_nlink == 0) - clear_inline_node(node_page); - - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + clear_page_private_inline(node_page); + init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); #endif @@ -634,11 +712,13 @@ retry: node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); + if (err == -ENOMEM) { cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_UPDATE_INODE); } return; } @@ -666,7 +746,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) /* * We need to balance fs here to prevent from producing dirty node pages - * during the urgent cleaning time when runing out of free sections. + * during the urgent cleaning time when running out of free sections. 
*/ f2fs_update_inode_page(inode); if (wbc && wbc->nr_to_write) @@ -683,15 +763,18 @@ void f2fs_evict_inode(struct inode *inode) nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; - /* some remained atomic pages should discarded */ - if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); + if ((inode->i_nlink || is_bad_inode(inode)) && + test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); + if (inode->i_ino == F2FS_NODE_INO(sbi) || - inode->i_ino == F2FS_META_INO(sbi)) + inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_COMPRESS_INO(sbi)) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); @@ -702,7 +785,7 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) { err = 0; set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); @@ -712,7 +795,8 @@ void f2fs_evict_inode(struct inode *inode) f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); - sb_start_intwrite(inode->i_sb); + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); retry: @@ -728,8 +812,22 @@ retry: f2fs_lock_op(sbi); err = f2fs_remove_inode_page(inode); f2fs_unlock_op(sbi); - if (err == -ENOENT) + if (err == -ENOENT) { err = 0; + + /* + * in fuzzed image, another node may has the same + * block address as inode's, if it was truncated + * previously, truncation of inode node will fail. 
+ */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(F2FS_I_SB(inode), + "f2fs_evict_inode: inconsistent node id, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } } /* give more chances, if ENOMEM case */ @@ -743,7 +841,8 @@ retry: if (dquot_initialize_needed(inode)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); } - sb_end_intwrite(inode->i_sb); + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + sb_end_intwrite(inode->i_sb); no_delete: dquot_drop(inode); @@ -751,7 +850,8 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); stat_dec_compr_inode(inode); - stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); + stat_sub_compr_blocks(inode, + atomic_read(&F2FS_I(inode)->i_compr_blocks)); if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) @@ -759,7 +859,7 @@ no_delete: else f2fs_inode_synced(inode); - /* ino == 0, if f2fs_new_inode() was failed t*/ + /* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */ if (inode->i_ino) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); @@ -815,9 +915,10 @@ void f2fs_handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. 
*/ - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); + set_inode_flag(inode, FI_FREE_NID); f2fs_warn(sbi, "May loss orphan inode, run fsck to fix."); goto out; } diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c new file mode 100644 index 000000000000..3166a8939ed4 --- /dev/null +++ b/fs/f2fs/iostat.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs iostat support + * + * Copyright 2021 Google LLC + * Author: Daeho Jeong <daehojeong@google.com> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/seq_file.h> + +#include "f2fs.h" +#include "iostat.h" +#include <trace/events/f2fs.h> + +#define NUM_PREALLOC_IOSTAT_CTXS 128 +static struct kmem_cache *bio_iostat_ctx_cache; +static mempool_t *bio_iostat_ctx_pool; + +int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app write IOs */ + seq_puts(seq, "[WRITE]\n"); + seq_printf(seq, "app buffered data: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct data: %-16llu\n", + sbi->rw_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped data: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_IO]); + + /* print fs write IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->rw_iostat[FS_DATA_IO]); + seq_printf(seq, "fs cdata: %-16llu\n", + sbi->rw_iostat[FS_CDATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->rw_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->rw_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: 
%-16llu\n", + sbi->rw_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->rw_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->rw_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->rw_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->rw_iostat[FS_CP_META_IO]); + + /* print app read IOs */ + seq_puts(seq, "[READ]\n"); + seq_printf(seq, "app buffered data: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_READ_IO]); + seq_printf(seq, "app direct data: %-16llu\n", + sbi->rw_iostat[APP_DIRECT_READ_IO]); + seq_printf(seq, "app mapped data: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_READ_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]); + + /* print fs read IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->rw_iostat[FS_DATA_READ_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->rw_iostat[FS_GDATA_READ_IO]); + seq_printf(seq, "fs cdata: %-16llu\n", + sbi->rw_iostat[FS_CDATA_READ_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->rw_iostat[FS_NODE_READ_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->rw_iostat[FS_META_READ_IO]); + + /* print other IOs */ + seq_puts(seq, "[OTHER]\n"); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->rw_iostat[FS_DISCARD]); + + return 0; +} + +static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) +{ + int io, idx = 0; + unsigned int cnt; + struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + unsigned long flags; + + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + for (idx = 0; idx < MAX_IO_TYPE; idx++) { + for (io = 0; io < NR_PAGE_TYPE; io++) { + cnt = io_lat->bio_cnt[idx][io]; + iostat_lat[idx][io].peak_lat = + jiffies_to_msecs(io_lat->peak_lat[idx][io]); + iostat_lat[idx][io].cnt = cnt; + 
iostat_lat[idx][io].avg_lat = cnt ? + jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0; + io_lat->sum_lat[idx][io] = 0; + io_lat->peak_lat[idx][io] = 0; + io_lat->bio_cnt[idx][io] = 0; + } + } + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); + + trace_f2fs_iostat_latency(sbi, iostat_lat); +} + +static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) +{ + unsigned long long iostat_diff[NR_IO_TYPE]; + int i; + unsigned long flags; + + if (time_is_after_jiffies(sbi->iostat_next_period)) + return; + + /* Need double check under the lock */ + spin_lock_irqsave(&sbi->iostat_lock, flags); + if (time_is_after_jiffies(sbi->iostat_next_period)) { + spin_unlock_irqrestore(&sbi->iostat_lock, flags); + return; + } + sbi->iostat_next_period = jiffies + + msecs_to_jiffies(sbi->iostat_period_ms); + + for (i = 0; i < NR_IO_TYPE; i++) { + iostat_diff[i] = sbi->rw_iostat[i] - + sbi->prev_rw_iostat[i]; + sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; + } + spin_unlock_irqrestore(&sbi->iostat_lock, flags); + + trace_f2fs_iostat(sbi, iostat_diff); + + __record_iostat_latency(sbi); +} + +void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + int i; + + spin_lock_irq(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) { + sbi->rw_iostat[i] = 0; + sbi->prev_rw_iostat[i] = 0; + } + spin_unlock_irq(&sbi->iostat_lock); + + spin_lock_irq(&sbi->iostat_lat_lock); + memset(io_lat, 0, sizeof(struct iostat_lat_info)); + spin_unlock_irq(&sbi->iostat_lat_lock); +} + +void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, + enum iostat_type type, unsigned long long io_bytes) +{ + unsigned long flags; + + if (!sbi->iostat_enable) + return; + + spin_lock_irqsave(&sbi->iostat_lock, flags); + sbi->rw_iostat[type] += io_bytes; + + if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) + sbi->rw_iostat[APP_WRITE_IO] += io_bytes; + + if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) + 
sbi->rw_iostat[APP_READ_IO] += io_bytes; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (inode && f2fs_compressed_file(inode)) { + if (type == APP_BUFFERED_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes; + + if (type == APP_BUFFERED_READ_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_READ_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes; + + if (type == FS_DATA_READ_IO) + sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes; + + if (type == FS_DATA_IO) + sbi->rw_iostat[FS_CDATA_IO] += io_bytes; + } +#endif + + spin_unlock_irqrestore(&sbi->iostat_lock, flags); + + f2fs_record_iostat(sbi); +} + +static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, + int rw, bool is_sync) +{ + unsigned long ts_diff; + unsigned int iotype = iostat_ctx->type; + struct f2fs_sb_info *sbi = iostat_ctx->sbi; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + int idx; + unsigned long flags; + + if (!sbi->iostat_enable) + return; + + ts_diff = jiffies - iostat_ctx->submit_ts; + if (iotype >= META_FLUSH) + iotype = META; + + if (rw == 0) { + idx = READ_IO; + } else { + if (is_sync) + idx = WRITE_SYNC_IO; + else + idx = WRITE_ASYNC_IO; + } + + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + io_lat->sum_lat[idx][iotype] += ts_diff; + io_lat->bio_cnt[idx][iotype]++; + if (ts_diff > io_lat->peak_lat[idx][iotype]) + io_lat->peak_lat[idx][iotype] = ts_diff; + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); +} + +void iostat_update_and_unbind_ctx(struct bio *bio, int rw) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + bool is_sync = bio->bi_opf & REQ_SYNC; + + if (rw == 0) + bio->bi_private = iostat_ctx->post_read_ctx; + else + bio->bi_private = iostat_ctx->sbi; + __update_iostat_latency(iostat_ctx, rw, is_sync); + mempool_free(iostat_ctx, bio_iostat_ctx_pool); +} + +void iostat_alloc_and_bind_ctx(struct f2fs_sb_info 
*sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) +{ + struct bio_iostat_ctx *iostat_ctx; + /* Due to the mempool, this never fails. */ + iostat_ctx = mempool_alloc(bio_iostat_ctx_pool, GFP_NOFS); + iostat_ctx->sbi = sbi; + iostat_ctx->submit_ts = 0; + iostat_ctx->type = 0; + iostat_ctx->post_read_ctx = ctx; + bio->bi_private = iostat_ctx; +} + +int __init f2fs_init_iostat_processing(void) +{ + bio_iostat_ctx_cache = + kmem_cache_create("f2fs_bio_iostat_ctx", + sizeof(struct bio_iostat_ctx), 0, 0, NULL); + if (!bio_iostat_ctx_cache) + goto fail; + bio_iostat_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_IOSTAT_CTXS, + bio_iostat_ctx_cache); + if (!bio_iostat_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_iostat_ctx_cache); +fail: + return -ENOMEM; +} + +void f2fs_destroy_iostat_processing(void) +{ + mempool_destroy(bio_iostat_ctx_pool); + kmem_cache_destroy(bio_iostat_ctx_cache); +} + +int f2fs_init_iostat(struct f2fs_sb_info *sbi) +{ + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + spin_lock_init(&sbi->iostat_lat_lock); + sbi->iostat_enable = false; + sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; + sbi->iostat_io_lat = f2fs_kzalloc(sbi, sizeof(struct iostat_lat_info), + GFP_KERNEL); + if (!sbi->iostat_io_lat) + return -ENOMEM; + + return 0; +} + +void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) +{ + kfree(sbi->iostat_io_lat); +} diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h new file mode 100644 index 000000000000..2c048307b6e0 --- /dev/null +++ b/fs/f2fs/iostat.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2021 Google LLC + * Author: Daeho Jeong <daehojeong@google.com> + */ +#ifndef __F2FS_IOSTAT_H__ +#define __F2FS_IOSTAT_H__ + +struct bio_post_read_ctx; + +#ifdef CONFIG_F2FS_IOSTAT + +#define DEFAULT_IOSTAT_PERIOD_MS 3000 +#define MIN_IOSTAT_PERIOD_MS 100 +/* maximum period of iostat tracing is 1 day */ +#define MAX_IOSTAT_PERIOD_MS 8640000 + +enum { + 
READ_IO, + WRITE_SYNC_IO, + WRITE_ASYNC_IO, + MAX_IO_TYPE, +}; + +struct iostat_lat_info { + unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */ + unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */ + unsigned int bio_cnt[MAX_IO_TYPE][NR_PAGE_TYPE]; /* bio count */ +}; + +extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, + void *offset); +extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, + enum iostat_type type, unsigned long long io_bytes); + +struct bio_iostat_ctx { + struct f2fs_sb_info *sbi; + unsigned long submit_ts; + enum page_type type; + struct bio_post_read_ctx *post_read_ctx; +}; + +static inline void iostat_update_submit_ctx(struct bio *bio, + enum page_type type) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + iostat_ctx->submit_ts = jiffies; + iostat_ctx->type = type; +} + +static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + return iostat_ctx->post_read_ctx; +} + +extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw); +extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx); +extern int f2fs_init_iostat_processing(void); +extern void f2fs_destroy_iostat_processing(void); +extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); +#else +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, + enum iostat_type type, unsigned long long io_bytes) {} +static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} +static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) {} +static inline void iostat_update_submit_ctx(struct bio *bio, + enum page_type type) {} +static inline struct 
bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + return bio->bi_private; +} +static inline int f2fs_init_iostat_processing(void) { return 0; } +static inline void f2fs_destroy_iostat_processing(void) {} +static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) {} +#endif +#endif /* __F2FS_IOSTAT_H__ */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2aa035422c0f..a389772fd212 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -22,12 +22,14 @@ #include "acl.h" #include <trace/events/f2fs.h> -static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) +static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, + struct inode *dir, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); nid_t ino; struct inode *inode; bool nid_free = false; + bool encrypt = false; int xattr_size = 0; int err; @@ -35,23 +37,20 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (!inode) return ERR_PTR(-ENOMEM); - f2fs_lock_op(sbi); if (!f2fs_alloc_nid(sbi, &ino)) { - f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; } - f2fs_unlock_op(sbi); nid_free = true; - inode_init_owner(inode, dir, mode); + inode_init_owner(mnt_userns, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); F2FS_I(inode)->i_crtime = inode->i_mtime; - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if (S_ISDIR(inode->i_mode)) F2FS_I(inode)->i_current_depth = 1; @@ -66,18 +65,20 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; else - F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid = make_kprojid(mnt_userns, F2FS_DEF_PROJID); - err = dquot_initialize(inode); + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if 
(err) + goto fail_drop; + + err = f2fs_dquot_initialize(inode); if (err) goto fail_drop; set_inode_flag(inode, FI_NEW_INODE); - /* If the directory encrypted, then we should encrypt the inode. */ - if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && - f2fs_may_encrypt(inode)) + if (encrypt) f2fs_set_encrypted_inode(inode); if (f2fs_sb_has_extra_attr(sbi)) { @@ -88,8 +89,6 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); - if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) - set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); @@ -106,10 +105,6 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) f2fs_init_extent_tree(inode, NULL); - stat_inc_inline_xattr(inode); - stat_inc_inline_inode(inode); - stat_inc_inline_dir(inode); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -126,6 +121,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_compress_context(inode); } + /* Should enable inline_data after compression set */ + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + f2fs_set_inode_flags(inode); trace_f2fs_new_inode(inode, 0); @@ -150,7 +153,8 @@ fail_drop: return ERR_PTR(err); } -static inline int is_extension_exist(const unsigned char *s, const char *sub) +static inline int is_extension_exist(const unsigned char *s, const char *sub, + bool tmp_ext) { size_t slen = strlen(s); size_t sublen = strlen(sub); @@ -166,6 +170,13 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub) if (slen < sublen + 2) return 0; + if (!tmp_ext) { + /* file has no temp extension */ + if (s[slen - sublen - 1] != '.') + return 0; + return 
!strncasecmp(s + slen - sublen, sub, sublen); + } + for (i = 1; i < slen - sublen; i++) { if (s[i] != '.') continue; @@ -177,7 +188,7 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub) } /* - * Set multimedia files as cold files for hot/cold data separation + * Set file's temperature for hot/cold data separation */ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) @@ -185,17 +196,17 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; int i, cold_count, hot_count; - down_read(&sbi->sb_lock); + f2fs_down_read(&sbi->sb_lock); cold_count = le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; for (i = 0; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i])) + if (is_extension_exist(name, extlist[i], true)) break; } - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); if (i == cold_count + hot_count) return; @@ -276,43 +287,56 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - unsigned char (*ext)[F2FS_EXTENSION_LEN]; - unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions; + unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; + unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; int i, cold_count, hot_count; if (!f2fs_sb_has_compression(sbi) || - is_inode_flag_set(inode, FI_COMPRESSED_FILE) || F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || - !f2fs_may_compress(inode)) + !f2fs_may_compress(inode) || + (!ext_cnt && !noext_cnt)) return; - down_read(&sbi->sb_lock); + f2fs_down_read(&sbi->sb_lock); cold_count = 
le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; for (i = cold_count; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i])) { - up_read(&sbi->sb_lock); + if (is_extension_exist(name, extlist[i], false)) { + f2fs_up_read(&sbi->sb_lock); return; } } - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); - ext = F2FS_OPTION(sbi).extensions; + for (i = 0; i < noext_cnt; i++) { + if (is_extension_exist(name, noext[i], false)) { + f2fs_disable_compressed_file(inode); + return; + } + } + + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return; for (i = 0; i < ext_cnt; i++) { - if (!is_extension_exist(name, ext[i])) + if (!is_extension_exist(name, ext[i], false)) continue; + /* Do not use inline_data with compression */ + stat_dec_inline_inode(inode); + clear_inode_flag(inode, FI_INLINE_DATA); set_compress_context(inode); return; } } -static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -324,11 +348,11 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -383,7 +407,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, F2FS_I(old_dentry->d_inode)->i_projid))) return -EXDEV; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -413,9 +437,9 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { - struct qstr dotdot = QSTR_INIT("..", 2); struct page *page; - unsigned long ino = f2fs_inode_by_name(d_inode(child), 
&dotdot, &page); + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &page); + if (!ino) { if (IS_ERR(page)) return ERR_CAST(page); @@ -439,7 +463,14 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } - err = dquot_initialize(dir); + if (!S_ISDIR(dir->i_mode)) { + f2fs_err(sbi, "inconsistent inode status, skip recovering inline_dots inode (ino:%lu, i_mode:%u, pino:%u)", + dir->i_ino, dir->i_mode, pino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -ENOTDIR; + } + + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -484,7 +515,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, nid_t ino = -1; int err = 0; unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); - struct fscrypt_name fname; + struct f2fs_filename fname; trace_f2fs_lookup_start(dir, dentry, flags); @@ -493,19 +524,21 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - err = fscrypt_prepare_lookup(dir, dentry, &fname); + err = f2fs_prepare_lookup(dir, dentry, &fname); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) goto out_splice; if (err) goto out; de = __f2fs_find_entry(dir, &fname, &page); - fscrypt_free_filename(&fname); + f2fs_free_filename(&fname); if (!de) { if (IS_ERR(page)) { err = PTR_ERR(page); goto out; } + err = -ENOENT; goto out_splice; } @@ -538,7 +571,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out_iput; } out_splice: -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) if (!inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as @@ -551,7 +584,7 @@ out_splice: #endif new = d_splice_alias(inode, dentry); err = PTR_ERR_OR_ZERO(new); - trace_f2fs_lookup_end(dir, dentry, ino, err); + trace_f2fs_lookup_end(dir, dentry, ino, !new ? 
-ENOENT : err); return new; out_iput: iput(inode); @@ -566,19 +599,21 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = d_inode(dentry); struct f2fs_dir_entry *de; struct page *page; - int err = -ENOENT; + int err; trace_f2fs_unlink_enter(dir, dentry); - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto fail; + } - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) - return err; - err = dquot_initialize(inode); + goto fail; + err = f2fs_dquot_initialize(inode); if (err) - return err; + goto fail; de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { @@ -597,11 +632,11 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the - * negative dentries at f2fs_lookup(), when it is better + * negative dentries at f2fs_lookup(), when it is better * supported by the VFS for the CI case. 
*/ if (IS_CASEFOLDED(dir)) @@ -621,6 +656,7 @@ static const char *f2fs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *link = page_get_link(dentry, inode, done); + if (!IS_ERR(link) && !*link) { /* this is broken symlink case */ do_delayed_call(done); @@ -630,8 +666,8 @@ static const char *f2fs_get_link(struct dentry *dentry, return link; } -static int f2fs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -649,11 +685,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (err) return err; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; - inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); + inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -706,11 +742,12 @@ out_f2fs_handle_failed_inode: f2fs_handle_failed_inode(inode); out_free_encrypted_link: if (disk_link.name != (unsigned char *)symname) - kvfree(disk_link.name); + kfree(disk_link.name); return err; } -static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -719,18 +756,18 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; - inode = f2fs_new_inode(dir, S_IFDIR | mode); + inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - 
inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); @@ -758,13 +795,14 @@ out_fail: static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); + if (f2fs_empty_dir(inode)) return f2fs_unlink(dir, dentry); return -ENOTEMPTY; } -static int f2fs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -775,11 +813,11 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -806,22 +844,23 @@ out: return err; } -static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode, struct inode **whiteout) +static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode, bool is_whiteout, + struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; int err; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); - if (whiteout) { + if (is_whiteout) { init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); inode->i_op = &f2fs_special_inode_operations; } else { @@ -846,17 +885,25 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, f2fs_add_orphan_inode(inode); f2fs_alloc_nid_done(sbi, inode->i_ino); - if (whiteout) { + if (is_whiteout) { f2fs_i_links_write(inode, false); + + 
spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; - *whiteout = inode; + spin_unlock(&inode->i_lock); } else { - d_tmpfile(dentry, inode); + if (file) + d_tmpfile(file, inode); + else + f2fs_i_links_write(inode, false); } /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + if (new_inode) + *new_inode = inode; + f2fs_balance_fs(sbi, true); return 0; @@ -867,35 +914,41 @@ out: return err; } -static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { - int err = fscrypt_get_encryption_info(dir); - if (err) - return err; - } + err = __f2fs_tmpfile(mnt_userns, dir, file, mode, false, NULL); - return __f2fs_tmpfile(dir, dentry, mode, NULL); + return finish_open_simple(file, err); } -static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) +static int f2fs_create_whiteout(struct user_namespace *mnt_userns, + struct inode *dir, struct inode **whiteout) { if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) return -EIO; - return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); + return __f2fs_tmpfile(mnt_userns, dir, NULL, + S_IFCHR | WHITEOUT_MODE, true, whiteout); } -static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode) +{ + return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode); +} + +static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry 
*new_dentry, unsigned int flags) { struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); struct inode *old_inode = d_inode(old_dentry); @@ -933,21 +986,21 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (flags & RENAME_WHITEOUT) { - err = f2fs_create_whiteout(old_dir, &whiteout); + err = f2fs_create_whiteout(mnt_userns, old_dir, &whiteout); if (err) return err; } - err = dquot_initialize(old_dir); + err = f2fs_dquot_initialize(old_dir); if (err) goto out; - err = dquot_initialize(new_dir); + err = f2fs_dquot_initialize(new_dir); if (err) goto out; if (new_inode) { - err = dquot_initialize(new_inode); + err = f2fs_dquot_initialize(new_inode); if (err) goto out; } @@ -996,11 +1049,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, new_page = NULL; new_inode->i_ctime = current_time(new_inode); - down_write(&F2FS_I(new_inode)->i_sem); + f2fs_down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); f2fs_i_links_write(new_inode, false); - up_write(&F2FS_I(new_inode)->i_sem); + f2fs_up_write(&F2FS_I(new_inode)->i_sem); if (!new_inode->i_nlink) f2fs_add_orphan_inode(new_inode); @@ -1021,13 +1074,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, true); } - down_write(&F2FS_I(old_inode)->i_sem); + f2fs_down_write(&F2FS_I(old_inode)->i_sem); if (!old_dir_entry || whiteout) file_lost_pino(old_inode); else /* adjust dir's i_pino to pass fsck check */ f2fs_i_pino_write(old_inode, new_dir->i_ino); - up_write(&F2FS_I(old_inode)->i_sem); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); f2fs_mark_inode_dirty_sync(old_inode, false); @@ -1040,7 +1093,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; + + spin_lock(&whiteout->i_lock); whiteout->i_state &= ~I_LINKABLE; + spin_unlock(&whiteout->i_lock); 
+ iput(whiteout); } @@ -1076,8 +1133,7 @@ out_dir: out_old: f2fs_put_page(old_page, 0); out: - if (whiteout) - iput(whiteout); + iput(whiteout); return err; } @@ -1107,11 +1163,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, F2FS_I(new_dentry->d_inode)->i_projid))) return -EXDEV; - err = dquot_initialize(old_dir); + err = f2fs_dquot_initialize(old_dir); if (err) goto out; - err = dquot_initialize(new_dir); + err = f2fs_dquot_initialize(new_dir); if (err) goto out; @@ -1183,38 +1239,38 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, /* update directory entry info of old dir inode */ f2fs_set_link(old_dir, old_entry, old_page, new_inode); - down_write(&F2FS_I(old_inode)->i_sem); + f2fs_down_write(&F2FS_I(old_inode)->i_sem); if (!old_dir_entry) file_lost_pino(old_inode); else /* adjust dir's i_pino to pass fsck check */ f2fs_i_pino_write(old_inode, new_dir->i_ino); - up_write(&F2FS_I(old_inode)->i_sem); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); old_dir->i_ctime = current_time(old_dir); if (old_nlink) { - down_write(&F2FS_I(old_dir)->i_sem); + f2fs_down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); - up_write(&F2FS_I(old_dir)->i_sem); + f2fs_up_write(&F2FS_I(old_dir)->i_sem); } f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); - down_write(&F2FS_I(new_inode)->i_sem); + f2fs_down_write(&F2FS_I(new_inode)->i_sem); if (!new_dir_entry) file_lost_pino(new_inode); else /* adjust dir's i_pino to pass fsck check */ f2fs_i_pino_write(new_inode, old_dir->i_ino); - up_write(&F2FS_I(new_inode)->i_sem); + f2fs_up_write(&F2FS_I(new_inode)->i_sem); new_dir->i_ctime = current_time(new_dir); if (new_nlink) { - down_write(&F2FS_I(new_dir)->i_sem); + f2fs_down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); - up_write(&F2FS_I(new_dir)->i_sem); + 
f2fs_up_write(&F2FS_I(new_dir)->i_sem); } f2fs_mark_inode_dirty_sync(new_dir, false); @@ -1246,7 +1302,8 @@ out: return err; } -static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, +static int f2fs_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -1268,7 +1325,8 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, * VFS has already handled the new dentry existence case, * here, we just deal with "RENAME_NOREPLACE" as regular rename. */ - return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); + return f2fs_rename(mnt_userns, old_dir, old_dentry, + new_dir, new_dentry, flags); } static const char *f2fs_encrypted_get_link(struct dentry *dentry, @@ -1291,13 +1349,21 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, return target; } +static int f2fs_encrypted_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + f2fs_getattr(mnt_userns, path, stat, request_mask, query_flags); + + return fscrypt_symlink_getattr(path, stat); +} + const struct inode_operations f2fs_encrypted_symlink_inode_operations = { - .get_link = f2fs_encrypted_get_link, - .getattr = f2fs_getattr, + .get_link = f2fs_encrypted_get_link, + .getattr = f2fs_encrypted_symlink_getattr, .setattr = f2fs_setattr, -#ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, -#endif }; const struct inode_operations f2fs_dir_inode_operations = { @@ -1315,27 +1381,23 @@ const struct inode_operations f2fs_dir_inode_operations = { .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, -#ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, -#endif .fiemap = f2fs_fiemap, + .fileattr_get = f2fs_fileattr_get, + .fileattr_set = f2fs_fileattr_set, }; const struct inode_operations f2fs_symlink_inode_operations = { 
- .get_link = f2fs_get_link, + .get_link = f2fs_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, -#ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, -#endif }; const struct inode_operations f2fs_special_inode_operations = { .getattr = f2fs_getattr, - .setattr = f2fs_setattr, + .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, -#ifdef CONFIG_F2FS_FS_XATTR .listxattr = f2fs_listxattr, -#endif }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9d02cdcdbb07..983572f23896 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -8,7 +8,7 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/mpage.h> -#include <linux/backing-dev.h> +#include <linux/sched/mm.h> #include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/swap.h> @@ -17,7 +17,7 @@ #include "node.h" #include "segment.h" #include "xattr.h" -#include "trace.h" +#include "iostat.h" #include <trace/events/f2fs.h> #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) @@ -36,6 +36,7 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } return 0; @@ -44,11 +45,15 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) { struct f2fs_nm_info *nm_i = NM_I(sbi); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct sysinfo val; unsigned long avail_ram; unsigned long mem_size = 0; bool res = false; + if (!nm_i) + return true; + si_meminfo(&val); /* only uses low memory */ @@ -62,8 +67,8 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { - mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> - PAGE_SHIFT; + 
mem_size = (nm_i->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); if (excess_cached_nats(sbi)) res = false; @@ -86,10 +91,24 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == INMEM_PAGES) { - /* it allows 20% / total_ram for inmemory pages */ - mem_size = get_pages(sbi, F2FS_INMEM_PAGES); - res = mem_size < (val.totalram / 5); + } else if (type == DISCARD_CACHE) { + mem_size = (atomic_read(&dcc->discard_cmd_cnt) * + sizeof(struct discard_cmd)) >> PAGE_SHIFT; + res = mem_size < (avail_ram * nm_i->ram_thresh / 100); + } else if (type == COMPRESS_PAGE) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned long free_ram = val.freeram; + + /* + * free memory is lower than watermark or cached page count + * exceed threshold, deny caching compress page. 
+ */ + res = (free_ram > avail_ram * sbi->compress_watermark / 100) && + (COMPRESS_MAPPING(sbi)->nrpages < + free_ram * sbi->compress_percent / 100); +#else + res = false; +#endif } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; @@ -109,7 +128,7 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { - return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid)); + return f2fs_get_meta_page_retry(sbi, current_nat_addr(sbi, nid)); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) @@ -141,14 +160,13 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) +static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi, + nid_t nid, bool no_fail) { struct nat_entry *new; - if (no_fail) - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); - else - new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_F2FS_ZERO, no_fail, sbi); if (new) { nat_set_nid(new, nid); nat_reset_flag(new); @@ -177,7 +195,8 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, list_add_tail(&ne->list, &nm_i->nat_entries); spin_unlock(&nm_i->nat_list_lock); - nm_i->nat_cnt++; + nm_i->nat_cnt[TOTAL_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; return ne; } @@ -207,7 +226,8 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) { radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); - nm_i->nat_cnt--; + nm_i->nat_cnt[TOTAL_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; __free_nat_entry(e); } @@ -219,7 +239,8 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { - head = 
f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, + GFP_NOFS, true, NULL); INIT_LIST_HEAD(&head->entry_list); INIT_LIST_HEAD(&head->set_list); @@ -253,7 +274,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, if (get_nat_flag(ne, IS_DIRTY)) goto refresh_list; - nm_i->dirty_nat_cnt++; + nm_i->nat_cnt[DIRTY_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; set_nat_flag(ne, IS_DIRTY, true); refresh_list: spin_lock(&nm_i->nat_list_lock); @@ -273,7 +295,8 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, set_nat_flag(ne, IS_DIRTY, false); set->entry_cnt--; - nm_i->dirty_nat_cnt--; + nm_i->nat_cnt[DIRTY_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -304,7 +327,8 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, unsigned long flags; unsigned int seq_id; - fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS); + fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, + GFP_NOFS, true, NULL); get_page(page); fn->page = page; @@ -355,14 +379,14 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool need = false; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) need = true; } - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return need; } @@ -372,11 +396,11 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool is_cp = true; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return is_cp; } @@ -386,13 +410,13 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, 
nid_t ino) struct nat_entry *e; bool need_update = true; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ino); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) need_update = false; - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return need_update; } @@ -403,11 +427,15 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *new, *e; - new = __alloc_nat_entry(nid, false); + /* Let's mitigate lock contention of nat_tree_lock during checkpoint */ + if (f2fs_rwsem_is_locked(&sbi->cp_global_sem)) + return; + + new = __alloc_nat_entry(sbi, nid, false); if (!new) return; - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) e = __init_nat_entry(nm_i, new, ne, false); @@ -416,7 +444,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, nat_get_blkaddr(e) != le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); if (e != new) __free_nat_entry(new); } @@ -426,9 +454,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - struct nat_entry *new = __alloc_nat_entry(ni->nid, true); + struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { e = __init_nat_entry(nm_i, new, NULL, true); @@ -459,6 +487,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, /* increment version no as node is removed */ if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); + nat_set_version(e, inc_node_version(version)); } @@ 
-476,7 +505,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, HAS_FSYNCED_INODE, true); set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); } - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); } int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) @@ -484,7 +513,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; - if (!down_write_trylock(&nm_i->nat_tree_lock)) + if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock)) return 0; spin_lock(&nm_i->nat_list_lock); @@ -506,15 +535,12 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) } spin_unlock(&nm_i->nat_list_lock); - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } -/* - * This function always returns success - */ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni) + struct node_info *ni, bool checkpoint_context) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -529,22 +555,32 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, int i; ni->nid = nid; - +retry: /* Check nat cache */ - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return 0; } - memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + /* + * Check current segment summary by trying to grab journal_rwsem first. + * This sem is on the critical path on the checkpoint requiring the above + * nat_tree_lock. Therefore, we should retry, if we failed to grab here + * while not bothering checkpoint. 
+ */ + if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { + down_read(&curseg->journal_rwsem); + } else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) || + !down_read_trylock(&curseg->journal_rwsem)) { + f2fs_up_read(&nm_i->nat_tree_lock); + goto retry; + } - /* Check current segment summary */ - down_read(&curseg->journal_rwsem); i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { ne = nat_in_journal(journal, i); @@ -552,13 +588,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, } up_read(&curseg->journal_rwsem); if (i >= 0) { - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); goto cache; } /* Fill node_info from nat page */ index = current_nat_addr(sbi, nid); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); page = f2fs_get_meta_page(sbi, index); if (IS_ERR(page)) @@ -621,10 +657,10 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) switch (dn->max_level) { case 3: base += 2 * indirect_blks; - /* fall through */ + fallthrough; case 2: base += 2 * direct_blks; - /* fall through */ + fallthrough; case 1: base += direct_index; break; @@ -716,8 +752,7 @@ got: /* * Caller should call f2fs_put_dnode(dn). * Also, it should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op() only if ro is not set RDONLY_NODE. - * In the case of RDONLY_NODE, we don't need to care about mutex. + * f2fs_unlock_op() only if mode is set with ALLOC_NODE. 
*/ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { @@ -809,8 +844,27 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = f2fs_data_blkaddr(dn); + + if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) && + f2fs_sb_has_readonly(sbi)) { + unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn); + block_t blkaddr; + + if (!c_len) + goto out; + + blkaddr = f2fs_data_blkaddr(dn); + if (blkaddr == COMPRESS_ADDR) + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + 1); + + f2fs_update_extent_tree_range_compressed(dn->inode, + index, blkaddr, + F2FS_I(dn->inode)->i_cluster_size, + c_len); + } +out: return 0; release_pages: @@ -835,7 +889,7 @@ static int truncate_node(struct dnode_of_data *dn) int err; pgoff_t index; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; @@ -1046,8 +1100,10 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); - if (level < 0) + if (level < 0) { + trace_f2fs_truncate_inode_blocks_exit(inode, level); return level; + } page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -1188,8 +1244,9 @@ int f2fs_remove_inode_page(struct inode *inode) } if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) { - f2fs_warn(F2FS_I_SB(inode), "Inconsistent i_blocks, ino:%lu, iblocks:%llu", - inode->i_ino, (unsigned long long)inode->i_blocks); + f2fs_warn(F2FS_I_SB(inode), + "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu", + inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); } @@ -1231,12 +1288,17 @@ struct page 
*f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - err = f2fs_get_node_info(sbi, dn->nid, &new_ni); + err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false); if (err) { dec_valid_node_count(sbi, dn->inode, !ofs); goto fail; } - f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); + if (unlikely(new_ni.blk_addr != NULL_ADDR)) { + err = -EFSCORRUPTED; + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + goto fail; + } #endif new_ni.nid = dn->nid; new_ni.ino = dn->inode->i_ino; @@ -1271,7 +1333,7 @@ fail: * 0: f2fs_put_page(page, 0) * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ -static int read_node_page(struct page *page, int op_flags) +static int read_node_page(struct page *page, blk_opf_t op_flags) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; @@ -1293,18 +1355,25 @@ static int read_node_page(struct page *page, int op_flags) return LOCKED_PAGE; } - err = f2fs_get_node_info(sbi, page->index, &ni); + err = f2fs_get_node_info(sbi, page->index, &ni, false); if (err) return err; - if (unlikely(ni.blk_addr == NULL_ADDR) || + /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) || is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { ClearPageUptodate(page); return -ENOENT; } fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr; - return f2fs_submit_page_bio(&fio); + + err = f2fs_submit_page_bio(&fio); + + if (!err) + f2fs_update_iostat(sbi, NULL, FS_NODE_READ_IO, F2FS_BLKSIZE); + + return err; } /* @@ -1349,8 +1418,7 @@ repeat: err = read_node_page(page, 0); if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + goto out_put_err; } else if (err == LOCKED_PAGE) { err = 0; goto page_hit; @@ -1376,18 +1444,23 @@ repeat: goto out_err; } page_hit: - if(unlikely(nid != nid_of_node(page))) { - f2fs_warn(sbi, "inconsistent node block, nid:%lu, 
node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + if (likely(nid == nid_of_node(page))) + return page; + + f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - err = -EINVAL; + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; out_err: - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(err); - } - return page; + ClearPageUptodate(page); +out_put_err: + /* ENOENT comes from read_node_page which is not an error. */ + if (err != -ENOENT) + f2fs_handle_page_eio(sbi, page->index, NODE); + f2fs_put_page(page, 1); + return ERR_PTR(err); } struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1518,8 +1591,12 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, trace_f2fs_writepage(page, NODE); - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); + return 0; + } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; @@ -1533,21 +1610,21 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); - if (f2fs_get_node_info(sbi, nid, &ni)) + if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) goto redirty_out; if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) + if (!f2fs_down_read_trylock(&sbi->node_write)) goto redirty_out; } else { - down_read(&sbi->node_write); + f2fs_down_read(&sbi->node_write); } /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); unlock_page(page); return 0; } @@ -1555,27 +1632,28 @@ static int __write_node_page(struct 
page *page, bool atomic, bool *submitted, if (__is_valid_data_blkaddr(ni.blk_addr) && !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC_ENHANCE)) { - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); goto redirty_out; } - if (atomic && !test_opt(sbi, NOBARRIER)) + if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; - set_page_writeback(page); - ClearPageError(page); - + /* should add to global list before clearing PAGECACHE status */ if (f2fs_in_warm_node_list(sbi, page)) { seq = f2fs_add_fsync_node_entry(sbi, page); if (seq_id) *seq_id = seq; } + set_page_writeback(page); + ClearPageError(page); + fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE); @@ -1709,6 +1787,7 @@ continue_unlock: if (!atomic || page == last_page) { set_fsync_mark(page, 1); + percpu_counter_inc(&sbi->rf_node_block_count); if (IS_INODE(page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) @@ -1716,7 +1795,7 @@ continue_unlock: set_dentry_mark(page, f2fs_need_dentry_mark(sbi, ino)); } - /* may be written by other thread */ + /* may be written by other thread */ if (!PageDirty(page)) set_page_dirty(page); } @@ -1760,7 +1839,7 @@ continue_unlock: out: if (nwritten) f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE); - return ret ? -EIO: 0; + return ret ? 
-EIO : 0; } static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) @@ -1804,6 +1883,51 @@ static bool flush_dirty_inode(struct page *page) return true; } +void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) +{ + pgoff_t index = 0; + struct pagevec pvec; + int nr_pages; + + pagevec_init(&pvec); + + while ((nr_pages = pagevec_lookup_tag(&pvec, + NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (!IS_DNODE(page)) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + /* flush inline_data, if it's async context. */ + if (page_private_inline(page)) { + clear_page_private_inline(page); + unlock_page(page); + flush_inline_data(sbi, ino_of_node(page)); + continue; + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) @@ -1827,7 +1951,6 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; bool submitted = false; - bool may_dirty = true; /* give a priority to WB_SYNC threads */ if (atomic_read(&sbi->wb_sync_req[NODE]) && @@ -1867,21 +1990,22 @@ continue_unlock: goto continue_unlock; } + /* flush inline_data/inode, if it's async context. 
*/ + if (!do_balance) + goto write_node; + /* flush inline_data */ - if (is_inline_node(page)) { - clear_inline_node(page); + if (page_private_inline(page)) { + clear_page_private_inline(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); goto lock_node; } /* flush dirty inode */ - if (IS_INODE(page) && may_dirty) { - may_dirty = false; - if (flush_dirty_inode(page)) - goto lock_node; - } - + if (IS_INODE(page) && flush_dirty_inode(page)) + goto lock_node; +write_node: f2fs_wait_on_page_writeback(page, NODE, true, true); if (!clear_page_dirty_for_io(page)) @@ -1979,7 +2103,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, goto skip_write; /* balancing f2fs's metadata in background */ - f2fs_balance_fs_bg(sbi); + f2fs_balance_fs_bg(sbi, true); /* collect a number of dirty node pages and write together */ if (wbc->sync_mode != WB_SYNC_ALL && @@ -1989,8 +2113,12 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[NODE]); - else if (atomic_read(&sbi->wb_sync_req[NODE])) + else if (atomic_read(&sbi->wb_sync_req[NODE])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); goto skip_write; + } trace_f2fs_writepages(mapping->host, wbc, NODE); @@ -2010,24 +2138,23 @@ skip_write: return 0; } -static int f2fs_set_node_page_dirty(struct page *page) +static bool f2fs_dirty_node_folio(struct address_space *mapping, + struct folio *folio) { - trace_f2fs_set_page_dirty(page, NODE); + trace_f2fs_set_page_dirty(&folio->page, NODE); - if (!PageUptodate(page)) - SetPageUptodate(page); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); #ifdef CONFIG_F2FS_CHECK_FS - if (IS_INODE(page)) - f2fs_inode_chksum_set(F2FS_P_SB(page), page); + if (IS_INODE(&folio->page)) + f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page); #endif - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); - inc_page_count(F2FS_P_SB(page), 
F2FS_DIRTY_NODES); - f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); - return 1; + if (filemap_dirty_folio(mapping, folio)) { + inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); + set_page_private_reference(&folio->page); + return true; } - return 0; + return false; } /* @@ -2036,12 +2163,10 @@ static int f2fs_set_node_page_dirty(struct page *page) const struct address_space_operations f2fs_node_aops = { .writepage = f2fs_write_node_page, .writepages = f2fs_write_node_pages, - .set_page_dirty = f2fs_set_node_page_dirty, - .invalidatepage = f2fs_invalidate_page, - .releasepage = f2fs_release_page, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif + .dirty_folio = f2fs_dirty_node_folio, + .invalidate_folio = f2fs_invalidate_folio, + .release_folio = f2fs_release_folio, + .migrate_folio = filemap_migrate_folio, }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, @@ -2051,18 +2176,16 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, } static int __insert_free_nid(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_state state) + struct free_nid *i) { struct f2fs_nm_info *nm_i = NM_I(sbi); - int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) return err; - f2fs_bug_on(sbi, state != i->state); - nm_i->nid_cnt[state]++; - if (state == FREE_NID) - list_add_tail(&i->list, &nm_i->free_nid_list); + nm_i->nid_cnt[FREE_NID]++; + list_add_tail(&i->list, &nm_i->free_nid_list); return 0; } @@ -2100,6 +2223,24 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } +bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i; + bool ret = true; + + f2fs_down_read(&nm_i->nat_tree_lock); + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) { + ret = false; + break; + } + } + f2fs_up_read(&nm_i->nat_tree_lock); + + return ret; +} + static void 
update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set, bool build) { @@ -2141,7 +2282,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, if (unlikely(f2fs_check_nid_range(sbi, nid))) return false; - i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL); i->nid = nid; i->state = FREE_NID; @@ -2184,7 +2325,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, } } ret = true; - err = __insert_free_nid(sbi, i, FREE_NID); + err = __insert_free_nid(sbi, i); err_out: if (update) { update_free_nid_bitmap(sbi, nid, ret, build); @@ -2278,7 +2419,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) unsigned int i, idx; nid_t nid; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); for (i = 0; i < nm_i->nat_blocks; i++) { if (!test_bit_le(i, nm_i->nat_block_bitmap)) @@ -2301,7 +2442,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) out: scan_curseg_cache(sbi); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); } static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, @@ -2314,6 +2455,9 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, if (unlikely(nid >= nm_i->max_nid)) nid = 0; + if (unlikely(nid % NAT_ENTRY_PER_BLOCK)) + nid = NAT_BLOCK_OFFSET(nid) * NAT_ENTRY_PER_BLOCK; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return 0; @@ -2333,7 +2477,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); while (1) { if (!test_bit_le(NAT_BLOCK_OFFSET(nid), @@ -2348,7 +2492,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, } if (ret) { - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); return ret; } @@ -2368,7 +2512,7 @@ static int 
__f2fs_build_free_nids(struct f2fs_sb_info *sbi, /* find free nids from current sum_pages */ scan_curseg_cache(sbi); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -2485,7 +2629,6 @@ void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i, *next; int nr = nr_shrink; if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) @@ -2494,23 +2637,29 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || - nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) - break; + while (nr_shrink && nm_i->nid_cnt[FREE_NID] > MAX_FREE_NIDS) { + struct free_nid *i, *next; + unsigned int batch = SHRINK_NID_BATCH_SIZE; - __remove_free_nid(sbi, i, FREE_NID); - kmem_cache_free(free_nid_slab, i); - nr_shrink--; + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { + if (!nr_shrink || !batch || + nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) + break; + __remove_free_nid(sbi, i, FREE_NID); + kmem_cache_free(free_nid_slab, i); + nr_shrink--; + batch--; + } + spin_unlock(&nm_i->nid_list_lock); } - spin_unlock(&nm_i->nid_list_lock); + mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; } -void f2fs_recover_inline_xattr(struct inode *inode, struct page *page) +int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; size_t inline_size; @@ -2518,13 +2667,20 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page) struct f2fs_inode *ri; ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); - f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage)); + if (IS_ERR(ipage)) + 
return PTR_ERR(ipage); ri = F2FS_INODE(page); if (ri->i_inline & F2FS_INLINE_XATTR) { - set_inode_flag(inode, FI_INLINE_XATTR); + if (!f2fs_has_inline_xattr(inode)) { + set_inode_flag(inode, FI_INLINE_XATTR); + stat_inc_inline_xattr(inode); + } } else { - clear_inode_flag(inode, FI_INLINE_XATTR); + if (f2fs_has_inline_xattr(inode)) { + stat_dec_inline_xattr(inode); + clear_inode_flag(inode, FI_INLINE_XATTR); + } goto update_inode; } @@ -2537,6 +2693,7 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page) update_inode: f2fs_update_inode(inode, ipage); f2fs_put_page(ipage, 1); + return 0; } int f2fs_recover_xattr_data(struct inode *inode, struct page *page) @@ -2553,7 +2710,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page) goto recover_xnid; /* 1: invalidate the previous xattr nid */ - err = f2fs_get_node_info(sbi, prev_xnid, &ni); + err = f2fs_get_node_info(sbi, prev_xnid, &ni, false); if (err) return err; @@ -2593,7 +2750,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) struct page *ipage; int err; - err = f2fs_get_node_info(sbi, ino, &old_ni); + err = f2fs_get_node_info(sbi, ino, &old_ni, false); if (err) return err; @@ -2602,7 +2759,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) retry: ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); if (!ipage) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_NOFS); goto retry; } @@ -2617,7 +2774,7 @@ retry: src = F2FS_INODE(page); dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); dst->i_size = 0; dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); @@ -2670,7 +2827,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, BIO_MAX_PAGES); + nrpages = 
bio_max_segs(last_offset - i); /* readahead node pages */ f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true); @@ -2708,11 +2865,14 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) struct f2fs_nat_entry raw_ne; nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); + if (f2fs_check_nid_range(sbi, nid)) + continue; + raw_ne = nat_in_journal(journal, i); ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = __alloc_nat_entry(nid, true); + ne = __alloc_nat_entry(sbi, nid, true); __init_nat_entry(nm_i, ne, &raw_ne, true); } @@ -2752,7 +2912,23 @@ add_out: list_add_tail(&nes->set_list, head); } -static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs, + unsigned int valid) +{ + if (valid == 0) { + __set_bit_le(nat_ofs, nm_i->empty_nat_bits); + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_ofs, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_ofs, nm_i->full_nat_bits); + else + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); +} + +static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -2761,7 +2937,7 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, int valid = 0; int i = 0; - if (!enabled_nat_bits(sbi, NULL)) + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) return; if (nat_index == 0) { @@ -2772,17 +2948,36 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) valid++; } - if (valid == 0) { - __set_bit_le(nat_index, nm_i->empty_nat_bits); - __clear_bit_le(nat_index, nm_i->full_nat_bits); - return; + + __update_nat_bits(nm_i, nat_index, valid); +} + +void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs; + + f2fs_down_read(&nm_i->nat_tree_lock); + + 
for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { + unsigned int valid = 0, nid_ofs = 0; + + /* handle nid zero due to it should never be used */ + if (unlikely(nat_ofs == 0)) { + valid = 1; + nid_ofs = 1; + } + + for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) { + if (!test_bit_le(nid_ofs, + nm_i->free_nid_bitmap[nat_ofs])) + valid++; + } + + __update_nat_bits(nm_i, nat_ofs, valid); } - __clear_bit_le(nat_index, nm_i->empty_nat_bits); - if (valid == NAT_ENTRY_PER_BLOCK) - __set_bit_le(nat_index, nm_i->full_nat_bits); - else - __clear_bit_le(nat_index, nm_i->full_nat_bits); + f2fs_up_read(&nm_i->nat_tree_lock); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -2801,7 +2996,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (enabled_nat_bits(sbi, cpc) || + if ((cpc->reason & CP_UMOUNT) || !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; @@ -2848,7 +3043,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - __update_nat_bits(sbi, start_nid, page); + update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); } @@ -2875,30 +3070,35 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) LIST_HEAD(sets); int err = 0; - /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ - if (enabled_nat_bits(sbi, cpc)) { - down_write(&nm_i->nat_tree_lock); + /* + * during unmount, let's flush nat_bits before checking + * nat_cnt[DIRTY_NAT]. 
+ */ + if (cpc->reason & CP_UMOUNT) { + f2fs_down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); } - if (!nm_i->dirty_nat_cnt) + if (!nm_i->nat_cnt[DIRTY_NAT]) return 0; - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); /* * if there are no enough space in journal to store dirty nat * entries, remove all entries from journal and merge them * into nat entry set. */ - if (enabled_nat_bits(sbi, cpc) || - !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (cpc->reason & CP_UMOUNT || + !__has_cursum_space(journal, + nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; + set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, @@ -2912,7 +3112,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; } - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); /* Allow dirty nats by node block allocation in write_begin */ return err; @@ -2927,15 +3127,18 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) __u64 cp_ver = cur_cp_version(ckpt); block_t nat_bits_addr; - if (!enabled_nat_bits(sbi, NULL)) - return 0; - nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); - nm_i->nat_bits = f2fs_kzalloc(sbi, + nm_i->nat_bits = f2fs_kvzalloc(sbi, nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + return 0; + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { @@ -2952,13 +3155,12 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) cp_ver |= 
(cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { - disable_nat_bits(sbi, true); + clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)", + cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits)); return 0; } - nm_i->full_nat_bits = nm_i->nat_bits + 8; - nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; - f2fs_notice(sbi, "Found nat_bits in checkpoint"); return 0; } @@ -2969,7 +3171,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) unsigned int i = 0; nid_t nid, last_nid; - if (!enabled_nat_bits(sbi, NULL)) + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) return; for (i = 0; i < nm_i->nat_blocks; i++) { @@ -3017,10 +3219,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi) F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID] = 0; nm_i->nid_cnt[PREALLOC_NID] = 0; - nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; + nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); @@ -3031,14 +3233,11 @@ static int init_node_manager(struct f2fs_sb_info *sbi) mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->nid_list_lock); - init_rwsem(&nm_i->nat_tree_lock); + init_f2fs_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); - if (!version_bitmap) - return -EFAULT; - nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size, GFP_KERNEL); if (!nm_i->nat_bitmap) @@ -3064,9 +3263,9 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) int i; nm_i->free_nid_bitmap = - f2fs_kzalloc(sbi, array_size(sizeof(unsigned char *), - nm_i->nat_blocks), - GFP_KERNEL); + f2fs_kvzalloc(sbi, array_size(sizeof(unsigned char *), + nm_i->nat_blocks), + 
GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; @@ -3140,7 +3339,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; @@ -3154,7 +3353,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) __del_from_nat_cache(nm_i, natvec[idx]); } } - f2fs_bug_on(sbi, nm_i->nat_cnt); + f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]); /* destroy nat set cache */ nid = 0; @@ -3170,7 +3369,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) kmem_cache_free(nat_entry_set_slab, setvec[idx]); } } - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); kvfree(nm_i->nat_block_bitmap); if (nm_i->free_nid_bitmap) { @@ -3188,27 +3387,27 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) kvfree(nm_i->nat_bitmap_mir); #endif sbi->nm_info = NULL; - kvfree(nm_i); + kfree(nm_i); } int __init f2fs_create_node_manager_caches(void) { - nat_entry_slab = f2fs_kmem_cache_create("nat_entry", + nat_entry_slab = f2fs_kmem_cache_create("f2fs_nat_entry", sizeof(struct nat_entry)); if (!nat_entry_slab) goto fail; - free_nid_slab = f2fs_kmem_cache_create("free_nid", + free_nid_slab = f2fs_kmem_cache_create("f2fs_free_nid", sizeof(struct free_nid)); if (!free_nid_slab) goto destroy_nat_entry; - nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set", + nat_entry_set_slab = f2fs_kmem_cache_create("f2fs_nat_entry_set", sizeof(struct nat_entry_set)); if (!nat_entry_set_slab) goto destroy_free_nid; - fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry", + fsync_node_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_node_entry", sizeof(struct fsync_node_entry)); if (!fsync_node_entry_slab) goto destroy_nat_entry_set; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e05af5df5648..3c09cae058b0 100644 --- a/fs/f2fs/node.h 
+++ b/fs/f2fs/node.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/node.h * @@ -15,6 +15,9 @@ #define FREE_NID_PAGES 8 #define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) +/* size of free nid batch when shrinking */ +#define SHRINK_NID_BATCH_SIZE 8 + #define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data blocks */ @@ -28,6 +31,9 @@ /* control total # of nats */ #define DEF_NAT_CACHE_THRESHOLD 100000 +/* control total # of node writes used for roll-fowrad recovery */ +#define DEF_RF_NODE_BLOCKS 0 + /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 #define SETVEC_SIZE 32 @@ -35,6 +41,9 @@ /* return value for read_node_page */ #define LOCKED_PAGE 1 +/* check pinned file's alignment status of physical blocks */ +#define FILE_NOT_ALIGNED 1 + /* For flag in struct node_info */ enum { IS_CHECKPOINTED, /* is it checkpointed before? 
*/ @@ -123,18 +132,13 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * + return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid * NM_I(sbi)->dirty_nats_ratio / 100; } static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; -} - -static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) -{ - return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8; + return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; } enum mem_type { @@ -143,7 +147,8 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ - INMEM_PAGES, /* indicates inmemory pages */ + DISCARD_CACHE, /* indicates memory of cached discard cmds */ + COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ }; @@ -385,20 +390,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ -static inline int is_cold_data(struct page *page) -{ - return PageChecked(page); -} - -static inline void set_cold_data(struct page *page) -{ - SetPageChecked(page); -} - -static inline void clear_cold_data(struct page *page) -{ - ClearPageChecked(page); -} static inline int is_node(struct page *page, int type) { @@ -410,21 +401,6 @@ static inline int is_node(struct page *page, int type) #define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) #define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) -static inline int is_inline_node(struct page *page) -{ - return PageChecked(page); -} - -static inline void set_inline_node(struct page *page) -{ - SetPageChecked(page); -} - -static inline void clear_inline_node(struct page *page) -{ - 
ClearPageChecked(page); -} - static inline void set_cold_node(struct page *page, bool is_dir) { struct f2fs_node *rn = F2FS_NODE(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 763d5c0951d1..dea95b48b647 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -5,8 +5,10 @@ * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ +#include <asm/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" #include "segment.h" @@ -44,12 +46,20 @@ static struct kmem_cache *fsync_entry_slab; +#if IS_ENABLED(CONFIG_UNICODE) +extern struct kmem_cache *f2fs_cf_name_slab; +#endif + bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) { s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; + if (NM_I(sbi)->max_rf_node_blocks && + percpu_counter_sum_positive(&sbi->rf_node_block_count) >= + NM_I(sbi)->max_rf_node_blocks) + return false; return true; } @@ -76,7 +86,7 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, if (IS_ERR(inode)) return ERR_CAST(inode); - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) goto err_out; @@ -86,7 +96,8 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, goto err_out; } - entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, + GFP_F2FS_ZERO, true, NULL); entry->inode = inode; list_add_tail(&entry->list, head); @@ -107,13 +118,60 @@ static void del_fsync_inode(struct fsync_inode_entry *entry, int drop) kmem_cache_free(fsync_entry_slab, entry); } +static int init_recovered_filename(const struct inode *dir, + struct f2fs_inode *raw_inode, + struct f2fs_filename *fname, + struct qstr *usr_fname) +{ + int err; + + memset(fname, 0, sizeof(*fname)); + fname->disk_name.len = 
le32_to_cpu(raw_inode->i_namelen); + fname->disk_name.name = raw_inode->i_name; + + if (WARN_ON(fname->disk_name.len > F2FS_NAME_LEN)) + return -ENAMETOOLONG; + + if (!IS_ENCRYPTED(dir)) { + usr_fname->name = fname->disk_name.name; + usr_fname->len = fname->disk_name.len; + fname->usr_fname = usr_fname; + } + + /* Compute the hash of the filename */ + if (IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir)) { + /* + * In this case the hash isn't computable without the key, so it + * was saved on-disk. + */ + if (fname->disk_name.len + sizeof(f2fs_hash_t) > F2FS_NAME_LEN) + return -EINVAL; + fname->hash = get_unaligned((f2fs_hash_t *) + &raw_inode->i_name[fname->disk_name.len]); + } else if (IS_CASEFOLDED(dir)) { + err = f2fs_init_casefolded_name(dir, fname); + if (err) + return err; + f2fs_hash_filename(dir, fname); +#if IS_ENABLED(CONFIG_UNICODE) + /* Case-sensitive match is fine for recovery */ + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); + fname->cf_name.name = NULL; +#endif + } else { + f2fs_hash_filename(dir, fname); + } + return 0; +} + static int recover_dentry(struct inode *inode, struct page *ipage, struct list_head *dir_list) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; - struct fscrypt_name fname; + struct f2fs_filename fname; + struct qstr usr_fname; struct page *page; struct inode *dir, *einode; struct fsync_inode_entry *entry; @@ -132,16 +190,9 @@ static int recover_dentry(struct inode *inode, struct page *ipage, } dir = entry->inode; - - memset(&fname, 0, sizeof(struct fscrypt_name)); - fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen); - fname.disk_name.name = raw_inode->i_name; - - if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) { - WARN_ON(1); - err = -ENAMETOOLONG; + err = init_recovered_filename(dir, raw_inode, &fname, &usr_fname); + if (err) goto out; - } retry: de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) @@ 
-157,7 +208,7 @@ retry: goto out_put; } - err = dquot_initialize(einode); + err = f2fs_dquot_initialize(einode); if (err) { iput(einode); goto out_put; @@ -204,18 +255,18 @@ static int recover_quota_data(struct inode *inode, struct page *page) memset(&attr, 0, sizeof(attr)); - attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid); - attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid); + attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); + attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid)); - if (!uid_eq(attr.ia_uid, inode->i_uid)) + if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode))) attr.ia_valid |= ATTR_UID; - if (!gid_eq(attr.ia_gid, inode->i_gid)) + if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode))) attr.ia_valid |= ATTR_GID; if (!attr.ia_valid) return 0; - err = dquot_transfer(inode, &attr); + err = dquot_transfer(&init_user_ns, inode, &attr); if (err) set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); return err; @@ -296,6 +347,19 @@ static int recover_inode(struct inode *inode, struct page *page) return 0; } +static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi, + unsigned int ra_blocks, unsigned int blkaddr, + unsigned int next_blkaddr) +{ + if (blkaddr + 1 == next_blkaddr) + ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS, + ra_blocks * 2); + else if (next_blkaddr % sbi->blocks_per_seg) + ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS, + ra_blocks / 2); + return ra_blocks; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, bool check_only) { @@ -303,6 +367,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, struct page *page = NULL; block_t blkaddr; unsigned int loop_cnt = 0; + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg - valid_user_blocks(sbi); int err = 0; @@ -377,11 +442,14 @@ next: break; } + 
ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, + next_blkaddr_of_node(page)); + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); - f2fs_ra_meta_pages_cond(sbi, blkaddr); + f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); } return err; } @@ -406,7 +474,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct dnode_of_data tdn = *dn; nid_t ino, nid; struct inode *inode; - unsigned int offset; + unsigned int offset, ofs_in_node, max_addrs; block_t bidx; int i; @@ -417,6 +485,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, /* Get the previous summary */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; goto got_it; @@ -432,15 +501,25 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); + ofs_in_node = le16_to_cpu(sum.ofs_in_node); + + max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode); + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", + ofs_in_node, dn->inode->i_ino, nid, max_addrs); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY); + return -EFSCORRUPTED; + } + if (dn->inode->i_ino == nid) { tdn.nid = nid; if (!dn->inode_page_locked) lock_page(dn->inode_page); tdn.node_page = dn->inode_page; - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + tdn.ofs_in_node = ofs_in_node; goto truncate_out; } else if (dn->nid == nid) { - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + tdn.ofs_in_node = ofs_in_node; goto truncate_out; } @@ -461,7 +540,7 @@ got_it: if (IS_ERR(inode)) return PTR_ERR(inode); - ret = dquot_initialize(inode); + ret = f2fs_dquot_initialize(inode); if (ret) { iput(inode); return ret; @@ -496,8 +575,7 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.inode, tdn.node_page, - 
tdn.ofs_in_node) == blkaddr) + if (f2fs_data_blkaddr(&tdn) == blkaddr) f2fs_truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -514,7 +592,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, /* step 1: recover xattr */ if (IS_INODE(page)) { - f2fs_recover_inline_xattr(inode, page); + err = f2fs_recover_inline_xattr(inode, page); + if (err) + goto out; } else if (f2fs_has_xattr_block(ofs_of_node(page))) { err = f2fs_recover_xattr_data(inode, page); if (!err) @@ -523,8 +603,12 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } /* step 2: recover inline data */ - if (f2fs_recover_inline_data(inode, page)) + err = f2fs_recover_inline_data(inode, page); + if (err) { + if (err == 1) + err = 0; goto out; + } /* step 3: recover data indices */ start = f2fs_start_bidx_of_node(ofs_of_node(page), inode); @@ -535,7 +619,7 @@ retry_dn: err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_NOFS); goto retry_dn; } goto out; @@ -543,7 +627,7 @@ retry_dn: f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - err = f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) goto err; @@ -554,24 +638,27 @@ retry_dn: inode->i_ino, ofs_of_node(dn.node_page), ofs_of_node(page)); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); goto err; } for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - dest = datablock_addr(dn.inode, page, dn.ofs_in_node); + src = f2fs_data_blkaddr(&dn); + dest = data_blkaddr(dn.inode, page, dn.ofs_in_node); if (__is_valid_data_blkaddr(src) && !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } if 
(__is_valid_data_blkaddr(dest) && !f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } @@ -618,12 +705,22 @@ retry_prev: err = check_index_in_prev_nodes(sbi, dest, &dn); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_NOFS); goto retry_prev; } goto err; } + if (f2fs_is_valid_blkaddr(sbi, dest, + DATA_GENERIC_ENHANCE_UPDATE)) { + f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", + dest, inode->i_ino, dn.ofs_in_node); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); + goto err; + } + /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, ni.version, false, false); @@ -651,6 +748,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, struct page *page = NULL; int err = 0; block_t blkaddr; + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -662,8 +760,6 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) break; - f2fs_ra_meta_pages_cond(sbi, blkaddr); - page = f2fs_get_tmp_page(sbi, blkaddr); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -706,12 +802,17 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, if (entry->blkaddr == blkaddr) list_move_tail(&entry->list, tmp_inode_list); next: + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, + next_blkaddr_of_node(page)); + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); + + f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); } if (!err) - f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE); + f2fs_allocate_new_segments(sbi); return err; } @@ -734,25 +835,16 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work 
correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif - fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", - sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) { - err = -ENOMEM; - goto out; - } - INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&tmp_inode_list); INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ - mutex_lock(&sbi->cp_mutex); + f2fs_down_write(&sbi->cp_global_sem); /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only); @@ -770,10 +862,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); - else { - /* restore s_flags to let iput() trash data */ - sbi->sb->s_flags = s_flags; - } + else + f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE); skip: fix_curseg_write_pointer = !check_only || list_empty(&inode_list); @@ -803,7 +893,7 @@ skip: if (!err) clear_sbi_flag(sbi, SBI_POR_DOING); - mutex_unlock(&sbi->cp_mutex); + f2fs_up_write(&sbi->cp_global_sem); /* let's drop all the directory inodes for clean checkpoint */ destroy_fsync_dnodes(&dir_list, err); @@ -819,8 +909,6 @@ skip: } } - kmem_cache_destroy(fsync_entry_slab); -out: #ifdef CONFIG_QUOTA /* Turn quotas off */ if (quota_enabled) @@ -828,5 +916,19 @@ out: #endif sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ - return ret ? ret: err; + return ret ? 
ret : err; +} + +int __init f2fs_create_recovery_cache(void) +{ + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", + sizeof(struct fsync_inode_entry)); + if (!fsync_entry_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_recovery_cache(void) +{ + kmem_cache_destroy(fsync_entry_slab); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cf0eb002cfd4..acf3d3fa4363 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -9,18 +9,20 @@ #include <linux/f2fs_fs.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/sched/mm.h> #include <linux/prefetch.h> #include <linux/kthread.h> #include <linux/swap.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/sched/signal.h> +#include <linux/random.h> #include "f2fs.h" #include "segment.h" #include "node.h" #include "gc.h" -#include "trace.h" +#include "iostat.h" #include <trace/events/f2fs.h> #define __reverse_ffz(x) __reverse_ffs(~(x)) @@ -28,7 +30,7 @@ static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; -static struct kmem_cache *inmem_entry_slab; +static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { @@ -172,9 +174,9 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); - if (test_opt(sbi, LFS)) + if (f2fs_lfs_mode(sbi)) return false; - if (sbi->gc_mode == GC_URGENT) + if (sbi->gc_mode == GC_URGENT_HIGH) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; @@ -183,300 +185,183 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } -void f2fs_register_inmem_page(struct inode *inode, struct page *page) +void f2fs_abort_atomic_write(struct inode *inode, bool clean) { - struct inmem_pages *new; - - f2fs_trace_pid(page); - - 
f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); - - new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); - - /* add atomic page indices to the list */ - new->page = page; - INIT_LIST_HEAD(&new->list); + struct f2fs_inode_info *fi = F2FS_I(inode); - /* increase reference count with clean state */ - get_page(page); - mutex_lock(&F2FS_I(inode)->inmem_lock); - list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages); - inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); - mutex_unlock(&F2FS_I(inode)->inmem_lock); + if (!f2fs_is_atomic_file(inode)) + return; - trace_f2fs_register_inmem_page(page, INMEM); + if (clean) + truncate_inode_pages_final(inode->i_mapping); + clear_inode_flag(fi->cow_inode, FI_COW_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_inode(inode); } -static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover, - bool trylock) +static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, + block_t new_addr, block_t *old_addr, bool recover) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inmem_pages *cur, *tmp; - int err = 0; - - list_for_each_entry_safe(cur, tmp, head, list) { - struct page *page = cur->page; - - if (drop) - trace_f2fs_commit_inmem_page(page, INMEM_DROP); - - if (trylock) { - /* - * to avoid deadlock in between page lock and - * inmem_lock. 
- */ - if (!trylock_page(page)) - continue; - } else { - lock_page(page); - } - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - if (recover) { - struct dnode_of_data dn; - struct node_info ni; + struct dnode_of_data dn; + struct node_info ni; + int err; - trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); retry: - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_dnode_of_data(&dn, page->index, - LOOKUP_NODE); - if (err) { - if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - cond_resched(); - goto retry; - } - err = -EAGAIN; - goto next; - } - - err = f2fs_get_node_info(sbi, dn.nid, &ni); - if (err) { - f2fs_put_dnode(&dn); - return err; - } - - if (cur->old_addr == NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn.data_blkaddr); - f2fs_update_data_blkaddr(&dn, NEW_ADDR); - } else - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, - cur->old_addr, ni.version, true, true); - f2fs_put_dnode(&dn); - } -next: - /* we don't need to invalidate this in the sccessful status */ - if (drop || recover) { - ClearPageUptodate(page); - clear_cold_data(page); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA); + if (err) { + if (err == -ENOMEM) { + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto retry; } - f2fs_clear_page_private(page); - f2fs_put_page(page, 1); - - list_del(&cur->list); - kmem_cache_free(inmem_entry_slab, cur); - dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + return err; } - return err; -} -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) -{ - struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; - struct inode *inode; - struct f2fs_inode_info *fi; - unsigned int count = sbi->atomic_files; - unsigned int looped = 0; -next: - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(head)) { - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - return; + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) { + f2fs_put_dnode(&dn); + return err; } 
- fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); - inode = igrab(&fi->vfs_inode); - if (inode) - list_move_tail(&fi->inmem_ilist, head); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - if (inode) { - if (gc_failure) { - if (!fi->i_gc_failures[GC_FAILURE_ATOMIC]) - goto skip; + if (recover) { + /* dn.data_blkaddr is always valid */ + if (!__is_valid_data_blkaddr(new_addr)) { + if (new_addr == NULL_ADDR) + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, new_addr); + } else { + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + new_addr, ni.version, true, true); } - set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_drop_inmem_pages(inode); -skip: - iput(inode); - } - congestion_wait(BLK_RW_ASYNC, HZ/50); - cond_resched(); - if (gc_failure) { - if (++looped >= count) - return; - } - goto next; -} - -void f2fs_drop_inmem_pages(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); + } else { + blkcnt_t count = 1; - while (!list_empty(&fi->inmem_pages)) { - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, true); - mutex_unlock(&fi->inmem_lock); + *old_addr = dn.data_blkaddr; + f2fs_truncate_data_blocks_range(&dn, 1); + dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); + inc_valid_block_count(sbi, inode, &count); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true, false); } - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + f2fs_put_dnode(&dn); + return 0; } -void f2fs_drop_inmem_page(struct inode *inode, struct page *page) +static void __complete_revoke_list(struct inode *inode, 
struct list_head *head, + bool revoke) { - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct list_head *head = &fi->inmem_pages; - struct inmem_pages *cur = NULL; - - f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + struct revoke_entry *cur, *tmp; - mutex_lock(&fi->inmem_lock); - list_for_each_entry(cur, head, list) { - if (cur->page == page) - break; + list_for_each_entry_safe(cur, tmp, head, list) { + if (revoke) + __replace_atomic_write_block(inode, cur->index, + cur->old_addr, NULL, true); + list_del(&cur->list); + kmem_cache_free(revoke_entry_slab, cur); } - - f2fs_bug_on(sbi, list_empty(head) || cur->page != page); - list_del(&cur->list); - mutex_unlock(&fi->inmem_lock); - - dec_page_count(sbi, F2FS_INMEM_PAGES); - kmem_cache_free(inmem_entry_slab, cur); - - ClearPageUptodate(page); - f2fs_clear_page_private(page); - f2fs_put_page(page, 0); - - trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } -static int __f2fs_commit_inmem_pages(struct inode *inode) +static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - struct inmem_pages *cur, *tmp; - struct f2fs_io_info fio = { - .sbi = sbi, - .ino = inode->i_ino, - .type = DATA, - .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_PRIO, - .io_type = FS_DATA_IO, - }; + struct inode *cow_inode = fi->cow_inode; + struct revoke_entry *new; struct list_head revoke_list; - bool submit_bio = false; - int err = 0; + block_t blkaddr; + struct dnode_of_data dn; + pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t off = 0, blen, index; + int ret = 0, i; INIT_LIST_HEAD(&revoke_list); - list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - struct page *page = cur->page; + while (len) { + blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); - lock_page(page); - if (page->mapping == inode->i_mapping) { - trace_f2fs_commit_inmem_page(page, INMEM); + 
set_new_dnode(&dn, cow_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + if (dn.max_level == 0) + goto out; + goto next; + } - f2fs_wait_on_page_writeback(page, DATA, true, true); + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), + len); + index = off; + for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { + blkaddr = f2fs_data_blkaddr(&dn); - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { - inode_dec_dirty_pages(inode); - f2fs_remove_dirty_inode(inode); + if (!__is_valid_data_blkaddr(blkaddr)) { + continue; + } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); + goto out; } -retry: - fio.page = page; - fio.old_blkaddr = NULL_ADDR; - fio.encrypted_page = NULL; - fio.need_lock = LOCK_DONE; - err = f2fs_do_write_data_page(&fio); - if (err) { - if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - cond_resched(); - goto retry; - } - unlock_page(page); - break; + + new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, + true, NULL); + + ret = __replace_atomic_write_block(inode, index, blkaddr, + &new->old_addr, false); + if (ret) { + f2fs_put_dnode(&dn); + kmem_cache_free(revoke_entry_slab, new); + goto out; } - /* record old blkaddr for revoking */ - cur->old_addr = fio.old_blkaddr; - submit_bio = true; + + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + new->index = index; + list_add_tail(&new->list, &revoke_list); } - unlock_page(page); - list_move_tail(&cur->list, &revoke_list); + f2fs_put_dnode(&dn); +next: + off += blen; + len -= blen; } - if (submit_bio) - f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA); - - if (err) { - /* - * try to revoke all committed pages, but still we could fail - * due to no memory or other reason, if that happened, EAGAIN - * will be returned, which means 
in such case, transaction is - * already not integrity, caller should use journal to do the - * recovery or rewrite & commit last transaction. For other - * error number, revoking was done by filesystem itself. - */ - err = __revoke_inmem_pages(inode, &revoke_list, - false, true, false); +out: + if (ret) + sbi->revoked_atomic_block += fi->atomic_write_cnt; + else + sbi->committed_atomic_block += fi->atomic_write_cnt; - /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, false); - } else { - __revoke_inmem_pages(inode, &revoke_list, - false, false, false); - } + __complete_revoke_list(inode, &revoke_list, ret ? true : false); - return err; + return ret; } -int f2fs_commit_inmem_pages(struct inode *inode) +int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); - - down_write(&fi->i_gc_rwsem[WRITE]); + err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (err) + return err; + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); f2fs_lock_op(sbi); - set_inode_flag(inode, FI_ATOMIC_COMMIT); - mutex_lock(&fi->inmem_lock); - err = __f2fs_commit_inmem_pages(inode); - mutex_unlock(&fi->inmem_lock); - - clear_inode_flag(inode, FI_ATOMIC_COMMIT); + err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); - up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); return err; } @@ -489,12 +374,12 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); } /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) - f2fs_balance_fs_bg(sbi); + f2fs_balance_fs_bg(sbi, false); if (!f2fs_is_checkpoint_ready(sbi)) return; @@ -504,12 +389,49 @@ void f2fs_balance_fs(struct 
f2fs_sb_info *sbi, bool need) * dir/node pages without enough free segments. */ if (has_not_enough_free_secs(sbi, 0, 0)) { - down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, NULL_SEGNO); + if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && + sbi->gc_thread->f2fs_gc_task) { + DEFINE_WAIT(wait); + + prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sbi->gc_thread->gc_wait_queue_head); + io_schedule(); + finish_wait(&sbi->gc_thread->fggc_wq, &wait); + } else { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = BG_GC, + .no_bg_gc = true, + .should_migrate_blocks = false, + .err_gc_skipped = false, + .nr_free_secs = 1 }; + f2fs_down_write(&sbi->gc_lock); + f2fs_gc(sbi, &gc_control); + } } } -void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) +static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) +{ + int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; + unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); + unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); + unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); + unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int threshold = sbi->blocks_per_seg * factor * + DEFAULT_DIRTY_THRESHOLD; + unsigned int global_threshold = threshold * 3 / 2; + + if (dents >= threshold || qdata >= threshold || + nodes >= threshold || meta >= threshold || + imeta >= threshold) + return true; + return dents + qdata + nodes + meta + imeta > global_threshold; +} + +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) { if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return; @@ -527,47 +449,44 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else f2fs_build_free_nids(sbi, false, false); - if (!is_idle(sbi, REQ_TIME) && - (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi))) + if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) || + 
excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi)) + goto do_sync; + + /* there is background inflight IO or foreground operation recently */ + if (is_inflight_io(sbi, REQ_TIME) || + (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem))) return; + /* exceed periodical checkpoint timeout threshold */ + if (f2fs_time_over(sbi, CP_TIME)) + goto do_sync; + /* checkpoint is the only way to shrink partial cached entries */ - if (!f2fs_available_free_memory(sbi, NAT_ENTRIES) || - !f2fs_available_free_memory(sbi, INO_ENTRIES) || - excess_prefree_segs(sbi) || - excess_dirty_nats(sbi) || - excess_dirty_nodes(sbi) || - f2fs_time_over(sbi, CP_TIME)) { - if (test_opt(sbi, DATA_FLUSH)) { - struct blk_plug plug; - - mutex_lock(&sbi->flush_lock); - - blk_start_plug(&plug); - f2fs_sync_dirty_inodes(sbi, FILE_INODE); - blk_finish_plug(&plug); + if (f2fs_available_free_memory(sbi, NAT_ENTRIES) && + f2fs_available_free_memory(sbi, INO_ENTRIES)) + return; - mutex_unlock(&sbi->flush_lock); - } - f2fs_sync_fs(sbi->sb, true); - stat_inc_bg_cp_count(sbi->stat_info); +do_sync: + if (test_opt(sbi, DATA_FLUSH) && from_bg) { + struct blk_plug plug; + + mutex_lock(&sbi->flush_lock); + + blk_start_plug(&plug); + f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); + blk_finish_plug(&plug); + + mutex_unlock(&sbi->flush_lock); } + f2fs_sync_fs(sbi->sb, 1); + stat_inc_bg_cp_count(sbi->stat_info); } static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio; - int ret; - - bio = f2fs_bio_alloc(sbi, 0, false); - if (!bio) - return -ENOMEM; - - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; - bio_set_dev(bio, bdev); - ret = submit_bio_wait(bio); - bio_put(bio); + int ret = blkdev_issue_flush(bdev); trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); @@ -601,8 +520,6 @@ repeat: if (kthread_should_stop()) return 0; - sb_start_intwrite(sbi->sb); - if (!llist_empty(&fcc->issue_list)) { struct 
flush_cmd *cmd, *next; int ret; @@ -623,8 +540,6 @@ repeat: fcc->dispatch_list = NULL; } - sb_end_intwrite(sbi->sb); - wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; @@ -661,7 +576,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) llist_add(&cmd.llnode, &fcc->issue_list); - /* update issue_list before we wake up issue_flush thread */ + /* + * update issue_list before we wake up issue_flush thread, this + * smp_mb() pairs with another barrier in ___wait_event(), see + * more details in comments of waitqueue_active(). + */ smp_mb(); if (waitqueue_active(&fcc->flush_wait_queue)) @@ -726,7 +645,7 @@ init_thread: "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); - kvfree(fcc); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; } @@ -745,7 +664,7 @@ void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) kthread_stop(flush_thread); } if (free) { - kvfree(fcc); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; } } @@ -757,12 +676,26 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) if (!f2fs_is_multi_device(sbi)) return 0; + if (test_opt(sbi, NOBARRIER)) + return 0; + for (i = 1; i < sbi->s_ndevs; i++) { + int count = DEFAULT_RETRY_IO_COUNT; + if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) continue; - ret = __submit_flush_wait(sbi, FDEV(i).bdev); - if (ret) + + do { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + } while (ret && --count); + + if (ret) { + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FLUSH_FAIL); break; + } spin_lock(&sbi->dev_lock); f2fs_clear_bit(i, (char *)&sbi->dirty_device); @@ -794,6 +727,18 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, } if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]++; + + if (__is_large_section(sbi)) { + unsigned int secno = 
GET_SEC_FROM_SEG(sbi, segno); + block_t valid_blocks = + get_valid_blocks(sbi, segno, true); + + f2fs_bug_on(sbi, unlikely(!valid_blocks || + valid_blocks == CAP_BLKS_PER_SEC(sbi))); + + if (!IS_CURSEC(sbi, secno)) + set_bit(secno, dirty_i->dirty_secmap); + } } } @@ -801,6 +746,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + block_t valid_blocks; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]--; @@ -812,13 +758,26 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, true) == 0) { + valid_blocks = get_valid_blocks(sbi, segno, true); + if (valid_blocks == 0) { clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); #ifdef CONFIG_F2FS_CHECK_FS clear_bit(segno, SIT_I(sbi)->invalid_segmap); #endif } + if (__is_large_section(sbi)) { + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!valid_blocks || + valid_blocks == CAP_BLKS_PER_SEC(sbi)) { + clear_bit(secno, dirty_i->dirty_secmap); + return; + } + + if (!IS_CURSEC(sbi, secno)) + set_bit(secno, dirty_i->dirty_secmap); + } } } @@ -831,20 +790,22 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned short valid_blocks, ckpt_valid_blocks; + unsigned int usable_blocks; if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) return; + usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); - ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno); + ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || - ckpt_valid_blocks == sbi->blocks_per_seg)) { + 
ckpt_valid_blocks == usable_blocks)) { __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); - } else if (valid_blocks < sbi->blocks_per_seg) { + } else if (valid_blocks < usable_blocks) { __locate_dirty_segment(sbi, segno, DIRTY); } else { /* Recovery routine with SSR needs this */ @@ -887,9 +848,11 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { se = get_seg_entry(sbi, segno); if (IS_NODESEG(se->type)) - holes[NODE] += sbi->blocks_per_seg - se->valid_blocks; + holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; else - holes[DATA] += sbi->blocks_per_seg - se->valid_blocks; + holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; } mutex_unlock(&dirty_i->seglist_lock); @@ -921,7 +884,7 @@ static unsigned int get_free_segment(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; - if (get_ckpt_valid_blocks(sbi, segno)) + if (get_ckpt_valid_blocks(sbi, segno, false)) continue; mutex_unlock(&dirty_i->seglist_lock); return segno; @@ -942,7 +905,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, pend_list = &dcc->pend_list[plist_idx(len)]; - dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; dc->lstart = lstart; @@ -1027,9 +990,9 @@ static void f2fs_submit_discard_endio(struct bio *bio) struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; unsigned long flags; - dc->error = blk_status_to_errno(bio->bi_status); - spin_lock_irqsave(&dc->lock, flags); + if (!dc->error) + dc->error = blk_status_to_errno(bio->bi_status); dc->bio_ref--; if (!dc->bio_ref && dc->state == D_SUBMIT) { dc->state = D_DONE; @@ -1070,39 +1033,43 @@ static void __init_discard_policy(struct f2fs_sb_info 
*sbi, struct discard_policy *dpolicy, int discard_type, unsigned int granularity) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; dpolicy->ordered = false; dpolicy->granularity = granularity; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->max_requests = dcc->max_discard_request; dpolicy->io_aware_gran = MAX_PLIST_NUM; - dpolicy->timeout = 0; + dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = true; dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { dpolicy->granularity = 1; - dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; + if (atomic_read(&dcc->discard_cmd_cnt)) + dpolicy->max_interval = + dcc->min_discard_issue_time; } } else if (discard_type == DPOLICY_FORCE) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { - dpolicy->max_requests = UINT_MAX; dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ dpolicy->granularity = 1; + dpolicy->timeout = true; } } @@ -1116,13 +1083,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, unsigned int *issued) { struct block_device *bdev = dc->bdev; - struct request_queue *q = 
bdev_get_queue(bdev); unsigned int max_discard_blocks = - SECTOR_TO_BLOCK(q->limits.max_discard_sectors); + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - int flag = dpolicy->sync ? REQ_SYNC : 0; + blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0; block_t lstart, start, len, total_len; int err = 0; @@ -1165,7 +1131,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), - GFP_NOFS, 0, &bio); + GFP_NOFS, &bio); submit: if (err) { spin_lock_irqsave(&dc->lock, flags); @@ -1204,7 +1170,7 @@ submit: atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, FS_DISCARD, 1); + f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1); lstart += len; start += len; @@ -1212,12 +1178,14 @@ submit: len = total_len; } - if (!err && len) + if (!err && len) { + dcc->undiscard_blks -= len; __update_discard_tree_range(sbi, bdev, lstart, start, len); + } return err; } -static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, +static void __insert_discard_tree(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len, struct rb_node **insert_p, @@ -1226,7 +1194,6 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node **p; struct rb_node *parent = NULL; - struct discard_cmd *dc = NULL; bool leftmost = true; if (insert_p && insert_parent) { @@ -1238,12 +1205,8 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart, &leftmost); do_insert: - dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, + __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p, leftmost); - if (!dc) - return NULL; - - 
return dc; } static void __relocate_discard_cmd(struct discard_cmd_control *dcc, @@ -1298,9 +1261,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct discard_info di = {0}; struct rb_node **insert_p = NULL, *insert_parent = NULL; - struct request_queue *q = bdev_get_queue(bdev); unsigned int max_discard_blocks = - SECTOR_TO_BLOCK(q->limits.max_discard_sectors); + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); block_t end = lstart + len; dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, @@ -1460,6 +1422,8 @@ next: return issued; } +static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy); static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) @@ -1468,15 +1432,17 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int i, issued = 0; + int i, issued; bool io_interrupted = false; - if (dpolicy->timeout != 0) - f2fs_update_time(sbi, dpolicy->timeout); + if (dpolicy->timeout) + f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT); +retry: + issued = 0; for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { - if (dpolicy->timeout != 0 && - f2fs_time_over(sbi, dpolicy->timeout)) + if (dpolicy->timeout && + f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (i + 1 < dpolicy->granularity) @@ -1492,13 +1458,13 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, goto next; if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root)); + &dcc->root, false)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - if (dpolicy->timeout != 0 && - f2fs_time_over(sbi, dpolicy->timeout)) + if (dpolicy->timeout && + f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (dpolicy->io_aware && i < dpolicy->io_aware_gran && @@ -1520,6 +1486,11 @@ next: break; } + 
if (dpolicy->type == DPOLICY_UMOUNT && issued) { + __wait_all_discard_cmd(sbi, dpolicy); + goto retry; + } + if (!issued && io_interrupted) issued = -1; @@ -1580,33 +1551,32 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - struct discard_cmd *dc, *tmp; - bool need_wait; + struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: - need_wait = false; + dc = NULL; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->lstart + dc->len <= start || end <= dc->lstart) + list_for_each_entry_safe(iter, tmp, wait_list, list) { + if (iter->lstart + iter->len <= start || end <= iter->lstart) continue; - if (dc->len < dpolicy->granularity) + if (iter->len < dpolicy->granularity) continue; - if (dc->state == D_DONE && !dc->ref) { - wait_for_completion_io(&dc->wait); - if (!dc->error) - trimmed += dc->len; - __remove_discard_cmd(sbi, dc); + if (iter->state == D_DONE && !iter->ref) { + wait_for_completion_io(&iter->wait); + if (!iter->error) + trimmed += iter->len; + __remove_discard_cmd(sbi, iter); } else { - dc->ref++; - need_wait = true; + iter->ref++; + dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); - if (need_wait) { + if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } @@ -1677,7 +1647,6 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); - dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT; __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); @@ -1694,14 +1663,21 @@ static int issue_discard_thread(void *data) struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; struct discard_policy dpolicy; - unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + unsigned int wait_ms 
= dcc->min_discard_issue_time; int issued; set_freezable(); do { - __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, - dcc->discard_granularity); + if (sbi->gc_mode == GC_URGENT_HIGH || + !f2fs_available_free_memory(sbi, DISCARD_CACHE)) + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + else + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, + dcc->discard_granularity); + + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || @@ -1725,9 +1701,8 @@ static int issue_discard_thread(void *data) wait_ms = dpolicy.max_interval; continue; } - - if (sbi->gc_mode == GC_URGENT) - __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + if (!atomic_read(&dcc->discard_cmd_cnt)) + continue; sb_start_intwrite(sbi->sb); @@ -1735,7 +1710,7 @@ static int issue_discard_thread(void *data) if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; - } else if (issued == -1){ + } else if (issued == -1) { wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); if (!wait_ms) wait_ms = dpolicy.mid_interval; @@ -1830,7 +1805,8 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } @@ -1855,7 +1831,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; - if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi)) + if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) || + !f2fs_block_unit_discard(sbi)) return false; if (!force) { @@ -1886,7 +1863,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!de) { de = f2fs_kmem_cache_alloc(discard_entry_slab, - 
GFP_F2FS_ZERO); + GFP_F2FS_ZERO, true, NULL); de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); list_add_tail(&de->list, head); } @@ -1925,7 +1902,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) - __set_test_and_free(sbi, segno); + __set_test_and_free(sbi, segno, false); mutex_unlock(&dirty_i->seglist_lock); } @@ -1940,14 +1917,18 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); - bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi); + bool section_alignment = F2FS_OPTION(sbi).discard_unit == + DISCARD_UNIT_SECTION; + + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) + section_alignment = true; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; - if (need_align && end != -1) + if (section_alignment && end != -1) end--; start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); if (start >= MAIN_SEGS(sbi)) @@ -1955,7 +1936,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), start + 1); - if (need_align) { + if (section_alignment) { start = rounddown(start, sbi->segs_per_sec); end = roundup(end, sbi->segs_per_sec); } @@ -1972,7 +1953,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, (end - 1) <= cpc->trim_end) continue; - if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) { + if (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi)) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); continue; @@ -1993,6 +1974,9 @@ next: } mutex_unlock(&dirty_i->seglist_lock); + if (!f2fs_block_unit_discard(sbi)) + goto wakeup; + /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { unsigned int cur_pos = 0, next_pos, len, total_len = 0; @@ -2026,12 +2010,29 @@ skip: 
dcc->nr_discards -= total_len; } +wakeup: wake_up_discard_thread(sbi, false); } -static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int err = 0; + + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) + err = PTR_ERR(dcc->f2fs_issue_discard); + + return err; +} + +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ struct discard_cmd_control *dcc; int err = 0, i; @@ -2045,6 +2046,11 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + dcc->discard_granularity = sbi->blocks_per_seg; + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + dcc->discard_granularity = BLKS_PER_SEC(sbi); + INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); @@ -2056,6 +2062,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST; + dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; + dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; + dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; @@ -2064,13 +2074,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: - dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, - "f2fs_discard-%u:%u", MAJOR(dev), 
MINOR(dev)); - if (IS_ERR(dcc->f2fs_issue_discard)) { - err = PTR_ERR(dcc->f2fs_issue_discard); - kvfree(dcc); + err = f2fs_start_discard_thread(sbi); + if (err) { + kfree(dcc); SM_I(sbi)->dcc_info = NULL; - return err; } return err; @@ -2092,7 +2099,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) f2fs_issue_discard_timeout(sbi); - kvfree(dcc); + kfree(dcc); SM_I(sbi)->dcc_info = NULL; } @@ -2112,11 +2119,45 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, unsigned int segno, int modified) { struct seg_entry *se = get_seg_entry(sbi, segno); + se->type = type; if (modified) __mark_sit_entry_dirty(sbi, segno); } +static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + unsigned int segno = GET_SEGNO(sbi, blkaddr); + + if (segno == NULL_SEGNO) + return 0; + return get_seg_entry(sbi, segno)->mtime; +} + +static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, + unsigned long long old_mtime) +{ + struct seg_entry *se; + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned long long ctime = get_mtime(sbi, false); + unsigned long long mtime = old_mtime ? 
old_mtime : ctime; + + if (segno == NULL_SEGNO) + return; + + se = get_seg_entry(sbi, segno); + + if (!se->mtime) + se->mtime = mtime; + else + se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, + se->valid_blocks + 1); + + if (ctime > SIT_I(sbi)->max_mtime) + SIT_I(sbi)->max_mtime = ctime; +} + static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) { struct seg_entry *se; @@ -2133,13 +2174,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) new_vblocks = se->valid_blocks + del; offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) || - (new_vblocks > sbi->blocks_per_seg))); + f2fs_bug_on(sbi, (new_vblocks < 0 || + (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); se->valid_blocks = new_vblocks; - se->mtime = get_mtime(sbi, false); - if (se->mtime > SIT_I(sbi)->max_mtime) - SIT_I(sbi)->max_mtime = se->mtime; /* Update valid block bitmap */ if (del > 0) { @@ -2161,7 +2199,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) del = 0; } - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* @@ -2203,7 +2242,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) } } - if (f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -2228,10 +2268,12 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) return; invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); + f2fs_invalidate_compress_page(sbi, addr); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); + update_segment_mtime(sbi, addr, 0); update_sit_entry(sbi, addr, -1); /* add it into dirty seglist */ @@ -2272,6 +2314,7 @@ 
static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, { struct curseg_info *curseg = CURSEG_I(sbi, type); void *addr = curseg->sum_blk; + addr += curseg->next_blkoff * sizeof(struct f2fs_summary); memcpy(addr, sum, sizeof(struct f2fs_summary)); } @@ -2311,7 +2354,9 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) */ struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page_nofail(sbi, GET_SUM_BLOCK(sbi, segno)); + if (unlikely(f2fs_cp_error(sbi))) + return ERR_PTR(-EIO); + return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, @@ -2356,9 +2401,9 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +static int is_next_segment_free(struct f2fs_sb_info *sbi, + struct curseg_info *curseg, int type) { - struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); @@ -2396,8 +2441,8 @@ find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); if (secno >= MAIN_SECS(sbi)) { if (dir == ALLOC_RIGHT) { - secno = find_next_zero_bit(free_i->free_secmap, - MAIN_SECS(sbi), 0); + secno = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi)); } else { go_left = 1; @@ -2412,8 +2457,8 @@ find_other_zone: left_start--; continue; } - left_start = find_next_zero_bit(free_i->free_secmap, - MAIN_SECS(sbi), 0); + left_start = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi)); break; } @@ -2462,7 +2507,9 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct summary_footer *sum_footer; + unsigned short seg_type = curseg->seg_type; + curseg->inited = 
true; curseg->segno = curseg->next_segno; curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; @@ -2470,24 +2517,38 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) sum_footer = &(curseg->sum_blk->footer); memset(sum_footer, 0, sizeof(struct summary_footer)); - if (IS_DATASEG(type)) + + sanity_check_seg_type(sbi, seg_type); + + if (IS_DATASEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); - if (IS_NODESEG(type)) + if (IS_NODESEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); - __set_sit_entry_type(sbi, type, curseg->segno, modified); + __set_sit_entry_type(sbi, seg_type, curseg->segno, modified); } static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned short seg_type = curseg->seg_type; + + sanity_check_seg_type(sbi, seg_type); + if (f2fs_need_rand_seg(sbi)) + return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + /* if segs_per_sec is large than 1, we need to keep original policy. 
*/ if (__is_large_section(sbi)) - return CURSEG_I(sbi, type)->segno; + return curseg->segno; + + /* inmem log may not locate on any segment after mount */ + if (!curseg->inited) + return 0; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; if (test_opt(sbi, NOHEAP) && - (type == CURSEG_HOT_DATA || IS_NODESEG(type))) + (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type))) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) @@ -2497,7 +2558,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) return 0; - return CURSEG_I(sbi, type)->segno; + return curseg->segno; } /* @@ -2507,12 +2568,14 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned short seg_type = curseg->seg_type; unsigned int segno = curseg->segno; int dir = ALLOC_LEFT; - write_sum_page(sbi, curseg->sum_blk, + if (curseg->inited) + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno)); - if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) + if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA) dir = ALLOC_RIGHT; if (test_opt(sbi, NOHEAP)) @@ -2523,24 +2586,25 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->next_segno = segno; reset_curseg(sbi, type, 1); curseg->alloc_type = LFS; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + curseg->fragment_remained_chunk = + prandom_u32_max(sbi->max_fragment_chunk) + 1; } -static void __next_free_blkoff(struct f2fs_sb_info *sbi, - struct curseg_info *seg, block_t start) +static int __next_free_blkoff(struct f2fs_sb_info *sbi, + int segno, block_t start) { - struct seg_entry *se = get_seg_entry(sbi, seg->segno); + struct seg_entry *se = get_seg_entry(sbi, segno); int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); unsigned long *target_map = 
SIT_I(sbi)->tmp_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *cur_map = (unsigned long *)se->cur_valid_map; - int i, pos; + int i; for (i = 0; i < entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; - pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); - - seg->next_blkoff = pos; + return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); } /* @@ -2551,17 +2615,34 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi, static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg) { - if (seg->alloc_type == SSR) - __next_free_blkoff(sbi, seg, seg->next_blkoff + 1); - else + if (seg->alloc_type == SSR) { + seg->next_blkoff = + __next_free_blkoff(sbi, seg->segno, + seg->next_blkoff + 1); + } else { seg->next_blkoff++; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) { + /* To allocate block chunks in different sizes, use random number */ + if (--seg->fragment_remained_chunk <= 0) { + seg->fragment_remained_chunk = + prandom_u32_max(sbi->max_fragment_chunk) + 1; + seg->next_blkoff += + prandom_u32_max(sbi->max_fragment_hole) + 1; + } + } + } +} + +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) +{ + return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg; } /* * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type) +static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2569,8 +2650,10 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) struct f2fs_summary_block *sum_node; struct page *sum_page; - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + if (flush) + write_sum_page(sbi, curseg->sum_blk, + 
GET_SUM_BLOCK(sbi, curseg->segno)); + __set_test_and_inuse(sbi, new_segno); mutex_lock(&dirty_i->seglist_lock); @@ -2580,32 +2663,142 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) reset_curseg(sbi, type, 1); curseg->alloc_type = SSR; - __next_free_blkoff(sbi, curseg, 0); + curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); - f2fs_bug_on(sbi, IS_ERR(sum_page)); + if (IS_ERR(sum_page)) { + /* GC won't be able to use stale summary pages by cp_error */ + memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); + return; + } sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); } -static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age); + +static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, + int target_type, int alloc_mode, + unsigned long long age) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + curseg->seg_type = target_type; + + if (get_ssr_segment(sbi, type, alloc_mode, age)) { + struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); + + curseg->seg_type = se->type; + change_curseg(sbi, type, true); + } else { + /* allocate cold segment by default */ + curseg->seg_type = CURSEG_COLD_DATA; + new_curseg(sbi, type, true); + } + stat_inc_seg_type(sbi, curseg); +} + +static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC); + + if (!sbi->am.atgc_enabled) + return; + + f2fs_down_read(&SM_I(sbi)->curseg_lock); + + mutex_lock(&curseg->curseg_mutex); + down_write(&SIT_I(sbi)->sentry_lock); + + get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0); + + up_write(&SIT_I(sbi)->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); + + f2fs_up_read(&SM_I(sbi)->curseg_lock); + +} +void 
f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_init_atgc_curseg(sbi); +} + +static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) + goto out; + + if (get_valid_blocks(sbi, curseg->segno, false)) { + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); + } else { + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_free(sbi, curseg->segno, true); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); + } +out: + mutex_unlock(&curseg->curseg_mutex); +} + +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + + if (sbi->am.atgc_enabled) + __f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) + goto out; + if (get_valid_blocks(sbi, curseg->segno, false)) + goto out; + + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_inuse(sbi, curseg->segno); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); +out: + mutex_unlock(&curseg->curseg_mutex); +} + +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + + if (sbi->am.atgc_enabled) + __f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; unsigned segno = NULL_SEGNO; + unsigned short seg_type = curseg->seg_type; int i, cnt; bool reversed = false; + sanity_check_seg_type(sbi, seg_type); + /* f2fs_need_SSR() already forces to do this */ - if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + if 
(!v_ops->get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) { curseg->next_segno = segno; return 1; } /* For node segments, let's do SSR more intensively */ - if (IS_NODESEG(type)) { - if (type >= CURSEG_WARM_NODE) { + if (IS_NODESEG(seg_type)) { + if (seg_type >= CURSEG_WARM_NODE) { reversed = true; i = CURSEG_COLD_NODE; } else { @@ -2613,7 +2806,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) } cnt = NR_CURSEG_NODE_TYPE; } else { - if (type >= CURSEG_WARM_DATA) { + if (seg_type >= CURSEG_WARM_DATA) { reversed = true; i = CURSEG_COLD_DATA; } else { @@ -2623,9 +2816,9 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) } for (; cnt-- > 0; reversed ? i-- : i++) { - if (i == type) + if (i == seg_type) continue; - if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + if (!v_ops->get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) { curseg->next_segno = segno; return 1; } @@ -2654,26 +2847,28 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, if (force) new_curseg(sbi, type, true); else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - type == CURSEG_WARM_NODE) + curseg->seg_type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type) && + else if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type); + else if (f2fs_need_SSR(sbi) && + get_ssr_segment(sbi, type, SSR, 0)) + change_curseg(sbi, type, true); else new_curseg(sbi, type, false); stat_inc_seg_type(sbi, curseg); } -void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, +void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno; - 
down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); @@ -2681,8 +2876,8 @@ void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, if (segno < start || segno > end) goto unlock; - if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type); + if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) + change_curseg(sbi, type, true); else new_curseg(sbi, type, true); @@ -2697,32 +2892,55 @@ unlock: type, segno, curseg->segno); mutex_unlock(&curseg->curseg_mutex); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, + bool new_sec, bool force) { - struct curseg_info *curseg; + struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; - int i; - down_write(&SIT_I(sbi)->sentry_lock); + if (!curseg->inited) + goto alloc; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { - if (type != NO_CHECK_TYPE && i != type) - continue; + if (force || curseg->next_blkoff || + get_valid_blocks(sbi, curseg->segno, new_sec)) + goto alloc; - curseg = CURSEG_I(sbi, i); - if (type == NO_CHECK_TYPE || curseg->next_blkoff || - get_valid_blocks(sbi, curseg->segno, false) || - get_ckpt_valid_blocks(sbi, curseg->segno)) { - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); - locate_dirty_segment(sbi, old_segno); - } - } + if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) + return; +alloc: + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + locate_dirty_segment(sbi, old_segno); +} + +static void __allocate_new_section(struct f2fs_sb_info *sbi, + int type, bool force) +{ + __allocate_new_segment(sbi, type, true, force); +} + +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) 
+{ + f2fs_down_read(&SM_I(sbi)->curseg_lock); + down_write(&SIT_I(sbi)->sentry_lock); + __allocate_new_section(sbi, type, force); + up_write(&SIT_I(sbi)->sentry_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); +} +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) +{ + int i; + + f2fs_down_read(&SM_I(sbi)->curseg_lock); + down_write(&SIT_I(sbi)->sentry_lock); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) + __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } static const struct segment_allocation default_salloc_ops = { @@ -2766,7 +2984,7 @@ next: mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root)); + &dcc->root, false)); dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, start, @@ -2801,7 +3019,7 @@ next: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - congestion_wait(BLK_RW_ASYNC, HZ/50); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto next; } skip: @@ -2830,7 +3048,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; - bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi); + bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -2860,9 +3078,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (sbi->discard_blks == 0) goto out; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); if (err) goto out; @@ -2890,12 +3108,11 @@ out: return err; } -static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) +static bool __has_curseg_space(struct f2fs_sb_info *sbi, + struct curseg_info *curseg) { 
- struct curseg_info *curseg = CURSEG_I(sbi, type); - if (curseg->next_blkoff < sbi->blocks_per_seg) - return true; - return false; + return curseg->next_blkoff < f2fs_usable_blks_in_seg(sbi, + curseg->segno); } int f2fs_rw_hint_to_seg_type(enum rw_hint hint) @@ -2910,101 +3127,6 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint) } } -/* This returns write hints for each segment type. This hints will be - * passed down to block layer. There are mapping tables which depend on - * the mount option 'whint_mode'. - * - * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. - * - * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. - * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_NOT_SET - * HOT_NODE " - * WARM_NODE " - * COLD_NODE " - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - * - * 3) whint_mode=fs-based. F2FS passes down hints with its policy. 
- * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_MEDIUM; - * HOT_NODE WRITE_LIFE_NOT_SET - * WARM_NODE " - * COLD_NODE WRITE_LIFE_NONE - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - */ - -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp) -{ - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_NOT_SET; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else { - return WRITE_LIFE_NOT_SET; - } - } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_LONG; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else if (type == NODE) { - if (temp == WARM || temp == HOT) - return WRITE_LIFE_NOT_SET; - else if (temp == COLD) - return WRITE_LIFE_NONE; - } else if (type == META) { - return WRITE_LIFE_MEDIUM; - } - } - return WRITE_LIFE_NOT_SET; -} - static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -3035,13 +3157,22 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; - if (is_cold_data(fio->page) || file_is_cold(inode) || - f2fs_compressed_file(inode)) + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) 
+ return CURSEG_COLD_DATA_PINNED; + + if (page_private_gcing(fio->page)) { + if (fio->sbi->am.atgc_enabled && + (fio->io_type == FS_DATA_IO) && + (fio->sbi->gc_mode != GC_URGENT_HIGH)) + return CURSEG_ALL_DATA_ATGC; + else + return CURSEG_COLD_DATA; + } + if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_cow_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { @@ -3082,31 +3213,29 @@ static int __get_segment_type(struct f2fs_io_info *fio) void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, - struct f2fs_io_info *fio, bool add_list) + struct f2fs_io_info *fio) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); - bool put_pin_sem = false; - - if (type == CURSEG_COLD_DATA) { - /* GC during CURSEG_COLD_DATA_PINNED allocation */ - if (down_read_trylock(&sbi->pin_sem)) { - put_pin_sem = true; - } else { - type = CURSEG_WARM_DATA; - curseg = CURSEG_I(sbi, type); - } - } else if (type == CURSEG_COLD_DATA_PINNED) { - type = CURSEG_COLD_DATA; - } + unsigned long long old_mtime; + bool from_gc = (type == CURSEG_ALL_DATA_ATGC); + struct seg_entry *se = NULL; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); + if (from_gc) { + f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO); + se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr)); + sanity_check_seg_type(sbi, se->type); + f2fs_bug_on(sbi, IS_NODESEG(se->type)); + } *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg); + f2fs_wait_discard_bio(sbi, *new_blkaddr); /* @@ -3120,6 +3249,14 @@ void 
f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + if (from_gc) { + old_mtime = get_segment_mtime(sbi, old_blkaddr); + } else { + update_segment_mtime(sbi, old_blkaddr, 0); + old_mtime = 0; + } + update_segment_mtime(sbi, *new_blkaddr, old_mtime); + /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. @@ -3128,9 +3265,13 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) update_sit_entry(sbi, old_blkaddr, -1); - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - + if (!__has_curseg_space(sbi, curseg)) { + if (from_gc) + get_atssr_segment(sbi, type, se->type, + AT_SSR, se->mtime); + else + sit_i->s_ops->allocate_segment(sbi, type, false); + } /* * segment dirty status should be updated after segment allocation, * so we just need to update status only one time after previous @@ -3147,12 +3288,12 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, f2fs_inode_chksum_set(sbi, page); } - if (F2FS_IO_ALIGNED(sbi)) - fio->retry = false; - - if (add_list) { + if (fio) { struct f2fs_bio_info *io; + if (F2FS_IO_ALIGNED(sbi)) + fio->retry = false; + INIT_LIST_HEAD(&fio->list); fio->in_list = true; io = sbi->write_io[fio->type] + fio->temp; @@ -3163,46 +3304,51 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); - up_read(&SM_I(sbi)->curseg_lock); - - if (put_pin_sem) - up_read(&sbi->pin_sem); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } -static void update_device_state(struct f2fs_io_info *fio) +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt) { - struct f2fs_sb_info *sbi = fio->sbi; - unsigned int devidx; - if (!f2fs_is_multi_device(sbi)) return; - devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); 
+ while (1) { + unsigned int devidx = f2fs_target_device_index(sbi, blkaddr); + unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1; - /* update device state for fsync */ - f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + /* update device state for fsync */ + f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO); - /* update device state for checkpoint */ - if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { - spin_lock(&sbi->dev_lock); - f2fs_set_bit(devidx, (char *)&sbi->dirty_device); - spin_unlock(&sbi->dev_lock); + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + if (blkcnt <= blks) + break; + blkcnt -= blks; + blkaddr += blks; } } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio); - bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA); + bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA); if (keep_order) - down_read(&fio->sbi->io_order_lock); + f2fs_down_read(&fio->sbi->io_order_lock); reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type, fio, true); - if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) + &fio->new_blkaddr, sum, type, fio); + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(fio->sbi), fio->old_blkaddr, fio->old_blkaddr); + f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr); + } /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); @@ -3211,10 +3357,10 @@ reallocate: goto reallocate; } - update_device_state(fio); + f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); if (keep_order) - up_read(&fio->sbi->io_order_lock); + f2fs_up_read(&fio->sbi->io_order_lock); } void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, 
struct page *page, @@ -3241,7 +3387,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, page->index); - f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); } void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -3251,7 +3397,7 @@ void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); } void f2fs_outplace_write_data(struct dnode_of_data *dn, @@ -3265,7 +3411,7 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); - f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); } int f2fs_inplace_write_data(struct f2fs_io_info *fio) @@ -3284,9 +3430,20 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); + goto drop_bio; + } + + if (f2fs_cp_error(sbi)) { + err = -EIO; + goto drop_bio; } + if (fio->post_read) + invalidate_mapping_pages(META_MAPPING(sbi), + fio->new_blkaddr, fio->new_blkaddr); + stat_inc_inplace_blocks(fio->sbi); if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) @@ -3294,11 +3451,22 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) else err = f2fs_submit_page_bio(fio); if (!err) { - update_device_state(fio); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_device_state(fio->sbi, fio->ino, + fio->new_blkaddr, 1); + f2fs_update_iostat(fio->sbi, fio->page->mapping->host, + fio->io_type, F2FS_BLKSIZE); } 
return err; +drop_bio: + if (fio->bio && *(fio->bio)) { + struct bio *bio = *(fio->bio); + + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + *(fio->bio) = NULL; + } + return err; } static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, @@ -3315,7 +3483,8 @@ static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg, bool recover_newaddr) + bool recover_curseg, bool recover_newaddr, + bool from_gc) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; @@ -3323,12 +3492,13 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct seg_entry *se; int type; unsigned short old_blkoff; + unsigned char old_alloc_type; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); type = se->type; - down_write(&SM_I(sbi)->curseg_lock); + f2fs_down_write(&SM_I(sbi)->curseg_lock); if (!recover_curseg) { /* for recovery flow */ @@ -3356,21 +3526,28 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; + old_alloc_type = curseg->alloc_type; /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type); + change_curseg(sbi, type, true); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - if (!recover_curseg || recover_newaddr) + if (!recover_curseg || recover_newaddr) { + if (!from_gc) + update_segment_mtime(sbi, new_blkaddr, 0); update_sit_entry(sbi, new_blkaddr, 1); + } if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); + f2fs_invalidate_compress_page(sbi, old_blkaddr); + if (!from_gc) + update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); } @@ -3382,14 +3559,15 @@ void 
f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type); + change_curseg(sbi, type, true); } curseg->next_blkoff = old_blkoff; + curseg->alloc_type = old_alloc_type; } up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); - up_write(&SM_I(sbi)->curseg_lock); + f2fs_up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, @@ -3402,7 +3580,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, set_summary(&sum, dn->nid, dn->ofs_in_node, version); f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, - recover_curseg, recover_newaddr); + recover_curseg, recover_newaddr, false); f2fs_update_data_blkaddr(dn, new_addr); } @@ -3447,10 +3625,16 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); block_t i; + if (!f2fs_post_read_required(inode)) + return; + for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); + + invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) @@ -3496,6 +3680,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) for (j = 0; j < blk_off; j++) { struct f2fs_summary *s; + s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; @@ -3534,7 +3719,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); if (__exist_node_summaries(sbi)) - blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); + blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); } else { 
@@ -3558,6 +3743,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; + for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { ns->version = 0; ns->ofs_in_node = 0; @@ -3612,8 +3798,9 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) } if (__exist_node_summaries(sbi)) - f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), - NR_CURSEG_TYPE - type, META_CP, true); + f2fs_ra_meta_pages(sbi, + sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type), + NR_CURSEG_PERSIST_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); @@ -3624,7 +3811,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) /* sanity check for summary blocks */ if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { - f2fs_err(sbi, "invalid journal entries nats %u sits %u\n", + f2fs_err(sbi, "invalid journal entries nats %u sits %u", nats_in_cursum(nat_j), sits_in_cursum(sit_j)); return -EINVAL; } @@ -3658,6 +3845,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blkoff; + seg_i = CURSEG_I(sbi, i); if (sbi->ckpt->alloc_type[i] == SSR) blkoff = sbi->blocks_per_seg; @@ -3694,6 +3882,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { int i, end; + if (IS_DATASEG(type)) end = type + NR_CURSEG_DATA_TYPE; else @@ -3741,7 +3930,7 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page_nofail(sbi, current_sit_addr(sbi, segno)); + return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, @@ 
-3766,7 +3955,8 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = - f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS); + f2fs_kmem_cache_alloc(sit_entry_set_slab, + GFP_NOFS, true, NULL); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); @@ -3788,10 +3978,12 @@ static void adjust_sit_entry_set(struct sit_entry_set *ses, return; list_for_each_entry_continue(next, head, set_list) - if (ses->entry_cnt <= next->entry_cnt) - break; + if (ses->entry_cnt <= next->entry_cnt) { + list_move_tail(&ses->set_list, &next->set_list); + return; + } - list_move_tail(&ses->set_list, &next->set_list); + list_move_tail(&ses->set_list, head); } static void add_sit_entry(unsigned int segno, struct list_head *head) @@ -3977,6 +4169,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) unsigned int sit_segs, start; char *src_bitmap, *bitmap; unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size; + unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 
1 : 0; /* allocate memory for SIT information */ sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); @@ -3999,9 +4192,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS - bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 4; + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map); #else - bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 3; + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map); #endif sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!sit_i->bitmap) @@ -4021,8 +4214,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap += SIT_VBLOCK_MAP_SIZE; #endif - sit_i->sentries[start].discard_map = bitmap; - bitmap += SIT_VBLOCK_MAP_SIZE; + if (discard_map) { + sit_i->sentries[start].discard_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; + } } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -4071,7 +4266,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); - sit_i->mounted_time = ktime_get_real_seconds(); + sit_i->mounted_time = ktime_get_boottime_seconds(); init_rwsem(&sit_i->sentry_lock); return 0; } @@ -4115,14 +4310,14 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), - GFP_KERNEL); + array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, + sizeof(*array)), GFP_KERNEL); if (!array) return -ENOMEM; SM_I(sbi)->curseg_array = array; - for (i = 0; i < NR_CURSEG_TYPE; i++) { + for (i = 0; i < NO_CHECK_TYPE; i++) { mutex_init(&array[i].curseg_mutex); array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) @@ -4132,8 +4327,15 @@ static int build_curseg(struct f2fs_sb_info *sbi) sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) 
return -ENOMEM; + if (i < NR_PERSISTENT_LOG) + array[i].seg_type = CURSEG_HOT_DATA + i; + else if (i == CURSEG_COLD_DATA_PINNED) + array[i].seg_type = CURSEG_COLD_DATA; + else if (i == CURSEG_ALL_DATA_ATGC) + array[i].seg_type = CURSEG_COLD_DATA; array[i].segno = NULL_SEGNO; array[i].next_blkoff = 0; + array[i].inited = false; } return restore_curseg_summaries(sbi); } @@ -4149,10 +4351,10 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; - block_t total_node_blocks = 0; + block_t sit_valid_blocks[2] = {0, 0}; do { - readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, META_SIT, true); start = start_blk * sit_i->sents_per_block; @@ -4174,20 +4376,30 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (err) return err; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; - /* build discard map only one time */ - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, - se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += - sbi->blocks_per_seg - - se->valid_blocks; + if (se->type >= NR_PERSISTENT_LOG) { + f2fs_err(sbi, "Invalid segment type: %u, segno: %u", + se->type, start); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_SUM_TYPE); + return -EFSCORRUPTED; + } + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; + + if (f2fs_block_unit_discard(sbi)) { + /* build discard map only one time */ + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (__is_large_section(sbi)) @@ -4206,6 +4418,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) 
f2fs_err(sbi, "Wrong journal entry on segno %u", start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); break; } @@ -4213,23 +4426,33 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; - if (IS_NODESEG(se->type)) - total_node_blocks -= old_valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks; - sbi->discard_blks -= se->valid_blocks; + if (se->type >= NR_PERSISTENT_LOG) { + f2fs_err(sbi, "Invalid segment type: %u, segno: %u", + se->type, start); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); + break; + } + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; + + if (f2fs_block_unit_discard(sbi)) { + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; + } } if (__is_large_section(sbi)) { @@ -4241,22 +4464,38 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) } up_read(&curseg->journal_rwsem); - if (!err && total_node_blocks != valid_node_count(sbi)) { + if (err) + return err; + + if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", - total_node_blocks, valid_node_count(sbi)); - err = -EFSCORRUPTED; + sit_valid_blocks[NODE], valid_node_count(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); + return -EFSCORRUPTED; } - return err; + if (sit_valid_blocks[DATA] 
+ sit_valid_blocks[NODE] > + valid_user_blocks(sbi)) { + f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", + sit_valid_blocks[DATA], sit_valid_blocks[NODE], + valid_user_blocks(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); + return -EFSCORRUPTED; + } + + return 0; } static void init_free_segmap(struct f2fs_sb_info *sbi) { unsigned int start; int type; + struct seg_entry *sentry; for (start = 0; start < MAIN_SEGS(sbi); start++) { - struct seg_entry *sentry = get_seg_entry(sbi, start); + if (f2fs_usable_blks_in_seg(sbi, start) == 0) + continue; + sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); else @@ -4267,6 +4506,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) /* set use the current segments */ for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { struct curseg_info *curseg_t = CURSEG_I(sbi, type); + __set_test_and_inuse(sbi, curseg_t->segno); } } @@ -4275,8 +4515,8 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int segno = 0, offset = 0; - unsigned short valid_blocks; + unsigned int segno = 0, offset = 0, secno; + block_t valid_blocks, usable_blks_in_seg; while (1) { /* find dirty segment based on free segmap */ @@ -4285,9 +4525,10 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, false); - if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) + usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); + if (valid_blocks == usable_blks_in_seg || !valid_blocks) continue; - if (valid_blocks > sbi->blocks_per_seg) { + if (valid_blocks > usable_blks_in_seg) { f2fs_bug_on(sbi, 1); continue; } @@ -4295,6 +4536,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) __locate_dirty_segment(sbi, segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); } + + if (!__is_large_section(sbi)) 
+ return; + + mutex_lock(&dirty_i->seglist_lock); + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + valid_blocks = get_valid_blocks(sbi, segno, true); + secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) + continue; + if (IS_CURSEC(sbi, secno)) + continue; + set_bit(secno, dirty_i->dirty_secmap); + } + mutex_unlock(&dirty_i->seglist_lock); } static int init_victim_secmap(struct f2fs_sb_info *sbi) @@ -4305,6 +4562,13 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; + + dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!dirty_i->pinned_secmap) + return -ENOMEM; + + dirty_i->pinned_secmap_cnt = 0; + dirty_i->enable_pin_section = true; return 0; } @@ -4331,6 +4595,14 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) return -ENOMEM; } + if (__is_large_section(sbi)) { + bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + dirty_i->dirty_secmap = f2fs_kvzalloc(sbi, + bitmap_size, GFP_KERNEL); + if (!dirty_i->dirty_secmap) + return -ENOMEM; + } + init_dirty_segmap(sbi); return init_victim_secmap(sbi); } @@ -4343,11 +4615,25 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr; * In LFS curseg, all blkaddr after .next_blkoff should be unused. 
*/ - for (i = 0; i < NO_CHECK_TYPE; i++) { + for (i = 0; i < NR_PERSISTENT_LOG; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); struct seg_entry *se = get_seg_entry(sbi, curseg->segno); unsigned int blkofs = curseg->next_blkoff; + if (f2fs_sb_has_readonly(sbi) && + i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE) + continue; + + sanity_check_seg_type(sbi, curseg->seg_type); + + if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) { + f2fs_err(sbi, + "Current segment has invalid alloc_type:%d", + curseg->alloc_type); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); + return -EFSCORRUPTED; + } + if (f2fs_test_bit(blkofs, se->cur_valid_map)) goto out; @@ -4362,6 +4648,7 @@ out: "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", i, curseg->segno, curseg->alloc_type, curseg->next_blkoff, blkofs); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } } @@ -4471,7 +4758,8 @@ static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, } static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, - void *data) { + void *data) +{ memcpy(data, zone, sizeof(struct blk_zone)); return 0; } @@ -4523,7 +4811,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) f2fs_notice(sbi, "Assign new section to curseg[%d]: " "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); - allocate_segment_by_default(sbi, type, true); + + f2fs_allocate_new_section(sbi, type, true); /* check consistency of the zone curseg pointed to */ if (check_zone_write_pointer(sbi, zbd, &zone)) @@ -4572,7 +4861,7 @@ int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; - for (i = 0; i < NO_CHECK_TYPE; i++) { + for (i = 0; i < NR_PERSISTENT_LOG; i++) { ret = fix_curseg_write_pointer(sbi, i); if (ret) return ret; @@ -4587,8 +4876,10 @@ struct check_zone_write_pointer_args { }; static int check_zone_write_pointer_cb(struct blk_zone *zone, 
unsigned int idx, - void *data) { + void *data) +{ struct check_zone_write_pointer_args *args; + args = (struct check_zone_write_pointer_args *)data; return check_zone_write_pointer(args->sbi, args->fdev, zone); @@ -4613,6 +4904,94 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) return 0; } + +static bool is_conv_zone(struct f2fs_sb_info *sbi, unsigned int zone_idx, + unsigned int dev_idx) +{ + if (!bdev_is_zoned(FDEV(dev_idx).bdev)) + return true; + return !test_bit(zone_idx, FDEV(dev_idx).blkz_seq); +} + +/* Return the zone index in the given device */ +static unsigned int get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno, + int dev_idx) +{ + block_t sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); + + return (sec_start_blkaddr - FDEV(dev_idx).start_blk) >> + sbi->log_blocks_per_blkz; +} + +/* + * Return the usable segments in a section based on the zone's + * corresponding zone capacity. Zone is equal to a section. + */ +static inline unsigned int f2fs_usable_zone_segs_in_sec( + struct f2fs_sb_info *sbi, unsigned int segno) +{ + unsigned int dev_idx, zone_idx; + + dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno)); + zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), dev_idx); + + /* Conventional zone's capacity is always equal to zone size */ + if (is_conv_zone(sbi, zone_idx, dev_idx)) + return sbi->segs_per_sec; + + if (!sbi->unusable_blocks_per_sec) + return sbi->segs_per_sec; + + /* Get the segment count beyond zone capacity block */ + return sbi->segs_per_sec - (sbi->unusable_blocks_per_sec >> + sbi->log_blocks_per_seg); +} + +/* + * Return the number of usable blocks in a segment. The number of blocks + * returned is always equal to the number of blocks in a segment for + * segments fully contained within a sequential zone capacity or a + * conventional zone. For segments partially contained in a sequential + * zone capacity, the number of usable blocks up to the zone capacity + * is returned. 
0 is returned in all other cases. + */ +static inline unsigned int f2fs_usable_zone_blks_in_seg( + struct f2fs_sb_info *sbi, unsigned int segno) +{ + block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr; + unsigned int zone_idx, dev_idx, secno; + + secno = GET_SEC_FROM_SEG(sbi, segno); + seg_start = START_BLOCK(sbi, segno); + dev_idx = f2fs_target_device_index(sbi, seg_start); + zone_idx = get_zone_idx(sbi, secno, dev_idx); + + /* + * Conventional zone's capacity is always equal to zone size, + * so, blocks per segment is unchanged. + */ + if (is_conv_zone(sbi, zone_idx, dev_idx)) + return sbi->blocks_per_seg; + + if (!sbi->unusable_blocks_per_sec) + return sbi->blocks_per_seg; + + sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); + sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi); + + /* + * If segment starts before zone capacity and spans beyond + * zone capacity, then usable blocks are from seg start to + * zone capacity. If the segment starts after the zone capacity, + * then there are no usable blocks. 
+ */ + if (seg_start >= sec_cap_blkaddr) + return 0; + if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr) + return sec_cap_blkaddr - seg_start; + + return sbi->blocks_per_seg; +} #else int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { @@ -4623,7 +5002,36 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) { return 0; } + +static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return 0; +} + +static inline unsigned int f2fs_usable_zone_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return 0; +} #endif +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return f2fs_usable_zone_blks_in_seg(sbi, segno); + + return sbi->blocks_per_seg; +} + +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return f2fs_usable_zone_segs_in_sec(sbi, segno); + + return sbi->segs_per_sec; +} /* * Update min, max modified time for cost-benefit GC algorithm @@ -4650,6 +5058,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi, false); + sit_i->dirty_max_mtime = 0; up_write(&sit_i->sentry_lock); } @@ -4678,17 +5087,17 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; - if (!test_opt(sbi, LFS)) + if (!f2fs_lfs_mode(sbi)) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - sm_info->min_seq_blocks = sbi->blocks_per_seg * sbi->segs_per_sec; + sm_info->min_seq_blocks = sbi->blocks_per_seg; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); INIT_LIST_HEAD(&sm_info->sit_entry_set); - init_rwsem(&sm_info->curseg_lock); + 
init_f2fs_rwsem(&sm_info->curseg_lock); if (!f2fs_readonly(sbi->sb)) { err = f2fs_create_flush_cmd_control(sbi); @@ -4742,6 +5151,8 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + kvfree(dirty_i->pinned_secmap); kvfree(dirty_i->victim_secmap); } @@ -4757,9 +5168,15 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) for (i = 0; i < NR_DIRTY_TYPE; i++) discard_dirty_segmap(sbi, i); + if (__is_large_section(sbi)) { + mutex_lock(&dirty_i->seglist_lock); + kvfree(dirty_i->dirty_secmap); + mutex_unlock(&dirty_i->seglist_lock); + } + destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; - kvfree(dirty_i); + kfree(dirty_i); } static void destroy_curseg(struct f2fs_sb_info *sbi) @@ -4771,21 +5188,22 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) return; SM_I(sbi)->curseg_array = NULL; for (i = 0; i < NR_CURSEG_TYPE; i++) { - kvfree(array[i].sum_blk); - kvfree(array[i].journal); + kfree(array[i].sum_blk); + kfree(array[i].journal); } - kvfree(array); + kfree(array); } static void destroy_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i = SM_I(sbi)->free_info; + if (!free_i) return; SM_I(sbi)->free_info = NULL; kvfree(free_i->free_segmap); kvfree(free_i->free_secmap); - kvfree(free_i); + kfree(free_i); } static void destroy_sit_info(struct f2fs_sb_info *sbi) @@ -4797,7 +5215,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) kvfree(sit_i->bitmap); - kvfree(sit_i->tmp_map); + kfree(sit_i->tmp_map); kvfree(sit_i->sentries); kvfree(sit_i->sec_entries); @@ -4809,7 +5227,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kvfree(sit_i->sit_bitmap_mir); kvfree(sit_i->invalid_segmap); #endif - kvfree(sit_i); + kfree(sit_i); } void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) @@ -4825,29 +5243,29 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) 
destroy_free_segmap(sbi); destroy_sit_info(sbi); sbi->sm_info = NULL; - kvfree(sm_info); + kfree(sm_info); } int __init f2fs_create_segment_manager_caches(void) { - discard_entry_slab = f2fs_kmem_cache_create("discard_entry", + discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry", sizeof(struct discard_entry)); if (!discard_entry_slab) goto fail; - discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd", + discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd", sizeof(struct discard_cmd)); if (!discard_cmd_slab) goto destroy_discard_entry; - sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", + sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) goto destroy_discard_cmd; - inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", - sizeof(struct inmem_pages)); - if (!inmem_entry_slab) + revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", + sizeof(struct revoke_entry)); + if (!revoke_entry_slab) goto destroy_sit_entry_set; return 0; @@ -4866,5 +5284,5 @@ void f2fs_destroy_segment_manager_caches(void) kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); - kmem_cache_destroy(inmem_entry_slab); + kmem_cache_destroy(revoke_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 459dc3901a57..be8f2d7d007b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/segment.h * @@ -16,13 +16,21 @@ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ +#define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) #define GET_R2L_SEGNO(free_i, segno) 
((segno) + (free_i)->start_segno) #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) +#define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA)) + +static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, + unsigned short seg_type) +{ + f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); +} #define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) #define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) @@ -34,7 +42,9 @@ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno)) #define IS_CURSEC(sbi, secno) \ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ @@ -48,7 +58,11 @@ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - (sbi)->segs_per_sec)) \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \ + (sbi)->segs_per_sec)) #define MAIN_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ @@ -87,12 +101,15 @@ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define CAP_BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \ + (sbi)->unusable_blocks_per_sec) #define GET_SEC_FROM_SEG(sbi, segno) \ - ((segno) / (sbi)->segs_per_sec) + (((segno) == -1) ? 
-1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) #define GET_ZONE_FROM_SEC(sbi, secno) \ - ((secno) / (sbi)->secs_per_zone) + (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) @@ -129,23 +146,28 @@ enum { }; /* - * In the victim_sel_policy->alloc_mode, there are two block allocation modes. + * In the victim_sel_policy->alloc_mode, there are three block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. + * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into + * fragmented segment which has similar aging degree. */ enum { LFS = 0, - SSR + SSR, + AT_SSR, }; /* - * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. + * In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes. * GC_CB is based on cost-benefit algorithm. * GC_GREEDY is based on greedy algorithm. + * GC_AT is based on age-threshold algorithm. */ enum { GC_CB = 0, GC_GREEDY, + GC_AT, ALLOC_NEXT, FLUSH_DEVICE, MAX_GC_POLICY, @@ -154,24 +176,28 @@ enum { /* * BG_GC means the background cleaning job. * FG_GC means the on-demand cleaning job. - * FORCE_FG_GC means on-demand cleaning job in background. 
*/ enum { BG_GC = 0, FG_GC, - FORCE_FG_GC, }; /* for a function parameter to select a victim segment */ struct victim_sel_policy { int alloc_mode; /* LFS or SSR */ int gc_mode; /* GC_CB or GC_GREEDY */ - unsigned long *dirty_segmap; /* dirty segment bitmap */ - unsigned int max_search; /* maximum # of segments to search */ + unsigned long *dirty_bitmap; /* dirty segment/section bitmap */ + unsigned int max_search; /* + * maximum # of segments/sections + * to search + */ unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ + unsigned long long oldest_age; /* oldest age of segments having the same min cost */ unsigned int min_segno; /* segment # having min. cost */ + unsigned long long age; /* mtime of GCed section*/ + unsigned long long age_threshold;/* age threshold */ }; struct seg_entry { @@ -184,7 +210,7 @@ struct seg_entry { unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ #endif /* - * # of valid blocks and the validity bitmap stored in the the last + * # of valid blocks and the validity bitmap stored in the last * checkpoint pack. This information is used by the SSR mode. */ unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ @@ -202,10 +228,10 @@ struct segment_allocation { #define MAX_SKIP_GC_COUNT 16 -struct inmem_pages { +struct revoke_entry { struct list_head list; - struct page *page; block_t old_addr; /* for revoking when fail to commit */ + pgoff_t index; }; struct sit_info { @@ -237,6 +263,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. 
modification time */ + unsigned long long dirty_min_mtime; /* rerange candidates in GC_AT */ + unsigned long long dirty_max_mtime; /* rerange candidates in GC_AT */ unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; @@ -266,15 +294,19 @@ enum dirty_type { struct dirty_seglist_info { const struct victim_selection *v_ops; /* victim selction operation */ unsigned long *dirty_segmap[NR_DIRTY_TYPE]; + unsigned long *dirty_secmap; struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ + unsigned long *pinned_secmap; /* pinned victims from foreground GC */ + unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */ + bool enable_pin_section; /* enable pinning section */ }; /* victim selection function for cleaning and SSR */ struct victim_selection { int (*get_victim)(struct f2fs_sb_info *, unsigned int *, - int, int, char); + int, int, char, unsigned long long); }; /* for active log information */ @@ -284,10 +316,13 @@ struct curseg_info { struct rw_semaphore journal_rwsem; /* protect journal area */ struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ + unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ + int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */ + bool inited; /* indicate inmem log is inited */ }; struct sit_entry_set { @@ -301,8 +336,6 @@ struct sit_entry_set { */ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) { - if (type == CURSEG_COLD_DATA_PINNED) - type = CURSEG_COLD_DATA; return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } @@ -334,8 
+367,20 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, } static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno) + unsigned int segno, bool use_section) { + if (use_section && __is_large_section(sbi)) { + unsigned int start_segno = START_SEGNO(segno); + unsigned int blocks = 0; + int i; + + for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) { + struct seg_entry *se = get_seg_entry(sbi, start_segno); + + blocks += se->ckpt_valid_blocks; + } + return blocks; + } return get_seg_entry(sbi, segno)->ckpt_valid_blocks; } @@ -407,6 +452,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); clear_bit(segno, free_i->free_segmap); @@ -414,7 +460,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); - if (next >= start_segno + sbi->segs_per_sec) { + if (next >= start_segno + usable_segs) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; } @@ -434,22 +480,23 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, } static inline void __set_test_and_free(struct f2fs_sb_info *sbi, - unsigned int segno) + unsigned int segno, bool inmem) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; - if (IS_CURSEC(sbi, secno)) + if (!inmem && IS_CURSEC(sbi, secno)) goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + 
sbi->segs_per_sec, start_segno); - if (next >= start_segno + sbi->segs_per_sec) { + if (next >= start_segno + usable_segs) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; } @@ -496,9 +543,10 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi) return FREE_I(sbi)->free_segments; } -static inline int reserved_segments(struct f2fs_sb_info *sbi) +static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { - return SM_I(sbi)->reserved_segments; + return SM_I(sbi)->reserved_segments + + SM_I(sbi)->additional_reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) @@ -528,22 +576,21 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); + return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } -static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, + unsigned int node_blocks, unsigned int dent_blocks) { - unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + - get_pages(sbi, F2FS_DIRTY_DENTS); - unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int segno, left_blocks; int i; /* check current node segment */ for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { segno = CURSEG_I(sbi, i)->segno; - left_blocks = sbi->blocks_per_seg - - get_seg_entry(sbi, segno)->ckpt_valid_blocks; + left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (node_blocks > left_blocks) return false; @@ -551,7 +598,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) /* check current data segment */ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; - left_blocks = sbi->blocks_per_seg - + left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - get_seg_entry(sbi, 
segno)->ckpt_valid_blocks; if (dent_blocks > left_blocks) return false; @@ -561,19 +608,28 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS) + + get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); + unsigned int free, need_lower, need_upper; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - if (free_sections(sbi) + freed == reserved_sections(sbi) + needed && - has_curseg_enough_space(sbi)) + free = free_sections(sbi) + freed; + need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed; + need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + + if (free > need_upper) return false; - return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + needed); + else if (free <= need_lower) + return true; + return !has_curseg_enough_space(sbi, node_blocks, dent_blocks); } static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) @@ -610,7 +666,9 @@ static inline int utilization(struct f2fs_sb_info *sbi) * pages over min_fsync_blocks. (=default option) * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests. * F2FS_IPU_NOCACHE - disable IPU bio cache. - * F2FS_IPUT_DISABLE - disable IPU. 
(=default option in LFS mode) + * F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has + * FI_OPU_WRITE flag. + * F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 @@ -626,6 +684,7 @@ enum { F2FS_IPU_FSYNC, F2FS_IPU_ASYNC, F2FS_IPU_NOCACHE, + F2FS_IPU_HONOR_OPU_WRITE, }; static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, @@ -673,35 +732,43 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; + unsigned int usable_blks_per_seg = f2fs_usable_blks_in_seg(sbi, segno); /* check bitmap with valid block count */ do { if (is_valid) { next_pos = find_next_zero_bit_le(&raw_sit->valid_map, - sbi->blocks_per_seg, + usable_blks_per_seg, cur_pos); valid_blocks += next_pos - cur_pos; } else next_pos = find_next_bit_le(&raw_sit->valid_map, - sbi->blocks_per_seg, + usable_blks_per_seg, cur_pos); cur_pos = next_pos; is_valid = !is_valid; - } while (cur_pos < sbi->blocks_per_seg); + } while (cur_pos < usable_blks_per_seg); if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { f2fs_err(sbi, "Mismatch valid blocks %d vs. 
%d", GET_SIT_VBLOCKS(raw_sit), valid_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } + if (usable_blks_per_seg < sbi->blocks_per_seg) + f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + usable_blks_per_seg) != sbi->blocks_per_seg); + /* check segment usage, and check boundary of a given segment number */ - if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg || segno > TOTAL_SEGS(sbi) - 1)) { f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } return 0; @@ -756,7 +823,7 @@ static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, bool base_time) { struct sit_info *sit_i = SIT_I(sbi); - time64_t diff, now = ktime_get_real_seconds(); + time64_t diff, now = ktime_get_boottime_seconds(); if (now >= sit_i->mounted_time) return sit_i->elapsed_time + now - sit_i->mounted_time; @@ -816,7 +883,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * BIO_MAX_PAGES; + return 8 * BIO_MAX_VECS; else return 0; } @@ -833,7 +900,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - desired = BIO_MAX_PAGES; + desired = BIO_MAX_VECS; if (type == NODE) desired <<= 1; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index a467aca29cfe..dd3c3c7a90ec 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -18,9 +18,7 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; - - return count > 0 ? 
count : 0; + return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT]; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) @@ -58,7 +56,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, /* count extent cache entries */ count += __count_extent_cache(sbi); - /* shrink clean nat cache entries */ + /* count clean nat cache entries */ count += __count_nat_entries(sbi); /* count free nids cache entries */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 65a7a432dfee..3834ead04620 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -8,9 +8,10 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/sched/mm.h> #include <linux/statfs.h> #include <linux/buffer_head.h> -#include <linux/backing-dev.h> #include <linux/kthread.h> #include <linux/parser.h> #include <linux/mount.h> @@ -24,13 +25,16 @@ #include <linux/sysfs.h> #include <linux/quota.h> #include <linux/unicode.h> +#include <linux/part_stat.h> +#include <linux/zstd.h> +#include <linux/lz4.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "xattr.h" #include "gc.h" -#include "trace.h" +#include "iostat.h" #define CREATE_TRACE_POINTS #include <trace/events/f2fs.h> @@ -44,7 +48,6 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", - [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", @@ -55,6 +58,9 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_CHECKPOINT] = "checkpoint error", [FAULT_DISCARD] = "discard error", [FAULT_WRITE_IO] = "write IO error", + [FAULT_SLAB_ALLOC] = "slab alloc", + [FAULT_DQUOT_INIT] = "dquot initialize", + [FAULT_LOCK_OP] = "lock_op", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -133,17 +139,28 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, - Opt_whint, Opt_alloc, Opt_fsync, 
Opt_test_dummy_encryption, + Opt_inlinecrypt, Opt_checkpoint_disable, Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, + Opt_checkpoint_merge, + Opt_nocheckpoint_merge, Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, + Opt_nocompress_extension, + Opt_compress_chksum, + Opt_compress_mode, + Opt_compress_cache, + Opt_atgc, + Opt_gc_merge, + Opt_nogc_merge, + Opt_discard_unit, + Opt_memory_mode, Opt_err, }; @@ -198,17 +215,29 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_inlinecrypt, "inlinecrypt"}, {Opt_checkpoint_disable, "checkpoint=disable"}, {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, {Opt_checkpoint_enable, "checkpoint=enable"}, + {Opt_checkpoint_merge, "checkpoint_merge"}, + {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, + {Opt_nocompress_extension, "nocompress_extension=%s"}, + {Opt_compress_chksum, "compress_chksum"}, + {Opt_compress_mode, "compress_mode=%s"}, + {Opt_compress_cache, "compress_cache"}, + {Opt_atgc, "atgc"}, + {Opt_gc_merge, "gc_merge"}, + {Opt_nogc_merge, "nogc_merge"}, + {Opt_discard_unit, "discard_unit=%s"}, + {Opt_memory_mode, "memory=%s"}, {Opt_err, NULL}, }; @@ -229,42 +258,53 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) 
va_end(args); } -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) static const struct f2fs_sb_encodings { __u16 magic; char *name; - char *version; + unsigned int version; } f2fs_sb_encoding_map[] = { - {F2FS_ENC_UTF8_12_1, "utf8", "12.1.0"}, + {F2FS_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; -static int f2fs_sb_read_encoding(const struct f2fs_super_block *sb, - const struct f2fs_sb_encodings **encoding, - __u16 *flags) +static const struct f2fs_sb_encodings * +f2fs_sb_read_encoding(const struct f2fs_super_block *sb) { __u16 magic = le16_to_cpu(sb->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(f2fs_sb_encoding_map); i++) if (magic == f2fs_sb_encoding_map[i].magic) - break; - - if (i >= ARRAY_SIZE(f2fs_sb_encoding_map)) - return -EINVAL; + return &f2fs_sb_encoding_map[i]; - *encoding = &f2fs_sb_encoding_map[i]; - *flags = le16_to_cpu(sb->s_encoding_flags); + return NULL; +} +struct kmem_cache *f2fs_cf_name_slab; +static int __init f2fs_create_casefold_cache(void) +{ + f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", + F2FS_NAME_LEN); + if (!f2fs_cf_name_slab) + return -ENOMEM; return 0; } + +static void f2fs_destroy_casefold_cache(void) +{ + kmem_cache_destroy(f2fs_cf_name_slab); +} +#else +static int __init f2fs_create_casefold_cache(void) { return 0; } +static void f2fs_destroy_casefold_cache(void) { } #endif static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { - block_t limit = min((sbi->user_block_count << 1) / 1000, + block_t limit = min((sbi->user_block_count >> 3), sbi->user_block_count - sbi->reserved_blocks); - /* limit is 0.2% */ + /* limit is 12.5% */ if (test_opt(sbi, RESERVE_ROOT) && F2FS_OPTION(sbi).root_reserved_blocks > limit) { F2FS_OPTION(sbi).root_reserved_blocks = limit; @@ -283,6 +323,62 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).s_resgid)); } +static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi) +{ + unsigned int sec_blks = sbi->blocks_per_seg * 
sbi->segs_per_sec; + unsigned int avg_vblocks; + unsigned int wanted_reserved_segments; + block_t avail_user_block_count; + + if (!F2FS_IO_ALIGNED(sbi)) + return 0; + + /* average valid block count in section in worst case */ + avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi); + + /* + * we need enough free space when migrating one section in worst case + */ + wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) * + reserved_segments(sbi); + wanted_reserved_segments -= reserved_segments(sbi); + + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks - + F2FS_OPTION(sbi).root_reserved_blocks; + + if (wanted_reserved_segments * sbi->blocks_per_seg > + avail_user_block_count) { + f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u", + wanted_reserved_segments, + avail_user_block_count >> sbi->log_blocks_per_seg); + return -ENOSPC; + } + + SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments; + + f2fs_info(sbi, "IO align feature needs additional reserved segment: %u", + wanted_reserved_segments); + + return 0; +} + +static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi) +{ + if (!F2FS_OPTION(sbi).unusable_cap_perc) + return; + + if (F2FS_OPTION(sbi).unusable_cap_perc == 100) + F2FS_OPTION(sbi).unusable_cap = sbi->user_block_count; + else + F2FS_OPTION(sbi).unusable_cap = (sbi->user_block_count / 100) * + F2FS_OPTION(sbi).unusable_cap_perc; + + f2fs_info(sbi, "Adjust unusable cap for checkpoint=disable = %u / %u%%", + F2FS_OPTION(sbi).unusable_cap, + F2FS_OPTION(sbi).unusable_cap_perc); +} + static void init_once(void *foo) { struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; @@ -330,7 +426,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, set_opt(sbi, QUOTA); return 0; errout: - kvfree(qname); + kfree(qname); return ret; } @@ -342,7 +438,7 @@ static int f2fs_clear_qf_name(struct super_block *sb, int qtype) f2fs_err(sbi, "Cannot change 
journaled quota options when quota turned on"); return -EINVAL; } - kvfree(F2FS_OPTION(sbi).s_qf_names[qtype]); + kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; return 0; } @@ -393,24 +489,190 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } #endif -static int parse_options(struct super_block *sb, char *options) +static int f2fs_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct fs_parameter param = { + .type = fs_value_is_string, + .string = arg->from ? arg->from : "", + }; + struct fscrypt_dummy_policy *policy = + &F2FS_OPTION(sbi).dummy_enc_policy; + int err; + + if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { + f2fs_warn(sbi, "test_dummy_encryption option not supported"); + return -EINVAL; + } + + if (!f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, "Encrypt feature is off"); + return -EINVAL; + } + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. 
+ */ + if (is_remount && !fscrypt_is_dummy_policy_set(policy)) { + f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + + err = fscrypt_parse_test_dummy_encryption(&param, policy); + if (err) { + if (err == -EEXIST) + f2fs_warn(sbi, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + f2fs_warn(sbi, "Value of option \"%s\" is unrecognized", + opt); + else + f2fs_warn(sbi, "Error processing option \"%s\" [%d]", + opt, err); + return -EINVAL; + } + err = fscrypt_add_test_dummy_key(sb, policy); + if (err) { + f2fs_warn(sbi, "Error adding test dummy encryption key [%d]", + err); + return err; + } + f2fs_warn(sbi, "Test dummy encryption mode enabled"); + return 0; +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +/* + * 1. The same extension name cannot not appear in both compress and non-compress extension + * at the same time. + * 2. If the compress extension specifies all files, the types specified by the non-compress + * extension will be treated as special cases and will not be compressed. + * 3. Don't allow the non-compress extension specifies all files. 
+ */ +static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) +{ + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + unsigned char (*noext)[F2FS_EXTENSION_LEN]; + int ext_cnt, noext_cnt, index = 0, no_index = 0; + + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + noext = F2FS_OPTION(sbi).noextensions; + noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + + if (!noext_cnt) + return 0; + + for (no_index = 0; no_index < noext_cnt; no_index++) { + if (!strcasecmp("*", noext[no_index])) { + f2fs_info(sbi, "Don't allow the nocompress extension specifies all files"); + return -EINVAL; + } + for (index = 0; index < ext_cnt; index++) { + if (!strcasecmp(ext[index], noext[no_index])) { + f2fs_info(sbi, "Don't allow the same extension %s appear in both compress and nocompress extension", + ext[index]); + return -EINVAL; + } + } + } + return 0; +} + +#ifdef CONFIG_F2FS_FS_LZ4 +static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + unsigned int level; +#endif + + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + +#ifdef CONFIG_F2FS_FS_LZ4HC + str += 3; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) { + f2fs_info(sbi, "invalid lz4hc compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +#else + f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + return -EINVAL; +#endif +} +#endif + +#ifdef CONFIG_F2FS_FS_ZSTD +static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) +{ + unsigned int level; + int len = 4; + + if (strlen(str) == len) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + + str += len; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. 
<alg_name>:<compr_level>"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (!level || level > zstd_max_clevel()) { + f2fs_info(sbi, "invalid zstd compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +} +#endif +#endif + +static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; +#ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; + unsigned char (*noext)[F2FS_EXTENSION_LEN]; + int ext_cnt, noext_cnt; +#endif char *p, *name; - int arg = 0, ext_cnt; + int arg = 0; kuid_t uid; kgid_t gid; -#ifdef CONFIG_QUOTA int ret; -#endif if (!options) - return 0; + goto default_check; while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; /* @@ -426,35 +688,36 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (strlen(name) == 2 && !strncmp(name, "on", 2)) { - set_opt(sbi, BG_GC); - clear_opt(sbi, FORCE_FG_GC); - } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { - clear_opt(sbi, BG_GC); - clear_opt(sbi, FORCE_FG_GC); - } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) { - set_opt(sbi, BG_GC); - set_opt(sbi, FORCE_FG_GC); + if (!strcmp(name, "on")) { + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; + } else if (!strcmp(name, "off")) { + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF; + } else if (!strcmp(name, "sync")) { + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC; } else { - kvfree(name); + kfree(name); return -EINVAL; } - kvfree(name); + kfree(name); break; case Opt_disable_roll_forward: set_opt(sbi, DISABLE_ROLL_FORWARD); break; case Opt_norecovery: /* this option mounts f2fs with ro */ - set_opt(sbi, DISABLE_ROLL_FORWARD); + set_opt(sbi, NORECOVERY); if (!f2fs_readonly(sb)) return -EINVAL; break; case Opt_discard: + if (!f2fs_hw_support_discard(sbi)) { + f2fs_warn(sbi, "device does 
not support discard"); + break; + } set_opt(sbi, DISCARD); break; case Opt_nodiscard: - if (f2fs_sb_has_blkzoned(sbi)) { + if (f2fs_hw_should_discard(sbi)) { f2fs_warn(sbi, "discard is required for zoned block devices"); return -EINVAL; } @@ -517,7 +780,8 @@ static int parse_options(struct super_block *sb, char *options) case Opt_active_logs: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) + if (arg != 2 && arg != 4 && + arg != NR_CURSEG_PERSIST_TYPE) return -EINVAL; F2FS_OPTION(sbi).active_logs = arg; break; @@ -593,29 +857,31 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (strlen(name) == 8 && - !strncmp(name, "adaptive", 8)) { + if (!strcmp(name, "adaptive")) { if (f2fs_sb_has_blkzoned(sbi)) { f2fs_warn(sbi, "adaptive mode is not allowed with zoned block device feature"); - kvfree(name); + kfree(name); return -EINVAL; } - set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); - } else if (strlen(name) == 3 && - !strncmp(name, "lfs", 3)) { - set_opt_mode(sbi, F2FS_MOUNT_LFS); + F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; + } else if (!strcmp(name, "lfs")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; + } else if (!strcmp(name, "fragment:segment")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG; + } else if (!strcmp(name, "fragment:block")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK; } else { - kvfree(name); + kfree(name); return -EINVAL; } - kvfree(name); + kfree(name); break; case Opt_io_size_bits: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) { + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) { f2fs_warn(sbi, "Not support %d, larger than %d", - 1 << arg, BIO_MAX_PAGES); + 1 << arg, BIO_MAX_VECS); return -EINVAL; } F2FS_OPTION(sbi).write_io_size_bits = arg; @@ -723,73 +989,49 @@ static int parse_options(struct super_block *sb, char *options) f2fs_info(sbi, "quota operations not 
supported"); break; #endif - case Opt_whint: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (strlen(name) == 10 && - !strncmp(name, "user-based", 10)) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; - } else if (strlen(name) == 3 && - !strncmp(name, "off", 3)) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - } else if (strlen(name) == 8 && - !strncmp(name, "fs-based", 8)) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; - } else { - kvfree(name); - return -EINVAL; - } - kvfree(name); - break; case Opt_alloc: name = match_strdup(&args[0]); if (!name) return -ENOMEM; - if (strlen(name) == 7 && - !strncmp(name, "default", 7)) { + if (!strcmp(name, "default")) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; - } else if (strlen(name) == 5 && - !strncmp(name, "reuse", 5)) { + } else if (!strcmp(name, "reuse")) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; } else { - kvfree(name); + kfree(name); return -EINVAL; } - kvfree(name); + kfree(name); break; case Opt_fsync: name = match_strdup(&args[0]); if (!name) return -ENOMEM; - if (strlen(name) == 5 && - !strncmp(name, "posix", 5)) { + if (!strcmp(name, "posix")) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; - } else if (strlen(name) == 6 && - !strncmp(name, "strict", 6)) { + } else if (!strcmp(name, "strict")) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; - } else if (strlen(name) == 9 && - !strncmp(name, "nobarrier", 9)) { + } else if (!strcmp(name, "nobarrier")) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_NOBARRIER; } else { - kvfree(name); + kfree(name); return -EINVAL; } - kvfree(name); + kfree(name); break; case Opt_test_dummy_encryption: -#ifdef CONFIG_FS_ENCRYPTION - if (!f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, "Encrypt feature is off"); - return -EINVAL; - } - - F2FS_OPTION(sbi).test_dummy_encryption = true; - f2fs_info(sbi, "Test dummy encryption mode enabled"); + ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], + is_remount); + if (ret) + return ret; + break; + 
case Opt_inlinecrypt: +#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT + sb->s_flags |= SB_INLINECRYPT; #else - f2fs_info(sbi, "Test dummy encryption mount option ignored"); + f2fs_info(sbi, "inline encryption not supported"); #endif break; case Opt_checkpoint_disable_cap_perc: @@ -797,12 +1039,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; if (arg < 0 || arg > 100) return -EINVAL; - if (arg == 100) - F2FS_OPTION(sbi).unusable_cap = - sbi->user_block_count; - else - F2FS_OPTION(sbi).unusable_cap = - (sbi->user_block_count / 100) * arg; + F2FS_OPTION(sbi).unusable_cap_perc = arg; set_opt(sbi, DISABLE_CHECKPOINT); break; case Opt_checkpoint_disable_cap: @@ -817,21 +1054,61 @@ static int parse_options(struct super_block *sb, char *options) case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; + case Opt_checkpoint_merge: + set_opt(sbi, MERGE_CHECKPOINT); + break; + case Opt_nocheckpoint_merge: + clear_opt(sbi, MERGE_CHECKPOINT); + break; +#ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature if off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } name = match_strdup(&args[0]); if (!name) return -ENOMEM; - if (strlen(name) == 3 && !strcmp(name, "lzo")) { + if (!strcmp(name, "lzo")) { +#ifdef CONFIG_F2FS_FS_LZO + F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; - } else if (strlen(name) == 3 && - !strcmp(name, "lz4")) { +#else + f2fs_info(sbi, "kernel doesn't support lzo compression"); +#endif + } else if (!strncmp(name, "lz4", 3)) { +#ifdef CONFIG_F2FS_FS_LZ4 + ret = f2fs_set_lz4hc_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; +#else + f2fs_info(sbi, "kernel doesn't support lz4 compression"); +#endif + } else if (!strncmp(name, "zstd", 4)) { +#ifdef CONFIG_F2FS_FS_ZSTD + ret = 
f2fs_set_zstd_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_ZSTD; +#else + f2fs_info(sbi, "kernel doesn't support zstd compression"); +#endif + } else if (!strcmp(name, "lzo-rle")) { +#ifdef CONFIG_F2FS_FS_LZORLE + F2FS_OPTION(sbi).compress_level = 0; + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_LZORLE; +#else + f2fs_info(sbi, "kernel doesn't support lzorle compression"); +#endif } else { kfree(name); return -EINVAL; @@ -840,8 +1117,8 @@ static int parse_options(struct super_block *sb, char *options) break; case Opt_compress_log_size: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature is off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } if (args->from && match_int(args, &arg)) return -EINVAL; @@ -855,8 +1132,8 @@ static int parse_options(struct super_block *sb, char *options) break; case Opt_compress_extension: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature is off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } name = match_strdup(&args[0]); if (!name) @@ -877,12 +1154,112 @@ static int parse_options(struct super_block *sb, char *options) F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; + case Opt_nocompress_extension: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + noext = F2FS_OPTION(sbi).noextensions; + noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + noext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(sbi, + "invalid extension length/number"); + kfree(name); + return -EINVAL; + } + + strcpy(noext[noext_cnt], name); + F2FS_OPTION(sbi).nocompress_ext_cnt++; + kfree(name); + break; + case Opt_compress_chksum: + F2FS_OPTION(sbi).compress_chksum = true; + break; + case Opt_compress_mode: + 
name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "fs")) { + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; + } else if (!strcmp(name, "user")) { + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_USER; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_compress_cache: + set_opt(sbi, COMPRESS_CACHE); + break; +#else + case Opt_compress_algorithm: + case Opt_compress_log_size: + case Opt_compress_extension: + case Opt_nocompress_extension: + case Opt_compress_chksum: + case Opt_compress_mode: + case Opt_compress_cache: + f2fs_info(sbi, "compression options not supported"); + break; +#endif + case Opt_atgc: + set_opt(sbi, ATGC); + break; + case Opt_gc_merge: + set_opt(sbi, GC_MERGE); + break; + case Opt_nogc_merge: + clear_opt(sbi, GC_MERGE); + break; + case Opt_discard_unit: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "block")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_BLOCK; + } else if (!strcmp(name, "segment")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SEGMENT; + } else if (!strcmp(name, "section")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SECTION; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_memory_mode: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "normal")) { + F2FS_OPTION(sbi).memory_mode = + MEMORY_MODE_NORMAL; + } else if (!strcmp(name, "low")) { + F2FS_OPTION(sbi).memory_mode = + MEMORY_MODE_LOW; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); return -EINVAL; } } +default_check: #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; @@ -896,15 +1273,41 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } #endif -#ifndef CONFIG_UNICODE +#if !IS_ENABLED(CONFIG_UNICODE) if (f2fs_sb_has_casefold(sbi)) { 
f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); return -EINVAL; } #endif + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi)) { + f2fs_err(sbi, "Zoned block device support is not enabled"); + return -EINVAL; + } +#endif + if (f2fs_sb_has_blkzoned(sbi)) { + if (F2FS_OPTION(sbi).discard_unit != + DISCARD_UNIT_SECTION) { + f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SECTION; + } + } - if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_test_compress_extension(sbi)) { + f2fs_err(sbi, "invalid compress or nocompress extension"); + return -EINVAL; + } +#endif + + if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", F2FS_IO_SIZE_KB(sbi)); return -EINVAL; @@ -934,16 +1337,20 @@ static int parse_options(struct super_block *sb, char *options) } } - if (test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, LFS)) { - f2fs_err(sbi, "LFS not compatible with checkpoint=disable\n"); + if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) { + f2fs_err(sbi, "LFS not compatible with checkpoint=disable"); return -EINVAL; } - /* Not pass down write hints if the number of active logs is lesser - * than NR_CURSEG_TYPE. 
- */ - if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { + f2fs_err(sbi, "LFS not compatible with ATGC"); + return -EINVAL; + } + + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { + f2fs_err(sbi, "Allow to mount readonly mode only"); + return -EROFS; + } return 0; } @@ -951,7 +1358,12 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO); + if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) { + f2fs_show_injection_info(F2FS_SB(sb), FAULT_SLAB_ALLOC); + return NULL; + } + + fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) return NULL; @@ -959,16 +1371,14 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); - init_rwsem(&fi->i_sem); + atomic_set(&fi->i_compr_blocks, 0); + init_f2fs_rwsem(&fi->i_sem); + spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); - INIT_LIST_HEAD(&fi->inmem_ilist); - INIT_LIST_HEAD(&fi->inmem_pages); - mutex_init(&fi->inmem_lock); - init_rwsem(&fi->i_gc_rwsem[READ]); - init_rwsem(&fi->i_gc_rwsem[WRITE]); - init_rwsem(&fi->i_mmap_sem); - init_rwsem(&fi->i_xattr_sem); + init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); + init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); + init_f2fs_rwsem(&fi->i_xattr_sem); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; @@ -1006,9 +1416,7 @@ static int f2fs_drop_inode(struct inode *inode) atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - /* some remained atomic pages should discarded */ - if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); @@ -1091,9 +1499,6 @@ static void f2fs_dirty_inode(struct 
inode *inode, int flags) inode->i_ino == F2FS_META_INO(sbi)) return; - if (flags == I_DIRTY_TIME) - return; - if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); @@ -1108,8 +1513,9 @@ static void f2fs_free_inode(struct inode *inode) static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); + percpu_counter_destroy(&sbi->rf_node_block_count); + percpu_counter_destroy(&sbi->alloc_valid_block_count); } static void destroy_device_list(struct f2fs_sb_info *sbi) @@ -1131,12 +1537,21 @@ static void f2fs_put_super(struct super_block *sb) int i; bool dropped; + /* unregister procfs/sysfs entries in advance to avoid race case */ + f2fs_unregister_sysfs(sbi); + f2fs_quota_off_umount(sb); /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); /* + * flush all issued checkpoints and stop checkpoint issue thread. + * after then, all checkpoints should be done by each process context. + */ + f2fs_stop_ckpt_thread(sbi); + + /* * We don't need to do checkpoint when superblock is clean. * But, the previous checkpoint was not done by umount, it needs to do * clean checkpoint again. 
@@ -1172,10 +1587,12 @@ static void f2fs_put_super(struct super_block *sb) /* our cp_error case, we can wait for any writeback page */ f2fs_flush_merged_writes(sbi); - f2fs_wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); f2fs_bug_on(sbi, sbi->fsync_node_num); + f2fs_destroy_compress_inode(sbi); + iput(sbi->node_inode); sbi->node_inode = NULL; @@ -1196,26 +1613,28 @@ static void f2fs_put_super(struct super_block *sb) kvfree(sbi->ckpt); - f2fs_unregister_sysfs(sbi); - sb->s_fs_info = NULL; if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); - kvfree(sbi->raw_super); + kfree(sbi->raw_super); destroy_device_list(sbi); + f2fs_destroy_page_array_cache(sbi); + f2fs_destroy_xattr_caches(sbi); mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kvfree(F2FS_OPTION(sbi).s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); destroy_percpu_info(sbi); + f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); -#ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); +#if IS_ENABLED(CONFIG_UNICODE) + utf8_unload(sb->s_encoding); #endif - kvfree(sbi); + kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) @@ -1233,16 +1652,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return -EAGAIN; - if (sync) { - struct cp_control cpc; - - cpc.reason = __get_cp_reason(sbi); - - down_write(&sbi->gc_lock); - err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); - } - f2fs_trace_ios(NULL, 1); + if (sync) + err = f2fs_issue_checkpoint(sbi); return err; } @@ -1259,11 +1670,18 @@ static int f2fs_freeze(struct super_block *sb) /* must be clean, since sync_filesystem() was already called */ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; + + /* Let's flush checkpoints and stop the thread. 
*/ + f2fs_flush_ckpt_thread(F2FS_SB(sb)); + + /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ + set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } static int f2fs_unfreeze(struct super_block *sb) { + clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } @@ -1288,7 +1706,8 @@ static int f2fs_statfs_project(struct super_block *sb, limit >>= sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { - curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + curblock = (dquot->dq_dqb.dqb_curspace + + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; buf->f_blocks = limit; buf->f_bfree = buf->f_bavail = (buf->f_blocks > curblock) ? @@ -1318,18 +1737,23 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count; u64 avail_node_count; + unsigned int total_valid_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); - user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; + + spin_lock(&sbi->stat_lock); + + user_block_count = sbi->user_block_count; + total_valid_node_count = valid_node_count(sbi); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - spin_lock(&sbi->stat_lock); if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) buf->f_bfree = 0; else @@ -1342,20 +1766,17 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) else buf->f_bavail = 0; - avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - if (avail_node_count > user_block_count) { buf->f_files = user_block_count; buf->f_ffree = buf->f_bavail; } else { buf->f_files = avail_node_count; - buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_ffree = 
min(avail_node_count - total_valid_node_count, buf->f_bavail); } buf->f_namelen = F2FS_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); #ifdef CONFIG_QUOTA if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && @@ -1403,6 +1824,7 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #endif } +#ifdef CONFIG_F2FS_FS_COMPRESSION static inline void f2fs_show_compress_options(struct seq_file *seq, struct super_block *sb) { @@ -1420,9 +1842,18 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, case COMPRESS_LZ4: algtype = "lz4"; break; + case COMPRESS_ZSTD: + algtype = "zstd"; + break; + case COMPRESS_LZORLE: + algtype = "lzo-rle"; + break; } seq_printf(seq, ",compress_algorithm=%s", algtype); + if (F2FS_OPTION(sbi).compress_level) + seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level); + seq_printf(seq, ",compress_log_size=%u", F2FS_OPTION(sbi).compress_log_size); @@ -1430,22 +1861,43 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, seq_printf(seq, ",compress_extension=%s", F2FS_OPTION(sbi).extensions[i]); } + + for (i = 0; i < F2FS_OPTION(sbi).nocompress_ext_cnt; i++) { + seq_printf(seq, ",nocompress_extension=%s", + F2FS_OPTION(sbi).noextensions[i]); + } + + if (F2FS_OPTION(sbi).compress_chksum) + seq_puts(seq, ",compress_chksum"); + + if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_FS) + seq_printf(seq, ",compress_mode=%s", "fs"); + else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) + seq_printf(seq, ",compress_mode=%s", "user"); + + if (test_opt(sbi, COMPRESS_CACHE)) + seq_puts(seq, ",compress_cache"); } +#endif static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) { - if (test_opt(sbi, FORCE_FG_GC)) - seq_printf(seq, ",background_gc=%s", "sync"); - else - seq_printf(seq, ",background_gc=%s", "on"); - } else 
{ + if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) + seq_printf(seq, ",background_gc=%s", "sync"); + else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_ON) + seq_printf(seq, ",background_gc=%s", "on"); + else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) seq_printf(seq, ",background_gc=%s", "off"); - } + + if (test_opt(sbi, GC_MERGE)) + seq_puts(seq, ",gc_merge"); + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); + if (test_opt(sbi, NORECOVERY)) + seq_puts(seq, ",norecovery"); if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); else @@ -1497,10 +1949,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",data_flush"); seq_puts(seq, ",mode="); - if (test_opt(sbi, ADAPTIVE)) + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_ADAPTIVE) seq_puts(seq, "adaptive"); - else if (test_opt(sbi, LFS)) + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS) seq_puts(seq, "lfs"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG) + seq_puts(seq, "fragment:segment"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + seq_puts(seq, "fragment:block"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", @@ -1531,14 +1987,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) - seq_printf(seq, ",whint_mode=%s", "user-based"); - else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) - seq_printf(seq, ",whint_mode=%s", "fs-based"); -#ifdef CONFIG_FS_ENCRYPTION - if (F2FS_OPTION(sbi).test_dummy_encryption) - seq_puts(seq, ",test_dummy_encryption"); -#endif + + fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); + + if (sbi->sb->s_flags & SB_INLINECRYPT) + seq_puts(seq, ",inlinecrypt"); if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) 
seq_printf(seq, ",alloc_mode=%s", "default"); @@ -1548,6 +2001,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISABLE_CHECKPOINT)) seq_printf(seq, ",checkpoint=disable:%u", F2FS_OPTION(sbi).unusable_cap); + if (test_opt(sbi, MERGE_CHECKPOINT)) + seq_puts(seq, ",checkpoint_merge"); + else + seq_puts(seq, ",nocheckpoint_merge"); if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) @@ -1555,40 +2012,69 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) seq_printf(seq, ",fsync_mode=%s", "nobarrier"); +#ifdef CONFIG_F2FS_FS_COMPRESSION f2fs_show_compress_options(seq, sbi->sb); +#endif + + if (test_opt(sbi, ATGC)) + seq_puts(seq, ",atgc"); + + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK) + seq_printf(seq, ",discard_unit=%s", "block"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + seq_printf(seq, ",discard_unit=%s", "segment"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + seq_printf(seq, ",discard_unit=%s", "section"); + + if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL) + seq_printf(seq, ",memory=%s", "normal"); + else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW) + seq_printf(seq, ",memory=%s", "low"); + return 0; } static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ - F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE; + if (f2fs_sb_has_readonly(sbi)) + F2FS_OPTION(sbi).active_logs = NR_CURSEG_RO_TYPE; + else + F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; + F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; - F2FS_OPTION(sbi).test_dummy_encryption = false; 
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); - F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; + F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; F2FS_OPTION(sbi).compress_ext_cnt = 0; + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; + F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; + + sbi->sb->s_flags &= ~SB_INLINECRYPT; - set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); + set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - set_opt(sbi, DISCARD); - if (f2fs_sb_has_blkzoned(sbi)) - set_opt_mode(sbi, F2FS_MOUNT_LFS); - else - set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) + set_opt(sbi, DISCARD); + if (f2fs_sb_has_blkzoned(sbi)) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; + } else { + F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; + } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -1608,6 +2094,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; + unsigned int gc_mode = sbi->gc_mode; int err = 0; int ret; block_t unusable; @@ -1618,11 +2105,25 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) } sbi->sb->s_flags |= SB_ACTIVE; + /* check if we need more GC first */ + unusable = f2fs_get_unusable_blocks(sbi); + if (!f2fs_disable_cp_again(sbi, unusable)) + goto skip_gc; + f2fs_update_time(sbi, DISABLE_TIME); + sbi->gc_mode = GC_URGENT_HIGH; + while (!f2fs_time_over(sbi, 
DISABLE_TIME)) { - down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, NULL_SEGNO); + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 1 }; + + f2fs_down_write(&sbi->gc_lock); + err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { err = 0; break; @@ -1633,7 +2134,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) ret = sync_filesystem(sbi->sb); if (ret || err) { - err = ret ? ret: err; + err = ret ? ret : err; goto restore_flag; } @@ -1643,7 +2144,8 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } - down_write(&sbi->gc_lock); +skip_gc: + f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); err = f2fs_write_checkpoint(sbi, &cpc); @@ -1655,22 +2157,37 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) spin_unlock(&sbi->stat_lock); out_unlock: - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); restore_flag: - sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sbi->gc_mode = gc_mode; + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { - down_write(&sbi->gc_lock); + int retry = DEFAULT_RETRY_IO_COUNT; + + /* we should flush all the data to keep data consistency */ + do { + sync_inodes_sb(sbi->sb); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); + + if (unlikely(retry < 0)) + f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); + + f2fs_down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); clear_sbi_flag(sbi, SBI_CP_DISABLED); set_sbi_flag(sbi, SBI_IS_DIRTY); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); + + /* Let's ensure there's no pending checkpoint anymore */ + f2fs_flush_ckpt_thread(sbi); } static int f2fs_remount(struct 
super_block *sb, int *flags, char *data) @@ -1679,12 +2196,18 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; int err; - bool need_restart_gc = false; - bool need_stop_gc = false; + bool need_restart_gc = false, need_stop_gc = false; + bool need_restart_ckpt = false, need_stop_ckpt = false; + bool need_restart_flush = false, need_stop_flush = false; + bool need_restart_discard = false, need_stop_discard = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); - bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); + bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); - bool checkpoint_changed; + bool no_atgc = !test_opt(sbi, ATGC); + bool no_discard = !test_opt(sbi, DISCARD); + bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); + bool block_unit_discard = f2fs_block_unit_discard(sbi); + struct discard_cmd_control *dcc; #ifdef CONFIG_QUOTA int i, j; #endif @@ -1705,7 +2228,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) GFP_KERNEL); if (!org_mount_opt.s_qf_names[i]) { for (j = 0; j < i; j++) - kvfree(org_mount_opt.s_qf_names[j]); + kfree(org_mount_opt.s_qf_names[j]); return -ENOMEM; } } else { @@ -1726,11 +2249,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi); /* parse mount options */ - err = parse_options(sb, data); + err = parse_options(sb, data, true); if (err) goto restore_opts; - checkpoint_changed = - disable_checkpoint != test_opt(sbi, DISABLE_CHECKPOINT); /* * Previous and new state of filesystem is RO, @@ -1739,6 +2260,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) goto skip; + if (f2fs_sb_has_readonly(sbi) && !(*flags & SB_RDONLY)) { + err = -EROFS; + goto restore_opts; + } + #ifdef CONFIG_QUOTA if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) 
{ err = dquot_suspend(sb, -1); @@ -1756,6 +2282,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } #endif + /* disallow enable atgc dynamically */ + if (no_atgc == !!test_opt(sbi, ATGC)) { + err = -EINVAL; + f2fs_warn(sbi, "switch atgc option is not allowed"); + goto restore_opts; + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1769,6 +2302,18 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch compress_cache option is not allowed"); + goto restore_opts; + } + + if (block_unit_discard != f2fs_block_unit_discard(sbi)) { + err = -EINVAL; + f2fs_warn(sbi, "switch discard_unit option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); @@ -1780,7 +2325,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if background_gc = off is passed in mount * option. Also sync the filesystem. 
*/ - if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) { + if ((*flags & SB_RDONLY) || + (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && + !test_opt(sbi, GC_MERGE))) { if (sbi->gc_thread) { f2fs_stop_gc_thread(sbi); need_restart_gc = true; @@ -1792,9 +2339,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY || - F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { - writeback_inodes_sb(sb, WB_REASON_SYNC); + if (*flags & SB_RDONLY) { sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -1803,14 +2348,22 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_IS_CLOSE); } - if (checkpoint_changed) { - if (test_opt(sbi, DISABLE_CHECKPOINT)) { - err = f2fs_disable_checkpoint(sbi); - if (err) - goto restore_gc; - } else { - f2fs_enable_checkpoint(sbi); + if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + !test_opt(sbi, MERGE_CHECKPOINT)) { + f2fs_stop_ckpt_thread(sbi); + need_restart_ckpt = true; + } else { + /* Flush if the prevous checkpoint, if exists. 
*/ + f2fs_flush_ckpt_thread(sbi); + + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto restore_gc; } + need_stop_ckpt = true; } /* @@ -1820,24 +2373,75 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); f2fs_destroy_flush_cmd_control(sbi, false); + need_restart_flush = true; } else { err = f2fs_create_flush_cmd_control(sbi); if (err) - goto restore_gc; + goto restore_ckpt; + need_stop_flush = true; } + + if (no_discard == !!test_opt(sbi, DISCARD)) { + if (test_opt(sbi, DISCARD)) { + err = f2fs_start_discard_thread(sbi); + if (err) + goto restore_flush; + need_stop_discard = true; + } else { + dcc = SM_I(sbi)->dcc_info; + f2fs_stop_discard_thread(sbi); + if (atomic_read(&dcc->discard_cmd_cnt)) + f2fs_issue_discard_timeout(sbi); + need_restart_discard = true; + } + } + + if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto restore_discard; + } else { + f2fs_enable_checkpoint(sbi); + } + } + skip: #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - kvfree(org_mount_opt.s_qf_names[i]); + kfree(org_mount_opt.s_qf_names[i]); #endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
SB_POSIXACL : 0); limit_reserve_root(sbi); + adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; +restore_discard: + if (need_restart_discard) { + if (f2fs_start_discard_thread(sbi)) + f2fs_warn(sbi, "discard has been stopped"); + } else if (need_stop_discard) { + f2fs_stop_discard_thread(sbi); + } +restore_flush: + if (need_restart_flush) { + if (f2fs_create_flush_cmd_control(sbi)) + f2fs_warn(sbi, "background flush thread has stopped"); + } else if (need_stop_flush) { + clear_opt(sbi, FLUSH_MERGE); + f2fs_destroy_flush_cmd_control(sbi, false); + } +restore_ckpt: + if (need_restart_ckpt) { + if (f2fs_start_ckpt_thread(sbi)) + f2fs_warn(sbi, "background ckpt thread has stopped"); + } else if (need_stop_ckpt) { + f2fs_stop_ckpt_thread(sbi); + } restore_gc: if (need_restart_gc) { if (f2fs_start_gc_thread(sbi)) @@ -1849,7 +2453,7 @@ restore_opts: #ifdef CONFIG_QUOTA F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - kvfree(F2FS_OPTION(sbi).s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; } #endif @@ -1871,7 +2475,6 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, size_t toread; loff_t i_size = i_size_read(inode); struct page *page; - char *kaddr; if (off > i_size) return 0; @@ -1885,7 +2488,7 @@ repeat: page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_NOFS); goto repeat; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); @@ -1904,9 +2507,7 @@ repeat: return -EIO; } - kaddr = kmap_atomic(page); - memcpy(data, kaddr + offset, tocopy); - kunmap_atomic(kaddr); + memcpy_from_page(data, page, offset, tocopy); f2fs_put_page(page, 1); offset = 0; @@ -1927,7 +2528,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, int offset = off & 
(sb->s_blocksize - 1); size_t towrite = len; struct page *page; - char *kaddr; + void *fsdata = NULL; int err = 0; int tocopy; @@ -1935,24 +2536,21 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); retry: - err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, - &page, NULL); + err = a_ops->write_begin(NULL, mapping, off, tocopy, + &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); break; } - kaddr = kmap_atomic(page); - memcpy(kaddr + offset, data, tocopy); - kunmap_atomic(kaddr); - flush_dcache_page(page); + memcpy_to_page(page, offset, data, tocopy); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, - page, NULL); + page, fsdata); offset = 0; towrite -= tocopy; off += tocopy; @@ -1967,6 +2565,16 @@ retry: return len - towrite; } +int f2fs_dquot_initialize(struct inode *inode) +{ + if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) { + f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_DQUOT_INIT); + return -ESRCH; + } + + return dquot_initialize(inode); +} + static struct dquot **f2fs_get_dquots(struct inode *inode) { return F2FS_I(inode)->i_dquot; @@ -2080,64 +2688,78 @@ static int f2fs_enable_quotas(struct super_block *sb) return 0; } -int f2fs_quota_sync(struct super_block *sb, int type) +static int f2fs_quota_sync_file(struct f2fs_sb_info *sbi, int type) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); - struct quota_info *dqopt = sb_dqopt(sb); - int cnt; - int ret; + struct quota_info *dqopt = sb_dqopt(sbi->sb); + struct address_space *mapping = dqopt->files[type]->i_mapping; + int ret = 0; - /* - * do_quotactl - * f2fs_quota_sync - * down_read(quota_sem) - * dquot_writeback_dquots() - * f2fs_dquot_commit - * block_operation - * down_read(quota_sem) - */ - f2fs_lock_op(sbi); + ret = 
dquot_writeback_dquots(sbi->sb, type); + if (ret) + goto out; - down_read(&sbi->quota_sem); - ret = dquot_writeback_dquots(sb, type); + ret = filemap_fdatawrite(mapping); if (ret) goto out; + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + goto out; + + ret = filemap_fdatawait(mapping); + + truncate_inode_pages(&dqopt->files[type]->i_data, 0); +out: + if (ret) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return ret; +} + +int f2fs_quota_sync(struct super_block *sb, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret = 0; + /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - struct address_space *mapping; if (type != -1 && cnt != type) continue; + if (!sb_has_quota_active(sb, cnt)) continue; - mapping = dqopt->files[cnt]->i_mapping; + if (!f2fs_sb_has_quota_ino(sbi)) + inode_lock(dqopt->files[cnt]); - ret = filemap_fdatawrite(mapping); - if (ret) - goto out; + /* + * do_quotactl + * f2fs_quota_sync + * f2fs_down_read(quota_sem) + * dquot_writeback_dquots() + * f2fs_dquot_commit + * block_operation + * f2fs_down_read(quota_sem) + */ + f2fs_lock_op(sbi); + f2fs_down_read(&sbi->quota_sem); - /* if we are using journalled quota */ - if (is_journalled_quota(sbi)) - continue; + ret = f2fs_quota_sync_file(sbi, cnt); - ret = filemap_fdatawait(mapping); - if (ret) - set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + f2fs_up_read(&sbi->quota_sem); + f2fs_unlock_op(sbi); + + if (!f2fs_sb_has_quota_ino(sbi)) + inode_unlock(dqopt->files[cnt]); - inode_lock(dqopt->files[cnt]); - truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); - inode_unlock(dqopt->files[cnt]); + if (ret) + break; } -out: - if (ret) - set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); - f2fs_unlock_op(sbi); return ret; } @@ -2255,11 +2877,11 @@ static int f2fs_dquot_commit(struct dquot *dquot) 
struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; - down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); + f2fs_down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); ret = dquot_commit(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); + f2fs_up_read(&sbi->quota_sem); return ret; } @@ -2268,11 +2890,11 @@ static int f2fs_dquot_acquire(struct dquot *dquot) struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; - down_read(&sbi->quota_sem); + f2fs_down_read(&sbi->quota_sem); ret = dquot_acquire(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); + f2fs_up_read(&sbi->quota_sem); return ret; } @@ -2339,6 +2961,11 @@ static const struct quotactl_ops f2fs_quotactl_ops = { .get_nextdqblk = dquot_get_next_dqblk, }; #else +int f2fs_dquot_initialize(struct inode *inode) +{ + return 0; +} + int f2fs_quota_sync(struct super_block *sb, int type) { return 0; @@ -2398,9 +3025,9 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, ctx, len, fs_data, XATTR_CREATE); } -static bool f2fs_dummy_context(struct inode *inode) +static const union fscrypt_policy *f2fs_get_dummy_policy(struct super_block *sb) { - return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); + return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_policy.policy; } static bool f2fs_has_stable_inodes(struct super_block *sb) @@ -2415,15 +3042,35 @@ static void f2fs_get_ino_and_lblk_bits(struct super_block *sb, *lblk_bits_ret = 8 * sizeof(block_t); } +static struct block_device **f2fs_get_devices(struct super_block *sb, + unsigned int *num_devs) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct block_device **devs; + int i; + + if (!f2fs_is_multi_device(sbi)) + return NULL; + + devs = kmalloc_array(sbi->s_ndevs, sizeof(*devs), GFP_KERNEL); + if (!devs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < sbi->s_ndevs; i++) + devs[i] = FDEV(i).bdev; + *num_devs = sbi->s_ndevs; + return devs; +} + static const 
struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .dummy_context = f2fs_dummy_context, + .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, - .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, + .get_devices = f2fs_get_devices, }; #endif @@ -2472,10 +3119,10 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_blocks(void) +loff_t max_file_blocks(struct inode *inode) { loff_t result = 0; - loff_t leaf_count = DEF_ADDRS_PER_BLOCK; + loff_t leaf_count; /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - @@ -2484,6 +3131,11 @@ static loff_t max_file_blocks(void) * result as zero. */ + if (inode && f2fs_compressed_file(inode)) + leaf_count = ADDRS_PER_BLOCK(inode); + else + leaf_count = DEF_ADDRS_PER_BLOCK; + /* two direct node blocks */ result += (leaf_count * 2); @@ -2574,10 +3226,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, } if (main_end_blkaddr > seg_end_blkaddr) { - f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)", - main_blkaddr, - segment0_blkaddr + - (segment_count << log_blocks_per_seg), + f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%llu) block(%u)", + main_blkaddr, seg_end_blkaddr, segment_count_main << log_blocks_per_seg); return true; } else if (main_end_blkaddr < seg_end_blkaddr) { @@ -2595,10 +3245,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, err = __f2fs_commit_super(bh, NULL); res = err ? 
"failed" : "done"; } - f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%u) block(%u)", - res, main_blkaddr, - segment0_blkaddr + - (segment_count << log_blocks_per_seg), + f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%llu) block(%u)", + res, main_blkaddr, seg_end_blkaddr, segment_count_main << log_blocks_per_seg); if (err) return true; @@ -2609,11 +3257,10 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, static int sanity_check_raw_super(struct f2fs_sb_info *sbi, struct buffer_head *bh) { - block_t segment_count, segs_per_sec, secs_per_zone; + block_t segment_count, segs_per_sec, secs_per_zone, segment_count_main; block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); - unsigned int blocksize; size_t crc_offset = 0; __u32 crc = 0; @@ -2639,18 +3286,11 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, } } - /* Currently, support only 4KB page cache size */ - if (F2FS_BLKSIZE != PAGE_SIZE) { - f2fs_info(sbi, "Invalid page_cache_size (%lu), supports only 4KB", - PAGE_SIZE); - return -EFSCORRUPTED; - } - /* Currently, support only 4KB block size */ - blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != F2FS_BLKSIZE) { - f2fs_info(sbi, "Invalid blocksize (%u), supports only 4KB", - blocksize); + if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) { + f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u", + le32_to_cpu(raw_super->log_blocksize), + F2FS_BLKSIZE_BITS); return -EFSCORRUPTED; } @@ -2680,6 +3320,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, } segment_count = le32_to_cpu(raw_super->segment_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main); segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); total_sections = le32_to_cpu(raw_super->section_count); @@ -2693,14 +3334,19 @@ static int 
sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } - if (total_sections > segment_count || - total_sections < F2FS_MIN_SEGMENTS || + if (total_sections > segment_count_main || total_sections < 1 || segs_per_sec > segment_count || !segs_per_sec) { f2fs_info(sbi, "Invalid segment/section count (%u, %u x %u)", segment_count, total_sections, segs_per_sec); return -EFSCORRUPTED; } + if (segment_count_main != total_sections * segs_per_sec) { + f2fs_info(sbi, "Invalid segment/section count (%u != %u * %u)", + segment_count_main, total_sections, segs_per_sec); + return -EFSCORRUPTED; + } + if ((segment_count / segs_per_sec) < total_sections) { f2fs_info(sbi, "Small segment_count (%u < %u * %u)", segment_count, segs_per_sec, total_sections); @@ -2726,6 +3372,12 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, segment_count, dev_seg_count); return -EFSCORRUPTED; } + } else { + if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_BLKZONED) && + !bdev_is_zoned(sbi->sb->s_bdev)) { + f2fs_info(sbi, "Zoned block device path is missing"); + return -EFSCORRUPTED; + } } if (secs_per_zone > total_sections || !secs_per_zone) { @@ -2744,11 +3396,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } - if (le32_to_cpu(raw_super->cp_payload) > - (blocks_per_seg - F2FS_CP_PACKS)) { - f2fs_info(sbi, "Insane cp_payload (%u > %u)", + if (le32_to_cpu(raw_super->cp_payload) >= + (blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE)) { + f2fs_info(sbi, "Insane cp_payload (%u >= %u)", le32_to_cpu(raw_super->cp_payload), - blocks_per_seg - F2FS_CP_PACKS); + blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE); return -EFSCORRUPTED; } @@ -2784,6 +3438,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int cp_pack_start_sum, cp_payload; block_t user_block_count, valid_user_blocks; block_t avail_node_count, valid_node_count; + unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks; int i, j; total = 
le32_to_cpu(raw_super->segment_count); @@ -2801,14 +3456,15 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); - if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + if (!f2fs_sb_has_readonly(sbi) && + unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || ovp_segments == 0 || reserved_segments == 0)) { f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version"); return 1; } - user_block_count = le64_to_cpu(ckpt->user_block_count); - segment_count_main = le32_to_cpu(raw_super->segment_count_main); + segment_count_main = le32_to_cpu(raw_super->segment_count_main) + + (f2fs_sb_has_readonly(sbi) ? 1 : 0); log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); if (!user_block_count || user_block_count >= segment_count_main << log_blocks_per_seg) { @@ -2839,6 +3495,10 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto check_data; + for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { if (le32_to_cpu(ckpt->cur_node_segno[i]) == le32_to_cpu(ckpt->cur_node_segno[j])) { @@ -2849,10 +3509,15 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) } } } +check_data: for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto skip_cross; + for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) == le32_to_cpu(ckpt->cur_data_segno[j])) { @@ -2874,7 +3539,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) } } } - +skip_cross: sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); @@ -2889,7 +3554,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) 
cp_payload = __cp_payload(sbi); if (cp_pack_start_sum < cp_payload + 1 || cp_pack_start_sum > blocks_per_seg - 1 - - NR_CURSEG_TYPE) { + NR_CURSEG_PERSIST_TYPE) { f2fs_err(sbi, "Wrong cp_pack_start_sum: %u", cp_pack_start_sum); return 1; @@ -2904,6 +3569,17 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + nat_blocks = nat_segs << log_blocks_per_seg; + nat_bits_bytes = nat_blocks / BITS_PER_BYTE; + nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); + if (__is_set_ckpt_flags(ckpt, CP_NAT_BITS_FLAG) && + (cp_payload + F2FS_CP_PACKS + + NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) { + f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)", + cp_payload, nat_bits_blocks); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_err(sbi, "A bug case: need to run fsck"); return 1; @@ -2928,14 +3604,20 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->total_node_count = (le32_to_cpu(raw_super->segment_count_nat) / 2) * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK; - sbi->root_ino_num = le32_to_cpu(raw_super->root_ino); - sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); - sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); + F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino); + F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); + F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; + sbi->gc_mode = GC_NORMAL; sbi->next_victim_seg[BG_GC] = NULL_SEGNO; sbi->next_victim_seg[FG_GC] = NULL_SEGNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; sbi->migration_granularity = sbi->segs_per_sec; + sbi->seq_file_ra_mul = MIN_RA_MUL; + sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; + sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; + spin_lock_init(&sbi->gc_urgent_high_lock); + atomic64_set(&sbi->current_atomic_write, 0); sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; @@ -2955,14 +3637,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi) 
INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - init_rwsem(&sbi->io_order_lock); + init_f2fs_rwsem(&sbi->io_order_lock); spin_lock_init(&sbi->cp_lock); sbi->dirty_device = 0; spin_lock_init(&sbi->dev_lock); - init_rwsem(&sbi->sb_lock); - init_rwsem(&sbi->pin_sem); + init_f2fs_rwsem(&sbi->sb_lock); + init_f2fs_rwsem(&sbi->pin_sem); } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -2973,60 +3655,96 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) if (err) return err; + err = percpu_counter_init(&sbi->rf_node_block_count, 0, GFP_KERNEL); + if (err) + goto err_valid_block; + err = percpu_counter_init(&sbi->total_valid_inode_count, 0, GFP_KERNEL); if (err) - percpu_counter_destroy(&sbi->alloc_valid_block_count); + goto err_node_block; + return 0; +err_node_block: + percpu_counter_destroy(&sbi->rf_node_block_count); +err_valid_block: + percpu_counter_destroy(&sbi->alloc_valid_block_count); return err; } #ifdef CONFIG_BLK_DEV_ZONED + +struct f2fs_report_zones_args { + struct f2fs_sb_info *sbi; + struct f2fs_dev_info *dev; +}; + static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, - void *data) + void *data) { - struct f2fs_dev_info *dev = data; + struct f2fs_report_zones_args *rz_args = data; + block_t unusable_blocks = (zone->len - zone->capacity) >> + F2FS_LOG_SECTORS_PER_BLOCK; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return 0; - if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) - set_bit(idx, dev->blkz_seq); + set_bit(idx, rz_args->dev->blkz_seq); + if (!rz_args->sbi->unusable_blocks_per_sec) { + rz_args->sbi->unusable_blocks_per_sec = unusable_blocks; + return 0; + } + if (rz_args->sbi->unusable_blocks_per_sec != unusable_blocks) { + f2fs_err(rz_args->sbi, "F2FS supports single zone capacity\n"); + return -EINVAL; + } return 0; } static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { struct block_device *bdev = FDEV(devi).bdev; - sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t nr_sectors = 
bdev_nr_sectors(bdev); + struct f2fs_report_zones_args rep_zone_arg; + u64 zone_sectors; int ret; if (!f2fs_sb_has_blkzoned(sbi)) return 0; + zone_sectors = bdev_zone_sectors(bdev); + if (!is_power_of_2(zone_sectors)) { + f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n"); + return -EINVAL; + } + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) + SECTOR_TO_BLOCK(zone_sectors)) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) + if (nr_sectors & (zone_sectors - 1)) FDEV(devi).nr_blkz++; - FDEV(devi).blkz_seq = f2fs_kzalloc(sbi, + FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi, BITS_TO_LONGS(FDEV(devi).nr_blkz) * sizeof(unsigned long), GFP_KERNEL); if (!FDEV(devi).blkz_seq) return -ENOMEM; - /* Get block zones type */ + rep_zone_arg.sbi = sbi; + rep_zone_arg.dev = &FDEV(devi); + ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb, - &FDEV(devi)); + &rep_zone_arg); if (ret < 0) return ret; - return 0; } #endif @@ -3082,7 +3800,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, /* No valid superblock */ if (!*raw_super) - kvfree(super); + kfree(super); else err = 0; @@ -3128,10 +3846,73 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int err; + + f2fs_down_write(&sbi->sb_lock); + + if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) + raw_super->s_stop_reason[reason]++; + + err = f2fs_commit_super(sbi, false); + if (err) + 
f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", + reason, err); + f2fs_up_write(&sbi->sb_lock); +} + +static void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) +{ + spin_lock(&sbi->error_lock); + if (!test_bit(flag, (unsigned long *)sbi->errors)) { + set_bit(flag, (unsigned long *)sbi->errors); + sbi->error_dirty = true; + } + spin_unlock(&sbi->error_lock); +} + +static bool f2fs_update_errors(struct f2fs_sb_info *sbi) +{ + bool need_update = false; + + spin_lock(&sbi->error_lock); + if (sbi->error_dirty) { + memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, + MAX_F2FS_ERRORS); + sbi->error_dirty = false; + need_update = true; + } + spin_unlock(&sbi->error_lock); + + return need_update; +} + +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +{ + int err; + + f2fs_save_errors(sbi, error); + + f2fs_down_write(&sbi->sb_lock); + + if (!f2fs_update_errors(sbi)) + goto out_unlock; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d", + error, err); +out_unlock: + f2fs_up_write(&sbi->sb_lock); +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); unsigned int max_devices = MAX_DEVICES; + unsigned int logical_blksize; int i; /* Initialize single device information */ @@ -3152,6 +3933,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) if (!sbi->devs) return -ENOMEM; + logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); + sbi->aligned_blksize = true; + for (i = 0; i < max_devices; i++) { if (i > 0 && !RDEV(i).path[0]) @@ -3188,10 +3972,13 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) /* to release errored devices */ sbi->s_ndevs = i + 1; + if (logical_blksize != bdev_logical_block_size(FDEV(i).bdev)) + sbi->aligned_blksize = false; + #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && !f2fs_sb_has_blkzoned(sbi)) { - 
f2fs_err(sbi, "Zoned block device feature not enabled\n"); + f2fs_err(sbi, "Zoned block device feature not enabled"); return -EINVAL; } if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { @@ -3222,41 +4009,41 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) { -#ifdef CONFIG_UNICODE - if (f2fs_sb_has_casefold(sbi) && !sbi->s_encoding) { +#if IS_ENABLED(CONFIG_UNICODE) + if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) { const struct f2fs_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags; - if (f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, - "Can't mount with encoding and encryption"); - return -EINVAL; - } - - if (f2fs_sb_read_encoding(sbi->raw_super, &encoding_info, - &encoding_flags)) { + encoding_info = f2fs_sb_read_encoding(sbi->raw_super); + if (!encoding_info) { f2fs_err(sbi, "Encoding requested by superblock is unknown"); return -EINVAL; } + encoding_flags = le16_to_cpu(sbi->raw_super->s_encoding_flags); encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { f2fs_err(sbi, - "can't mount with superblock charset: %s-%s " + "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. 
flags: 0x%x.", - encoding_info->name, encoding_info->version, + encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), encoding_flags); return PTR_ERR(encoding); } f2fs_info(sbi, "Using encoding defined by superblock: " - "%s-%s with flags 0x%hx", encoding_info->name, - encoding_info->version?:"\b", encoding_flags); + "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); - sbi->s_encoding = encoding; - sbi->s_encoding_flags = encoding_flags; - sbi->sb->s_d_op = &f2fs_dentry_ops; + sbi->sb->s_encoding = encoding; + sbi->sb->s_encoding_flags = encoding_flags; } #else if (f2fs_sb_has_casefold(sbi)) { @@ -3274,8 +4061,10 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; - sm_i->dcc_info->discard_granularity = 1; - sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; + if (f2fs_block_unit_discard(sbi)) + sm_i->dcc_info->discard_granularity = 1; + sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | + 1 << F2FS_IPU_HONOR_OPU_WRITE; } sbi->readdir_ra = 1; @@ -3334,18 +4123,6 @@ try_onemore: sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, sizeof(raw_super->uuid)); - /* - * The BLKZONED feature indicates that the drive was formatted with - * zone alignment optimization. This is optional for host-aware - * devices, but mandatory for host-managed zoned block devices. 
- */ -#ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sbi)) { - f2fs_err(sbi, "Zoned block device support is not enabled"); - err = -EOPNOTSUPP; - goto free_sb_buf; - } -#endif default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -3354,12 +4131,11 @@ try_onemore: goto free_sb_buf; } - err = parse_options(sb, options); + err = parse_options(sb, options, false); if (err) goto free_options; - sbi->max_file_blocks = max_file_blocks(); - sb->s_maxbytes = sbi->max_file_blocks << + sb->s_maxbytes = max_file_blocks(NULL) << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; @@ -3398,55 +4174,33 @@ try_onemore: /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - init_rwsem(&sbi->gc_lock); + init_f2fs_rwsem(&sbi->gc_lock); mutex_init(&sbi->writepages); - mutex_init(&sbi->cp_mutex); - mutex_init(&sbi->resize_mutex); - init_rwsem(&sbi->node_write); - init_rwsem(&sbi->node_change); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - /* init iostat info */ - spin_lock_init(&sbi->iostat_lock); - sbi->iostat_enable = false; - - for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ? 
1: NR_TEMP_TYPE; - int j; - - sbi->write_io[i] = - f2fs_kmalloc(sbi, - array_size(n, - sizeof(struct f2fs_bio_info)), - GFP_KERNEL); - if (!sbi->write_io[i]) { - err = -ENOMEM; - goto free_bio_info; - } - - for (j = HOT; j < n; j++) { - init_rwsem(&sbi->write_io[i][j].io_rwsem); - sbi->write_io[i][j].sbi = sbi; - sbi->write_io[i][j].bio = NULL; - spin_lock_init(&sbi->write_io[i][j].io_lock); - INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); - INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); - init_rwsem(&sbi->write_io[i][j].bio_list_lock); - } - } + err = f2fs_init_write_merge_io(sbi); + if (err) + goto free_bio_info; - init_rwsem(&sbi->cp_rwsem); - init_rwsem(&sbi->quota_sem); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); - err = init_percpu_info(sbi); + err = f2fs_init_iostat(sbi); if (err) goto free_bio_info; + err = init_percpu_info(sbi); + if (err) + goto free_iostat; + if (F2FS_IO_ALIGNED(sbi)) { sbi->write_io_dummy = mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); @@ -3456,12 +4210,20 @@ try_onemore: } } + /* init per sbi slab cache */ + err = f2fs_init_xattr_caches(sbi); + if (err) + goto free_io_dummy; + err = f2fs_init_page_array_cache(sbi); + if (err) + goto free_xattr_cache; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_err(sbi, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_io_dummy; + goto free_page_array_cache; } err = f2fs_get_valid_checkpoint(sbi); @@ -3493,6 +4255,9 @@ try_onemore: goto free_devices; } + spin_lock_init(&sbi->error_lock); + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -3504,6 +4269,7 @@ try_onemore: sbi->reserved_blocks = 0; sbi->current_reserved_blocks = 0; limit_reserve_root(sbi); + 
adjust_unusable_cap_perc(sbi); for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); @@ -3517,6 +4283,19 @@ try_onemore: f2fs_init_fsync_node_info(sbi); + /* setup checkpoint request control and start checkpoint issue thread */ + f2fs_init_ckpt_req_control(sbi); + if (!f2fs_readonly(sb) && !test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto stop_ckpt_thread; + } + } + /* setup f2fs internal modules */ err = f2fs_build_segment_manager(sbi); if (err) { @@ -3531,11 +4310,12 @@ try_onemore: goto free_nm; } + err = adjust_reserved_segment(sbi); + if (err) + goto free_nm; + /* For write statistics */ - if (sb->s_bdev->bd_part) - sbi->sectors_written_start = - (u64)part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]); + sbi->sectors_written_start = f2fs_get_sectors_written(sbi); /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); @@ -3577,10 +4357,14 @@ try_onemore: goto free_node_inode; } - err = f2fs_register_sysfs(sbi); + err = f2fs_init_compress_inode(sbi); if (err) goto free_root_inode; + err = f2fs_register_sysfs(sbi); + if (err) + goto free_compress_inode; + #ifdef CONFIG_QUOTA /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { @@ -3589,7 +4373,7 @@ try_onemore: f2fs_err(sbi, "Cannot turn on quotas: error %d", err); } #endif - /* if there are nt orphan nodes free them */ + /* if there are any orphan inodes, free them */ err = f2fs_recover_orphan_inodes(sbi); if (err) goto free_meta; @@ -3598,16 +4382,23 @@ try_onemore: goto reset_checkpoint; /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + if (!test_opt(sbi, DISABLE_ROLL_FORWARD) && + !test_opt(sbi, NORECOVERY)) { /* * mount should be failed, when device has readonly mode, and * previous checkpoint was not done by clean 
system shutdown. */ if (f2fs_hw_is_readonly(sbi)) { if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - err = -EROFS; - f2fs_err(sbi, "Need to recover fsync data, but write access unavailable"); - goto free_meta; + err = f2fs_recover_fsync_data(sbi, true); + if (err > 0) { + err = -EROFS; + f2fs_err(sbi, "Need to recover fsync data, but " + "write access unavailable, please try " + "mount w/ disable_roll_forward or norecovery"); + } + if (err < 0) + goto free_meta; } f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; @@ -3649,6 +4440,8 @@ try_onemore: } reset_checkpoint: + f2fs_init_inmem_curseg(sbi); + /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -3664,7 +4457,8 @@ reset_checkpoint: * If filesystem is not mounted as read-only then * do start the gc_thread. */ - if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) { + if ((F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF || + test_opt(sbi, GC_MERGE)) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) @@ -3711,6 +4505,8 @@ free_meta: /* evict some inodes being cached by GC */ evict_inodes(sb); f2fs_unregister_sysfs(sbi); +free_compress_inode: + f2fs_destroy_compress_inode(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -3722,10 +4518,14 @@ free_node_inode: free_stats: f2fs_destroy_stats(sbi); free_nm: + /* stop discard thread before destroying node manager */ + f2fs_stop_discard_thread(sbi); f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); f2fs_destroy_post_read_wq(sbi); +stop_ckpt_thread: + f2fs_stop_ckpt_thread(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); @@ -3733,29 +4533,37 @@ free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); sbi->meta_inode = NULL; +free_page_array_cache: + f2fs_destroy_page_array_cache(sbi); +free_xattr_cache: + f2fs_destroy_xattr_caches(sbi); free_io_dummy: 
mempool_destroy(sbi->write_io_dummy); free_percpu: destroy_percpu_info(sbi); +free_iostat: + f2fs_destroy_iostat(sbi); free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); -#ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); +#if IS_ENABLED(CONFIG_UNICODE) + utf8_unload(sb->s_encoding); + sb->s_encoding = NULL; #endif free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kvfree(F2FS_OPTION(sbi).s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); kvfree(options); free_sb_buf: - kvfree(raw_super); + kfree(raw_super); free_sbi: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); - kvfree(sbi); + kfree(sbi); /* give only one another chance */ if (retry_cnt > 0 && skip_recovery) { @@ -3781,6 +4589,15 @@ static void kill_f2fs_super(struct super_block *sb) f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * latter evict_inode() can bypass checking and invalidating + * compress inode cache. 
+ */ + if (test_opt(sbi, COMPRESS_CACHE)) + truncate_inode_pages_final(COMPRESS_MAPPING(sbi)); +#endif + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { @@ -3800,7 +4617,7 @@ static struct file_system_type f2fs_fs_type = { .name = "f2fs", .mount = f2fs_mount, .kill_sb = kill_f2fs_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("f2fs"); @@ -3834,8 +4651,6 @@ static int __init init_f2fs_fs(void) return -EINVAL; } - f2fs_build_trace_ios(); - err = init_inodecache(); if (err) goto fail; @@ -3848,13 +4663,19 @@ static int __init init_f2fs_fs(void) err = f2fs_create_checkpoint_caches(); if (err) goto free_segment_manager_caches; - err = f2fs_create_extent_cache(); + err = f2fs_create_recovery_cache(); if (err) goto free_checkpoint_caches; - err = f2fs_init_sysfs(); + err = f2fs_create_extent_cache(); + if (err) + goto free_recovery_cache; + err = f2fs_create_garbage_collection_cache(); if (err) goto free_extent_cache; - err = register_shrinker(&f2fs_shrinker_info); + err = f2fs_init_sysfs(); + if (err) + goto free_garbage_collection_cache; + err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker"); if (err) goto free_sysfs; err = register_filesystem(&f2fs_fs_type); @@ -3864,15 +4685,35 @@ static int __init init_f2fs_fs(void) err = f2fs_init_post_read_processing(); if (err) goto free_root_stats; - err = f2fs_init_bio_entry_cache(); + err = f2fs_init_iostat_processing(); if (err) goto free_post_read; + err = f2fs_init_bio_entry_cache(); + if (err) + goto free_iostat; err = f2fs_init_bioset(); if (err) goto free_bio_enrty_cache; + err = f2fs_init_compress_mempool(); + if (err) + goto free_bioset; + err = f2fs_init_compress_cache(); + if (err) + goto free_compress_mempool; + err = f2fs_create_casefold_cache(); + if (err) + goto free_compress_cache; return 0; +free_compress_cache: + f2fs_destroy_compress_cache(); +free_compress_mempool: + 
f2fs_destroy_compress_mempool(); +free_bioset: + f2fs_destroy_bioset(); free_bio_enrty_cache: f2fs_destroy_bio_entry_cache(); +free_iostat: + f2fs_destroy_iostat_processing(); free_post_read: f2fs_destroy_post_read_processing(); free_root_stats: @@ -3882,8 +4723,12 @@ free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_sysfs: f2fs_exit_sysfs(); +free_garbage_collection_cache: + f2fs_destroy_garbage_collection_cache(); free_extent_cache: f2fs_destroy_extent_cache(); +free_recovery_cache: + f2fs_destroy_recovery_cache(); free_checkpoint_caches: f2fs_destroy_checkpoint_caches(); free_segment_manager_caches: @@ -3898,19 +4743,24 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_casefold_cache(); + f2fs_destroy_compress_cache(); + f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); + f2fs_destroy_iostat_processing(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); f2fs_exit_sysfs(); + f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); + f2fs_destroy_recovery_cache(); f2fs_destroy_checkpoint_caches(); f2fs_destroy_segment_manager_caches(); f2fs_destroy_node_manager_caches(); destroy_inodecache(); - f2fs_destroy_trace_ios(); } module_init(init_f2fs_fs) @@ -3919,4 +4769,5 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32"); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 91d649790b1b..df27afd71ef4 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -11,10 +11,14 @@ #include <linux/f2fs_fs.h> #include <linux/seq_file.h> #include <linux/unicode.h> +#include <linux/ioprio.h> +#include <linux/sysfs.h> #include "f2fs.h" #include "segment.h" #include "gc.h" +#include "iostat.h" +#include <trace/events/f2fs.h> static struct proc_dir_entry *f2fs_proc_root; @@ -26,13 
+30,25 @@ enum { NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ #ifdef CONFIG_F2FS_STAT_FS - STAT_INFO, /* struct f2fs_stat_info */ + STAT_INFO, /* struct f2fs_stat_info */ #endif #ifdef CONFIG_F2FS_FAULT_INJECTION FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ + CPRC_INFO, /* struct ckpt_req_control */ + ATGC_INFO, /* struct atgc_management */ +}; + +static const char *gc_mode_names[MAX_GC_MODE] = { + "GC_NORMAL", + "GC_IDLE_CB", + "GC_IDLE_GREEDY", + "GC_IDLE_AT", + "GC_URGENT_HIGH", + "GC_URGENT_LOW", + "GC_URGENT_MID" }; struct f2fs_attr { @@ -69,6 +85,10 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) else if (struct_type == STAT_INFO) return (unsigned char *)F2FS_STAT(sbi); #endif + else if (struct_type == CPRC_INFO) + return (unsigned char *)&sbi->cprc_info; + else if (struct_type == ATGC_INFO) + return (unsigned char *)&sbi->am; return NULL; } @@ -86,70 +106,93 @@ static ssize_t free_segments_show(struct f2fs_attr *a, (unsigned long long)(free_segments(sbi))); } -static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, +static ssize_t ovp_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); + return sprintf(buf, "%llu\n", + (unsigned long long)(overprovision_segments(sbi))); +} +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ return sprintf(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + - BD_PART_WRITTEN(sbi))); + ((f2fs_get_sectors_written(sbi) - + sbi->sectors_written_start) >> 1))); +} + +static ssize_t sb_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%lx\n", sbi->s_flag); +} + +static ssize_t cp_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) 
+{ + return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); +} + +static ssize_t pending_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sprintf(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; int len = 0; - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); - if (f2fs_sb_has_encrypt(sbi)) - len += snprintf(buf, PAGE_SIZE - len, "%s", + len += scnprintf(buf, PAGE_SIZE - len, "%s", "encryption"); if (f2fs_sb_has_blkzoned(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "blkzoned"); if (f2fs_sb_has_extra_attr(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "extra_attr"); if (f2fs_sb_has_project_quota(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "projquota"); if (f2fs_sb_has_inode_chksum(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_checksum"); if (f2fs_sb_has_flexible_inline_xattr(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "flexible_inline_xattr"); if (f2fs_sb_has_quota_ino(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "quota_ino"); if (f2fs_sb_has_inode_crtime(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? 
", " : "", "inode_crtime"); if (f2fs_sb_has_lost_found(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "lost_found"); if (f2fs_sb_has_verity(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "verity"); if (f2fs_sb_has_sb_chksum(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "sb_checksum"); if (f2fs_sb_has_casefold(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "casefold"); + if (f2fs_sb_has_readonly(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "readonly"); if (f2fs_sb_has_compression(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "compression"); - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? 
", " : "", "pin_file"); - len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + len += scnprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -174,17 +217,24 @@ static ssize_t unusable_show(struct f2fs_attr *a, static ssize_t encoding_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { -#ifdef CONFIG_UNICODE +#if IS_ENABLED(CONFIG_UNICODE) + struct super_block *sb = sbi->sb; + if (f2fs_sb_has_casefold(sbi)) - return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", - sbi->s_encoding->charset, - (sbi->s_encoding->version >> 16) & 0xff, - (sbi->s_encoding->version >> 8) & 0xff, - sbi->s_encoding->version & 0xff); + return sysfs_emit(buf, "UTF-8 (%d.%d.%d)\n", + (sb->s_encoding->version >> 16) & 0xff, + (sb->s_encoding->version >> 8) & 0xff, + sb->s_encoding->version & 0xff); #endif return sprintf(buf, "(none)"); } +static ssize_t mounted_time_sec_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time); +} + #ifdef CONFIG_F2FS_STAT_FS static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) @@ -216,6 +266,13 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a, } #endif +static ssize_t main_blkaddr_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%llu\n", + (unsigned long long)MAIN_BLKADDR(sbi)); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -233,20 +290,76 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, int hot_count = sbi->raw_super->hot_ext_count; int len = 0, i; - len += snprintf(buf + len, PAGE_SIZE - len, + len += scnprintf(buf + len, PAGE_SIZE - len, "cold file extension:\n"); for (i = 0; i < cold_count; i++) - len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); - len += snprintf(buf + len, PAGE_SIZE - len, + len += scnprintf(buf + len, PAGE_SIZE - len, "hot file 
extension:\n"); for (i = cold_count; i < cold_count + hot_count; i++) - len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); return len; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + struct ckpt_req_control *cprc = &sbi->cprc_info; + int len = 0; + int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); + int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio); + + if (class == IOPRIO_CLASS_RT) + len += scnprintf(buf + len, PAGE_SIZE - len, "rt,"); + else if (class == IOPRIO_CLASS_BE) + len += scnprintf(buf + len, PAGE_SIZE - len, "be,"); + else + return -EINVAL; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data); + return len; + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_written_block); + + if (!strcmp(a->attr.name, "compr_saved_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_saved_block); + + if (!strcmp(a->attr.name, "compr_new_inode")) + return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); +#endif + + if (!strcmp(a->attr.name, "gc_urgent")) + return sysfs_emit(buf, "%s\n", + gc_mode_names[sbi->gc_mode]); + + if (!strcmp(a->attr.name, "gc_segment_mode")) + return sysfs_emit(buf, "%s\n", + gc_mode_names[sbi->gc_segment_mode]); + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + return sysfs_emit(buf, "%u\n", + sbi->gc_reclaimed_segs[sbi->gc_segment_mode]); + } + + if (!strcmp(a->attr.name, "current_atomic_write")) { + s64 current_write = atomic64_read(&sbi->current_atomic_write); + + return sysfs_emit(buf, "%lld\n", current_write); + } + + if (!strcmp(a->attr.name, "peak_atomic_write")) + return sysfs_emit(buf, "%lld\n", sbi->peak_atomic_write); + + if (!strcmp(a->attr.name, "committed_atomic_block")) + return sysfs_emit(buf, "%llu\n", sbi->committed_atomic_block); + + if (!strcmp(a->attr.name, "revoked_atomic_block")) + return sysfs_emit(buf, "%llu\n", 
sbi->revoked_atomic_block); + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -283,10 +396,10 @@ static ssize_t __sbi_store(struct f2fs_attr *a, set = false; } - if (strlen(name) >= F2FS_EXTENSION_LEN) + if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN) return -EINVAL; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); ret = f2fs_update_extension_list(sbi, name, hot, set); if (ret) @@ -296,10 +409,42 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (ret) f2fs_update_extension_list(sbi, name, hot, !set); out: - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); return ret ? ret : count; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + const char *name = strim((char *)buf); + struct ckpt_req_control *cprc = &sbi->cprc_info; + int class; + long data; + int ret; + + if (!strncmp(name, "rt,", 3)) + class = IOPRIO_CLASS_RT; + else if (!strncmp(name, "be,", 3)) + class = IOPRIO_CLASS_BE; + else + return -EINVAL; + + name += 3; + ret = kstrtol(name, 10, &data); + if (ret) + return ret; + if (data >= IOPRIO_NR_LEVELS || data < 0) + return -EINVAL; + + cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); + if (test_opt(sbi, MERGE_CHECKPOINT)) { + ret = set_task_ioprio(cprc->f2fs_issue_ckpt, + cprc->ckpt_thread_ioprio); + if (ret) + return ret; + } + + return count; + } + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); @@ -314,7 +459,9 @@ out: if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - F2FS_OPTION(sbi).root_reserved_blocks)) { + F2FS_OPTION(sbi).root_reserved_blocks - + sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } @@ -328,6 +475,8 @@ out: if (!strcmp(a->attr.name, "discard_granularity")) { if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; if (t == *ui) return count; *ui = t; 
@@ -343,30 +492,54 @@ out: return -EINVAL; if (!strcmp(a->attr.name, "gc_urgent")) { - if (t >= 1) { - sbi->gc_mode = GC_URGENT; + if (t == 0) { + sbi->gc_mode = GC_NORMAL; + } else if (t == 1) { + sbi->gc_mode = GC_URGENT_HIGH; if (sbi->gc_thread) { sbi->gc_thread->gc_wake = 1; wake_up_interruptible_all( &sbi->gc_thread->gc_wait_queue_head); wake_up_discard_thread(sbi, true); } + } else if (t == 2) { + sbi->gc_mode = GC_URGENT_LOW; + } else if (t == 3) { + sbi->gc_mode = GC_URGENT_MID; + if (sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + } } else { - sbi->gc_mode = GC_NORMAL; + return -EINVAL; } return count; } if (!strcmp(a->attr.name, "gc_idle")) { - if (t == GC_IDLE_CB) + if (t == GC_IDLE_CB) { sbi->gc_mode = GC_IDLE_CB; - else if (t == GC_IDLE_GREEDY) + } else if (t == GC_IDLE_GREEDY) { sbi->gc_mode = GC_IDLE_GREEDY; - else + } else if (t == GC_IDLE_AT) { + if (!sbi->am.atgc_enabled) + return -EINVAL; + sbi->gc_mode = GC_IDLE_AT; + } else { sbi->gc_mode = GC_NORMAL; + } return count; } + if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { + spin_lock(&sbi->gc_urgent_high_lock); + sbi->gc_urgent_high_remaining = t; + spin_unlock(&sbi->gc_urgent_high_lock); + return count; + } + +#ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; if (!sbi->iostat_enable) @@ -374,6 +547,108 @@ out: return count; } + if (!strcmp(a->attr.name, "iostat_period_ms")) { + if (t < MIN_IOSTAT_PERIOD_MS || t > MAX_IOSTAT_PERIOD_MS) + return -EINVAL; + spin_lock(&sbi->iostat_lock); + sbi->iostat_period_ms = (unsigned int)t; + spin_unlock(&sbi->iostat_lock); + return count; + } +#endif + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block") || + !strcmp(a->attr.name, "compr_saved_block")) { + if (t != 0) + return -EINVAL; + sbi->compr_written_block = 0; + sbi->compr_saved_block = 0; + return count; + } + + if (!strcmp(a->attr.name, 
"compr_new_inode")) { + if (t != 0) + return -EINVAL; + sbi->compr_new_inode = 0; + return count; + } +#endif + + if (!strcmp(a->attr.name, "atgc_candidate_ratio")) { + if (t > 100) + return -EINVAL; + sbi->am.candidate_ratio = t; + return count; + } + + if (!strcmp(a->attr.name, "atgc_age_weight")) { + if (t > 100) + return -EINVAL; + sbi->am.age_weight = t; + return count; + } + + if (!strcmp(a->attr.name, "gc_segment_mode")) { + if (t < MAX_GC_MODE) + sbi->gc_segment_mode = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + if (t != 0) + return -EINVAL; + sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0; + return count; + } + + if (!strcmp(a->attr.name, "seq_file_ra_mul")) { + if (t >= MIN_RA_MUL && t <= MAX_RA_MUL) + sbi->seq_file_ra_mul = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "max_fragment_chunk")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_chunk = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "max_fragment_hole")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_hole = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "peak_atomic_write")) { + if (t != 0) + return -EINVAL; + sbi->peak_atomic_write = 0; + return count; + } + + if (!strcmp(a->attr.name, "committed_atomic_block")) { + if (t != 0) + return -EINVAL; + sbi->committed_atomic_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "revoked_atomic_block")) { + if (t != 0) + return -EINVAL; + sbi->revoked_atomic_block = 0; + return count; + } + *ui = (unsigned int)t; return count; @@ -425,44 +700,49 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } -enum feat_id { - FEAT_CRYPTO = 0, - FEAT_BLKZONED, - FEAT_ATOMIC_WRITE, - FEAT_EXTRA_ATTR, - FEAT_PROJECT_QUOTA, - FEAT_INODE_CHECKSUM, - FEAT_FLEXIBLE_INLINE_XATTR, - FEAT_QUOTA_INO, - 
FEAT_INODE_CRTIME, - FEAT_LOST_FOUND, - FEAT_VERITY, - FEAT_SB_CHECKSUM, - FEAT_CASEFOLD, - FEAT_COMPRESSION, -}; - +/* + * Note that there are three feature list entries: + * 1) /sys/fs/f2fs/features + * : shows runtime features supported by in-kernel f2fs along with Kconfig. + * - ref. F2FS_FEATURE_RO_ATTR() + * + * 2) /sys/fs/f2fs/$s_id/features <deprecated> + * : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This + * won't add new feature anymore, and thus, users should check entries in 3) + * instead of this 2). + * + * 3) /sys/fs/f2fs/$s_id/feature_list + * : shows on-disk features enabled by mkfs.f2fs per instance, which follows + * sysfs entry rule where each entry should expose single value. + * This list covers old feature list provided by 2) and beyond. Therefore, + * please add new on-disk feature in this list only. + * - ref. F2FS_SB_FEATURE_RO_ATTR() + */ static ssize_t f2fs_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - switch (a->id) { - case FEAT_CRYPTO: - case FEAT_BLKZONED: - case FEAT_ATOMIC_WRITE: - case FEAT_EXTRA_ATTR: - case FEAT_PROJECT_QUOTA: - case FEAT_INODE_CHECKSUM: - case FEAT_FLEXIBLE_INLINE_XATTR: - case FEAT_QUOTA_INO: - case FEAT_INODE_CRTIME: - case FEAT_LOST_FOUND: - case FEAT_VERITY: - case FEAT_SB_CHECKSUM: - case FEAT_CASEFOLD: - case FEAT_COMPRESSION: + return sprintf(buf, "supported\n"); +} + +#define F2FS_FEATURE_RO_ATTR(_name) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ +} + +static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (F2FS_HAS_FEATURE(sbi, a->id)) return sprintf(buf, "supported\n"); - } - return 0; + return sprintf(buf, "unsupported\n"); +} + +#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ +static struct f2fs_attr f2fs_attr_sb_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = 
f2fs_sb_feature_show, \ + .id = F2FS_FEATURE_##_feat, \ } #define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ @@ -474,6 +754,11 @@ static struct f2fs_attr f2fs_attr_##_name = { \ .offset = _offset \ } +#define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0444, \ + f2fs_sbi_show, NULL, \ + offsetof(struct struct_name, elname)) + #define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ F2FS_ATTR_OFFSET(struct_type, name, 0644, \ f2fs_sbi_show, f2fs_sbi_store, \ @@ -482,13 +767,6 @@ static struct f2fs_attr f2fs_attr_##_name = { \ #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) -#define F2FS_FEATURE_RO_ATTR(_name, _id) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = 0444 }, \ - .show = f2fs_feature_show, \ - .id = _id, \ -} - #define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ @@ -505,8 +783,11 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, main_blkaddr, main_blkaddr); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); 
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); @@ -519,6 +800,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); @@ -529,21 +811,33 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); +#ifdef CONFIG_F2FS_IOSTAT F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms); +#endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); +F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); 
F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); +F2FS_GENERAL_RO_ATTR(ovp_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); +F2FS_GENERAL_RO_ATTR(mounted_time_sec); +F2FS_GENERAL_RO_ATTR(main_blkaddr); +F2FS_GENERAL_RO_ATTR(pending_discard); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -555,25 +849,58 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #endif #ifdef CONFIG_FS_ENCRYPTION -F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +F2FS_FEATURE_RO_ATTR(encryption); +F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2); +#if IS_ENABLED(CONFIG_UNICODE) +F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif +#endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED -F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +F2FS_FEATURE_RO_ATTR(block_zoned); +F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, unusable_blocks_per_sec, + unusable_blocks_per_sec); #endif -F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); -F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); -F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); -F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); -F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); -F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); -F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); -F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); +F2FS_FEATURE_RO_ATTR(atomic_write); +F2FS_FEATURE_RO_ATTR(extra_attr); +F2FS_FEATURE_RO_ATTR(project_quota); +F2FS_FEATURE_RO_ATTR(inode_checksum); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr); +F2FS_FEATURE_RO_ATTR(quota_ino); +F2FS_FEATURE_RO_ATTR(inode_crtime); +F2FS_FEATURE_RO_ATTR(lost_found); #ifdef CONFIG_FS_VERITY -F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); 
+F2FS_FEATURE_RO_ATTR(verity); #endif -F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); -F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); -F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); +F2FS_FEATURE_RO_ATTR(sb_checksum); +#if IS_ENABLED(CONFIG_UNICODE) +F2FS_FEATURE_RO_ATTR(casefold); +#endif +F2FS_FEATURE_RO_ATTR(readonly); +#ifdef CONFIG_F2FS_FS_COMPRESSION +F2FS_FEATURE_RO_ATTR(compression); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); +#endif +F2FS_FEATURE_RO_ATTR(pin_file); + +/* For ATGC */ +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_count); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); + +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole); + +/* For atomic write */ +F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, current_atomic_write, current_atomic_write); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -586,7 +913,12 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(reclaim_segments), ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), + 
ATTR_LIST(max_discard_request), + ATTR_LIST(min_discard_issue_time), + ATTR_LIST(mid_discard_issue_time), + ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), + ATTR_LIST(pending_discard), ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), @@ -600,27 +932,38 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(max_roll_forward_node_blocks), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), ATTR_LIST(umount_discard_timeout), +#ifdef CONFIG_F2FS_IOSTAT ATTR_LIST(iostat_enable), + ATTR_LIST(iostat_period_ms), +#endif ATTR_LIST(readdir_ra), + ATTR_LIST(max_io_bytes), ATTR_LIST(gc_pin_file_thresh), ATTR_LIST(extension_list), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), #endif + ATTR_LIST(data_io_flag), + ATTR_LIST(node_io_flag), + ATTR_LIST(gc_urgent_high_remaining), + ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), + ATTR_LIST(ovp_segments), ATTR_LIST(unusable), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), ATTR_LIST(current_reserved_blocks), ATTR_LIST(encoding), + ATTR_LIST(mounted_time_sec), #ifdef CONFIG_F2FS_STAT_FS ATTR_LIST(cp_foreground_calls), ATTR_LIST(cp_background_calls), @@ -630,6 +973,28 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(moved_blocks_background), ATTR_LIST(avg_vblocks), #endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(unusable_blocks_per_sec), +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + ATTR_LIST(compr_written_block), + ATTR_LIST(compr_saved_block), + ATTR_LIST(compr_new_inode), +#endif + /* For ATGC */ + ATTR_LIST(atgc_candidate_ratio), + ATTR_LIST(atgc_candidate_count), + ATTR_LIST(atgc_age_weight), + ATTR_LIST(atgc_age_threshold), + ATTR_LIST(seq_file_ra_mul), + ATTR_LIST(gc_segment_mode), + ATTR_LIST(gc_reclaimed_segments), + 
ATTR_LIST(max_fragment_chunk), + ATTR_LIST(max_fragment_hole), + ATTR_LIST(current_atomic_write), + ATTR_LIST(peak_atomic_write), + ATTR_LIST(committed_atomic_block), + ATTR_LIST(revoked_atomic_block), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -637,7 +1002,11 @@ ATTRIBUTE_GROUPS(f2fs); static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), +#if IS_ENABLED(CONFIG_UNICODE) + ATTR_LIST(encrypted_casefold), #endif +#endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED ATTR_LIST(block_zoned), #endif @@ -653,12 +1022,61 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(verity), #endif ATTR_LIST(sb_checksum), +#if IS_ENABLED(CONFIG_UNICODE) ATTR_LIST(casefold), +#endif + ATTR_LIST(readonly), +#ifdef CONFIG_F2FS_FS_COMPRESSION ATTR_LIST(compression), +#endif + ATTR_LIST(pin_file), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); +F2FS_GENERAL_RO_ATTR(sb_status); +F2FS_GENERAL_RO_ATTR(cp_status); +static struct attribute *f2fs_stat_attrs[] = { + ATTR_LIST(sb_status), + ATTR_LIST(cp_status), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_stat); + +F2FS_SB_FEATURE_RO_ATTR(encryption, ENCRYPT); +F2FS_SB_FEATURE_RO_ATTR(block_zoned, BLKZONED); +F2FS_SB_FEATURE_RO_ATTR(extra_attr, EXTRA_ATTR); +F2FS_SB_FEATURE_RO_ATTR(project_quota, PRJQUOTA); +F2FS_SB_FEATURE_RO_ATTR(inode_checksum, INODE_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_SB_FEATURE_RO_ATTR(quota_ino, QUOTA_INO); +F2FS_SB_FEATURE_RO_ATTR(inode_crtime, INODE_CRTIME); +F2FS_SB_FEATURE_RO_ATTR(lost_found, LOST_FOUND); +F2FS_SB_FEATURE_RO_ATTR(verity, VERITY); +F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD); +F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION); +F2FS_SB_FEATURE_RO_ATTR(readonly, RO); + +static struct attribute *f2fs_sb_feat_attrs[] = { + ATTR_LIST(sb_encryption), + ATTR_LIST(sb_block_zoned), + ATTR_LIST(sb_extra_attr), + ATTR_LIST(sb_project_quota), 
+ ATTR_LIST(sb_inode_checksum), + ATTR_LIST(sb_flexible_inline_xattr), + ATTR_LIST(sb_quota_ino), + ATTR_LIST(sb_inode_crtime), + ATTR_LIST(sb_lost_found), + ATTR_LIST(sb_verity), + ATTR_LIST(sb_sb_checksum), + ATTR_LIST(sb_casefold), + ATTR_LIST(sb_compression), + ATTR_LIST(sb_readonly), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_sb_feat); + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -675,7 +1093,7 @@ static struct kobj_type f2fs_ktype = { }; static struct kset f2fs_kset = { - .kobj = {.ktype = &f2fs_ktype}, + .kobj = {.ktype = &f2fs_ktype}, }; static struct kobj_type f2fs_feat_ktype = { @@ -687,6 +1105,71 @@ static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; +static ssize_t f2fs_stat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? 
a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_stat_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + complete(&sbi->s_stat_kobj_unregister); +} + +static const struct sysfs_ops f2fs_stat_attr_ops = { + .show = f2fs_stat_attr_show, + .store = f2fs_stat_attr_store, +}; + +static struct kobj_type f2fs_stat_ktype = { + .default_groups = f2fs_stat_groups, + .sysfs_ops = &f2fs_stat_attr_ops, + .release = f2fs_stat_kobj_release, +}; + +static ssize_t f2fs_sb_feat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static void f2fs_feature_list_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + complete(&sbi->s_feature_list_kobj_unregister); +} + +static const struct sysfs_ops f2fs_feature_list_attr_ops = { + .show = f2fs_sb_feat_attr_show, +}; + +static struct kobj_type f2fs_feature_list_ktype = { + .default_groups = f2fs_sb_feat_groups, + .sysfs_ops = &f2fs_feature_list_attr_ops, + .release = f2fs_feature_list_kobj_release, +}; + static int __maybe_unused segment_info_seq_show(struct seq_file *seq, void *offset) { @@ -738,49 +1221,6 @@ static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, return 0; } -static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, - void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - time64_t now = ktime_get_real_seconds(); - - if (!sbi->iostat_enable) - return 0; - - seq_printf(seq, "time: %-16llu\n", now); - - /* print app IOs */ - seq_printf(seq, "app buffered: %-16llu\n", - sbi->write_iostat[APP_BUFFERED_IO]); - seq_printf(seq, "app direct: %-16llu\n", - 
sbi->write_iostat[APP_DIRECT_IO]); - seq_printf(seq, "app mapped: %-16llu\n", - sbi->write_iostat[APP_MAPPED_IO]); - - /* print fs IOs */ - seq_printf(seq, "fs data: %-16llu\n", - sbi->write_iostat[FS_DATA_IO]); - seq_printf(seq, "fs node: %-16llu\n", - sbi->write_iostat[FS_NODE_IO]); - seq_printf(seq, "fs meta: %-16llu\n", - sbi->write_iostat[FS_META_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", - sbi->write_iostat[FS_GC_DATA_IO]); - seq_printf(seq, "fs gc node: %-16llu\n", - sbi->write_iostat[FS_GC_NODE_IO]); - seq_printf(seq, "fs cp data: %-16llu\n", - sbi->write_iostat[FS_CP_DATA_IO]); - seq_printf(seq, "fs cp node: %-16llu\n", - sbi->write_iostat[FS_CP_NODE_IO]); - seq_printf(seq, "fs cp meta: %-16llu\n", - sbi->write_iostat[FS_CP_META_IO]); - seq_printf(seq, "fs discard: %-16llu\n", - sbi->write_iostat[FS_DISCARD]); - - return 0; -} - static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, void *offset) { @@ -841,37 +1281,72 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, "%s", sb->s_id); - if (err) { - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - return err; - } + if (err) + goto put_sb_kobj; + + sbi->s_stat_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_stat_kobj_unregister); + err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype, + &sbi->s_kobj, "stat"); + if (err) + goto put_stat_kobj; + + sbi->s_feature_list_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_feature_list_kobj_unregister); + err = kobject_init_and_add(&sbi->s_feature_list_kobj, + &f2fs_feature_list_ktype, + &sbi->s_kobj, "feature_list"); + if (err) + goto put_feature_list_kobj; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); if (sbi->s_proc) { - proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc, + proc_create_single_data("segment_info", 0444, sbi->s_proc, segment_info_seq_show, sb); - 
proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc, + proc_create_single_data("segment_bits", 0444, sbi->s_proc, segment_bits_seq_show, sb); - proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc, +#ifdef CONFIG_F2FS_IOSTAT + proc_create_single_data("iostat_info", 0444, sbi->s_proc, iostat_info_seq_show, sb); - proc_create_single_data("victim_bits", S_IRUGO, sbi->s_proc, +#endif + proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); } return 0; +put_feature_list_kobj: + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); +put_stat_kobj: + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; } void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { if (sbi->s_proc) { +#ifdef CONFIG_F2FS_IOSTAT remove_proc_entry("iostat_info", sbi->s_proc); +#endif remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + + kobject_del(&sbi->s_stat_kobj); + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_del(&sbi->s_feature_list_kobj); + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); + kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); } diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c deleted file mode 100644 index d0ab533a9ce8..000000000000 --- a/fs/f2fs/trace.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org> - */ -#include <linux/fs.h> -#include <linux/f2fs_fs.h> -#include <linux/sched.h> -#include <linux/radix-tree.h> - 
-#include "f2fs.h" -#include "trace.h" - -static RADIX_TREE(pids, GFP_ATOMIC); -static spinlock_t pids_lock; -static struct last_io_info last_io; - -static inline void __print_last_io(void) -{ - if (!last_io.len) - return; - - trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", - last_io.major, last_io.minor, - last_io.pid, "----------------", - last_io.type, - last_io.fio.op, last_io.fio.op_flags, - last_io.fio.new_blkaddr, - last_io.len); - memset(&last_io, 0, sizeof(last_io)); -} - -static int __file_type(struct inode *inode, pid_t pid) -{ - if (f2fs_is_atomic_file(inode)) - return __ATOMIC_FILE; - else if (f2fs_is_volatile_file(inode)) - return __VOLATILE_FILE; - else if (S_ISDIR(inode->i_mode)) - return __DIR_FILE; - else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode))) - return __NODE_FILE; - else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode))) - return __META_FILE; - else if (pid) - return __NORMAL_FILE; - else - return __MISC_FILE; -} - -void f2fs_trace_pid(struct page *page) -{ - struct inode *inode = page->mapping->host; - pid_t pid = task_pid_nr(current); - void *p; - - set_page_private(page, (unsigned long)pid); - -retry: - if (radix_tree_preload(GFP_NOFS)) - return; - - spin_lock(&pids_lock); - p = radix_tree_lookup(&pids, pid); - if (p == current) - goto out; - if (p) - radix_tree_delete(&pids, pid); - - if (radix_tree_insert(&pids, pid, current)) { - spin_unlock(&pids_lock); - radix_tree_preload_end(); - cond_resched(); - goto retry; - } - - trace_printk("%3x:%3x %4x %-16s\n", - MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), - pid, current->comm); -out: - spin_unlock(&pids_lock); - radix_tree_preload_end(); -} - -void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) -{ - struct inode *inode; - pid_t pid; - int major, minor; - - if (flush) { - __print_last_io(); - return; - } - - inode = fio->page->mapping->host; - pid = page_private(fio->page); - - major = MAJOR(inode->i_sb->s_dev); - minor = MINOR(inode->i_sb->s_dev); - - if 
(last_io.major == major && last_io.minor == minor && - last_io.pid == pid && - last_io.type == __file_type(inode, pid) && - last_io.fio.op == fio->op && - last_io.fio.op_flags == fio->op_flags && - last_io.fio.new_blkaddr + last_io.len == - fio->new_blkaddr) { - last_io.len++; - return; - } - - __print_last_io(); - - last_io.major = major; - last_io.minor = minor; - last_io.pid = pid; - last_io.type = __file_type(inode, pid); - last_io.fio = *fio; - last_io.len = 1; - return; -} - -void f2fs_build_trace_ios(void) -{ - spin_lock_init(&pids_lock); -} - -#define PIDVEC_SIZE 128 -static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, - unsigned int max_items) -{ - struct radix_tree_iter iter; - void **slot; - unsigned int ret = 0; - - if (unlikely(!max_items)) - return 0; - - radix_tree_for_each_slot(slot, &pids, &iter, first_index) { - results[ret] = iter.index; - if (++ret == max_items) - break; - } - return ret; -} - -void f2fs_destroy_trace_ios(void) -{ - pid_t pid[PIDVEC_SIZE]; - pid_t next_pid = 0; - unsigned int found; - - spin_lock(&pids_lock); - while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { - unsigned idx; - - next_pid = pid[found - 1] + 1; - for (idx = 0; idx < found; idx++) - radix_tree_delete(&pids, pid[idx]); - } - spin_unlock(&pids_lock); -} diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h deleted file mode 100644 index e8075fc5b228..000000000000 --- a/fs/f2fs/trace.h +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org> - */ -#ifndef __F2FS_TRACE_H__ -#define __F2FS_TRACE_H__ - -#ifdef CONFIG_F2FS_IO_TRACE -#include <trace/events/f2fs.h> - -enum file_type { - __NORMAL_FILE, - __DIR_FILE, - __NODE_FILE, - __META_FILE, - __ATOMIC_FILE, - __VOLATILE_FILE, - __MISC_FILE, -}; - -struct last_io_info { - int major, minor; - pid_t pid; - enum file_type type; - struct f2fs_io_info fio; 
- block_t len; -}; - -extern void f2fs_trace_pid(struct page *); -extern void f2fs_trace_ios(struct f2fs_io_info *, int); -extern void f2fs_build_trace_ios(void); -extern void f2fs_destroy_trace_ios(void); -#else -#define f2fs_trace_pid(p) -#define f2fs_trace_ios(i, n) -#define f2fs_build_trace_ios() -#define f2fs_destroy_trace_ios() - -#endif -#endif /* __F2FS_TRACE_H__ */ diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index d7d430a6f130..c352fff88a5e 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -29,6 +29,8 @@ #include "f2fs.h" #include "xattr.h" +#define F2FS_VERIFY_VER (1) + static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode) { return round_up(inode->i_size, 65536); @@ -45,16 +47,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *addr; page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, NULL); if (IS_ERR(page)) return PTR_ERR(page); - addr = kmap_atomic(page); - memcpy(buf, addr + offset_in_page(pos), n); - kunmap_atomic(addr); + memcpy_from_page(buf, page, offset_in_page(pos), n); put_page(page); @@ -72,6 +71,9 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, static int pagecache_write(struct inode *inode, const void *buf, size_t count, loff_t pos) { + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + if (pos + count > inode->i_sb->s_maxbytes) return -EFBIG; @@ -80,20 +82,15 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, PAGE_SIZE - offset_in_page(pos)); struct page *page; void *fsdata; - void *addr; int res; - res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, - &page, &fsdata); + res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); if (res) return res; - addr = kmap_atomic(page); - memcpy(addr + offset_in_page(pos), buf, n); - kunmap_atomic(addr); + 
memcpy_to_page(page, offset_in_page(pos), buf, n); - res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, - page, fsdata); + res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); if (res < 0) return res; if (res != n) @@ -126,7 +123,7 @@ static int f2fs_begin_enable_verity(struct file *filp) if (f2fs_verity_in_progress(inode)) return -EBUSY; - if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + if (f2fs_is_atomic_file(inode)) return -EOPNOTSUPP; /* @@ -134,7 +131,7 @@ static int f2fs_begin_enable_verity(struct file *filp) * here and not rely on ->open() doing it. This must be done before * evicting the inline data. */ - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; @@ -150,40 +147,73 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc, size_t desc_size, u64 merkle_tree_size) { struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size; struct fsverity_descriptor_location dloc = { - .version = cpu_to_le32(1), + .version = cpu_to_le32(F2FS_VERIFY_VER), .size = cpu_to_le32(desc_size), .pos = cpu_to_le64(desc_pos), }; - int err = 0; + int err = 0, err2 = 0; - if (desc != NULL) { - /* Succeeded; write the verity descriptor. */ - err = pagecache_write(inode, desc, desc_size, desc_pos); + /* + * If an error already occurred (which fs/verity/ signals by passing + * desc == NULL), then only clean-up is needed. + */ + if (desc == NULL) + goto cleanup; - /* Write all pages before clearing FI_VERITY_IN_PROGRESS. */ - if (!err) - err = filemap_write_and_wait(inode->i_mapping); - } + /* Append the verity descriptor. */ + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + goto cleanup; - /* If we failed, truncate anything we wrote past i_size. */ - if (desc == NULL || err) - f2fs_truncate(inode); + /* + * Write all pages (both data and verity metadata). 
Note that this must + * happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond + * i_size won't be written properly. For crash consistency, this also + * must happen before the verity inode flag gets persisted. + */ + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto cleanup; + + /* Set the verity xattr. */ + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), + NULL, XATTR_CREATE); + if (err) + goto cleanup; + + /* Finally, set the verity inode flag. */ + file_set_verity(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return 0; - if (desc != NULL && !err) { - err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, - F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), - NULL, XATTR_CREATE); - if (!err) { - file_set_verity(inode); - f2fs_set_inode_flags(inode); - f2fs_mark_inode_dirty_sync(inode, true); - } +cleanup: + /* + * Verity failed to be enabled, so clean up by truncating any verity + * metadata that was written beyond i_size (both from cache and from + * disk) and clearing FI_VERITY_IN_PROGRESS. + * + * Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection + * from re-instantiating cached pages we are truncating (since unlike + * normal file accesses, garbage collection isn't limited by i_size). 
+ */ + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_inode_pages(inode->i_mapping, inode->i_size); + err2 = f2fs_truncate(inode); + if (err2) { + f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)", + err2); + set_sbi_flag(sbi, SBI_NEED_FSCK); } - return err; + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return err ?: err2; } static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, @@ -199,7 +229,7 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL); if (res < 0 && res != -ERANGE) return res; - if (res != sizeof(dloc) || dloc.version != cpu_to_le32(1)) { + if (res != sizeof(dloc) || dloc.version != cpu_to_le32(F2FS_VERIFY_VER)) { f2fs_warn(F2FS_I_SB(inode), "unknown verity xattr format"); return -EINVAL; } @@ -210,6 +240,8 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes || pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) { f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr"); + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_VERITY_XATTR); return -EFSCORRUPTED; } if (buf_size) { @@ -222,37 +254,6 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, return size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. 
- */ -static void f2fs_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - f2fs_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -263,11 +264,12 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + if (page) put_page(page); else if (num_ra_pages > 1) - f2fs_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 296b3189448a..dc2e8637189e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -23,6 +23,26 @@ #include "xattr.h" #include "segment.h" +static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline) +{ + if (likely(size == sbi->inline_xattr_slab_size)) { + *is_inline = true; + return f2fs_kmem_cache_alloc(sbi->inline_xattr_slab, + GFP_F2FS_ZERO, false, sbi); + } + *is_inline = false; + return f2fs_kzalloc(sbi, size, GFP_NOFS); +} + +static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr, + bool is_inline) +{ + if (is_inline) + kmem_cache_free(sbi->inline_xattr_slab, xattr_addr); + else + kfree(xattr_addr); +} + static int 
f2fs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) @@ -45,6 +65,7 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, } static int f2fs_xattr_generic_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -88,6 +109,7 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, } static int f2fs_xattr_advise_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -95,7 +117,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, unsigned char old_advise = F2FS_I(inode)->i_advise; unsigned char new_advise; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; if (value == NULL) return -EINVAL; @@ -156,8 +178,8 @@ const struct xattr_handler f2fs_xattr_trusted_handler = { const struct xattr_handler f2fs_xattr_advise_handler = { .name = F2FS_SYSTEM_ADVISE_NAME, .flags = F2FS_XATTR_INDEX_ADVISE, - .get = f2fs_xattr_advise_get, - .set = f2fs_xattr_advise_set, + .get = f2fs_xattr_advise_get, + .set = f2fs_xattr_advise_set, }; const struct xattr_handler f2fs_xattr_security_handler = { @@ -204,15 +226,18 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int index) } static struct f2fs_xattr_entry *__find_xattr(void *base_addr, - void *last_base_addr, int index, - size_t len, const char *name) + void *last_base_addr, void **last_addr, + int index, size_t len, const char *name) { struct f2fs_xattr_entry *entry; list_for_each_xattr(entry, base_addr) { if ((void *)(entry) + sizeof(__u32) > last_base_addr || - (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) 
{ + if (last_addr) + *last_addr = entry; return NULL; + } if (entry->e_name_index != index) continue; @@ -232,19 +257,9 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, unsigned int inline_size = inline_xattr_size(inode); void *max_addr = base_addr + inline_size; - list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > max_addr || - (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { - *last_addr = entry; - return NULL; - } - if (entry->e_name_index != index) - continue; - if (entry->e_name_len != len) - continue; - if (!memcmp(entry->e_name, name, len)) - break; - } + entry = __find_xattr(base_addr, max_addr, last_addr, index, len, name); + if (!entry) + return NULL; /* inline xattr header or entry across max inline xattr size */ if (IS_XATTR_LAST_ENTRY(entry) && @@ -301,23 +316,24 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, - void **base_addr, int *base_size) + void **base_addr, int *base_size, + bool *is_inline) { void *cur_addr, *txattr_addr, *last_txattr_addr; void *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int inline_size = inline_xattr_size(inode); - int err = 0; + int err; if (!xnid && !inline_size) return -ENODATA; - *base_size = XATTR_SIZE(xnid, inode) + XATTR_PADDING_SIZE; - txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS); + *base_size = XATTR_SIZE(inode) + XATTR_PADDING_SIZE; + txattr_addr = xattr_alloc(F2FS_I_SB(inode), *base_size, is_inline); if (!txattr_addr) return -ENOMEM; - last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(xnid, inode); + last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(inode); /* read from inline xattr */ if (inline_size) { @@ -345,12 +361,14 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, else cur_addr = txattr_addr; - *xe = 
__find_xattr(cur_addr, last_txattr_addr, index, len, name); + *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto out; } check: @@ -362,7 +380,7 @@ check: *base_addr = txattr_addr; return 0; out: - kvfree(txattr_addr); + xattr_free(F2FS_I_SB(inode), txattr_addr, *is_inline); return err; } @@ -405,7 +423,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, *base_addr = txattr_addr; return 0; fail: - kvfree(txattr_addr); + kfree(txattr_addr); return err; } @@ -466,6 +484,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, f2fs_wait_on_page_writeback(xpage, NODE, true, true); } else { struct dnode_of_data dn; + set_new_dnode(&dn, inode, NULL, NULL, new_nid); xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { @@ -495,10 +514,11 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { struct f2fs_xattr_entry *entry = NULL; - int error = 0; + int error; unsigned int size, len; void *base_addr = NULL; int base_size; + bool is_inline; if (name == NULL) return -EINVAL; @@ -507,10 +527,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - down_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, - &entry, &base_addr, &base_size); - up_read(&F2FS_I(inode)->i_xattr_sem); + &entry, &base_addr, &base_size, &is_inline); + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -532,26 +552,25 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, } error = size; out: - kvfree(base_addr); + xattr_free(F2FS_I_SB(inode), base_addr, 
is_inline); return error; } ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); - nid_t xnid = F2FS_I(inode)->i_xattr_nid; struct f2fs_xattr_entry *entry; void *base_addr, *last_base_addr; - int error = 0; + int error; size_t rest = buffer_size; - down_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); - up_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; - last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode); + last_base_addr = (void *)base_addr + XATTR_SIZE(inode); list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -566,6 +585,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto cleanup; } @@ -590,7 +611,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - kvfree(base_addr); + kfree(base_addr); return error; } @@ -609,11 +630,10 @@ static int __f2fs_setxattr(struct inode *inode, int index, { struct f2fs_xattr_entry *here, *last; void *base_addr, *last_base_addr; - nid_t xnid = F2FS_I(inode)->i_xattr_nid; int found, newsize; size_t len; __u32 new_hsize; - int error = 0; + int error; if (name == NULL) return -EINVAL; @@ -633,15 +653,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) return error; - last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode); + last_base_addr = (void *)base_addr + XATTR_SIZE(inode); /* find entry with wanted name. 
*/ - here = __find_xattr(base_addr, last_base_addr, index, len, name); + here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } @@ -654,15 +676,26 @@ static int __f2fs_setxattr(struct inode *inode, int index, } if (value && f2fs_xattr_value_same(here, value, size)) - goto exit; + goto same; } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; } last = here; - while (!IS_XATTR_LAST_ENTRY(last)) + while (!IS_XATTR_LAST_ENTRY(last)) { + if ((void *)(last) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu", + inode->i_ino, ENTRY_SIZE(last)); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); + goto exit; + } last = XATTR_NEXT_ENTRY(last); + } newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); @@ -719,19 +752,22 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = current_time(inode); - clear_inode_flag(inode, FI_ACL_MODE); - } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); + +same: + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + inode->i_ctime = current_time(inode); + clear_inode_flag(inode, FI_ACL_MODE); + } + exit: - kvfree(base_addr); + kfree(base_addr); return error; } @@ -747,7 +783,7 @@ 
int f2fs_setxattr(struct inode *inode, int index, const char *name, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; @@ -758,14 +794,34 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - /* protect xattr_ver */ - down_write(&F2FS_I(inode)->i_sem); - down_write(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - up_write(&F2FS_I(inode)->i_xattr_sem); - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_xattr_sem); f2fs_unlock_op(sbi); f2fs_update_time(sbi, REQ_TIME); return err; } + +int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + char slab_name[32]; + + sprintf(slab_name, "f2fs_xattr_entry-%u:%u", MAJOR(dev), MINOR(dev)); + + sbi->inline_xattr_slab_size = F2FS_OPTION(sbi).inline_xattr_size * + sizeof(__le32) + XATTR_PADDING_SIZE; + + sbi->inline_xattr_slab = f2fs_kmem_cache_create(slab_name, + sbi->inline_xattr_slab_size); + if (!sbi->inline_xattr_slab) + return -ENOMEM; + + return 0; +} + +void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) +{ + kmem_cache_destroy(sbi->inline_xattr_slab); +} diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index de0c600b9cab..416d652774a3 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/xattr.h * @@ -49,7 +49,7 @@ struct f2fs_xattr_entry { __u8 e_name_index; __u8 e_name_len; __le16 e_value_size; /* size of attribute value */ - char e_name[0]; /* attribute name */ + char e_name[]; /* attribute name */ }; #define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) @@ -73,7 +73,8 @@ struct f2fs_xattr_entry { entry = XATTR_NEXT_ENTRY(entry)) #define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct 
node_footer)) #define XATTR_PADDING_SIZE (sizeof(__u32)) -#define XATTR_SIZE(x,i) (((x) ? VALID_XATTR_BLOCK_SIZE : 0) + \ +#define XATTR_SIZE(i) ((F2FS_I(i)->i_xattr_nid ? \ + VALID_XATTR_BLOCK_SIZE : 0) + \ (inline_xattr_size(i))) #define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ VALID_XATTR_BLOCK_SIZE) @@ -130,9 +131,12 @@ extern int f2fs_setxattr(struct inode *, int, const char *, extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t, struct page *); extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); +extern int f2fs_init_xattr_caches(struct f2fs_sb_info *); +extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *); #else #define f2fs_xattr_handlers NULL +#define f2fs_listxattr NULL static inline int f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *page, int flags) @@ -145,11 +149,8 @@ static inline int f2fs_getxattr(struct inode *inode, int index, { return -EOPNOTSUPP; } -static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, - size_t buffer_size) -{ - return -EOPNOTSUPP; -} +static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { } #endif #ifdef CONFIG_F2FS_FS_SECURITY |