From 40a8f0d5e7b3999f096570edab71c345da812e3e Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:32 +0800 Subject: ubifs: rename_whiteout: Fix double free for whiteout_ui->data 'whiteout_ui->data' will be freed twice if space budget fail for rename whiteout operation as following process: rename_whiteout dev = kmalloc whiteout_ui->data = dev kfree(whiteout_ui->data) // Free first time iput(whiteout) ubifs_free_inode kfree(ui->data) // Double free! KASAN reports: ================================================================== BUG: KASAN: double-free or invalid-free in ubifs_free_inode+0x4f/0x70 Call Trace: kfree+0x117/0x490 ubifs_free_inode+0x4f/0x70 [ubifs] i_callback+0x30/0x60 rcu_do_batch+0x366/0xac0 __do_softirq+0x133/0x57f Allocated by task 1506: kmem_cache_alloc_trace+0x3c2/0x7a0 do_rename+0x9b7/0x1150 [ubifs] ubifs_rename+0x106/0x1f0 [ubifs] do_syscall_64+0x35/0x80 Freed by task 1506: kfree+0x117/0x490 do_rename.cold+0x53/0x8a [ubifs] ubifs_rename+0x106/0x1f0 [ubifs] do_syscall_64+0x35/0x80 The buggy address belongs to the object at ffff88810238bed8 which belongs to the cache kmalloc-8 of size 8 ================================================================== Let ubifs_free_inode() free 'whiteout_ui->data'. BTW, delete unused assignment 'whiteout_ui->data_len = 0', process 'ubifs_evict_inode() -> ubifs_jnl_delete_inode() -> ubifs_jnl_write_inode()' doesn't need it (because 'inc_nlink(whiteout)' won't be excuted by 'goto out_release', and the nlink of whiteout inode is 0). Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index dbe72f664abf..d3ed93d94930 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1425,8 +1425,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, err = ubifs_budget_space(c, &wht_req); if (err) { - kfree(whiteout_ui->data); - whiteout_ui->data_len = 0; iput(whiteout); goto out_release; } -- cgit v1.2.3-59-g8ed1b From afd427048047e8efdedab30e8888044e2be5aa9c Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:33 +0800 Subject: ubifs: Fix deadlock in concurrent rename whiteout and inode writeback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following hung tasks: [ 77.028764] task:kworker/u8:4 state:D stack: 0 pid: 132 [ 77.028820] Call Trace: [ 77.029027] schedule+0x8c/0x1b0 [ 77.029067] mutex_lock+0x50/0x60 [ 77.029074] ubifs_write_inode+0x68/0x1f0 [ubifs] [ 77.029117] __writeback_single_inode+0x43c/0x570 [ 77.029128] writeback_sb_inodes+0x259/0x740 [ 77.029148] wb_writeback+0x107/0x4d0 [ 77.029163] wb_workfn+0x162/0x7b0 [ 92.390442] task:aa state:D stack: 0 pid: 1506 [ 92.390448] Call Trace: [ 92.390458] schedule+0x8c/0x1b0 [ 92.390461] wb_wait_for_completion+0x82/0xd0 [ 92.390469] __writeback_inodes_sb_nr+0xb2/0x110 [ 92.390472] writeback_inodes_sb_nr+0x14/0x20 [ 92.390476] ubifs_budget_space+0x705/0xdd0 [ubifs] [ 92.390503] do_rename.cold+0x7f/0x187 [ubifs] [ 92.390549] ubifs_rename+0x8b/0x180 [ubifs] [ 92.390571] vfs_rename+0xdb2/0x1170 [ 92.390580] do_renameat2+0x554/0x770 , are caused by concurrent rename whiteout and inode writeback processes: rename_whiteout(Thread 1) wb_workfn(Thread2) ubifs_rename do_rename lock_4_inodes (Hold ui_mutex) ubifs_budget_space make_free_space shrink_liability __writeback_inodes_sb_nr bdi_split_work_to_wbs (Queue new wb work) wb_do_writeback(wb work) __writeback_single_inode ubifs_write_inode LOCK(ui_mutex) ↑ wb_wait_for_completion (Wait wb work) <-- deadlock! Reproducer (Detail program in [Link]): 1. SYS_renameat2("/mp/dir/file", "/mp/dir/whiteout", RENAME_WHITEOUT) 2. Consume out of space before kernel(mdelay) doing budget for whiteout Fix it by doing whiteout space budget before locking ubifs inodes. BTW, it also fixes wrong goto tag 'out_release' in whiteout budget error handling path(It should at least recover dir i_size and unlock 4 ubifs inodes). Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214733 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index d3ed93d94930..f4783e1ae775 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1324,6 +1324,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (flags & RENAME_WHITEOUT) { union ubifs_dev_desc *dev = NULL; + struct ubifs_budget_req wht_req; dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!dev) { @@ -1345,6 +1346,20 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); + + memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); + wht_req.dirtied_ino = 1; + wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8); + /* + * To avoid deadlock between space budget (holds ui_mutex and + * waits wb work) and writeback work(waits ui_mutex), do space + * budget before ubifs inodes locked. + */ + err = ubifs_budget_space(c, &wht_req); + if (err) { + iput(whiteout); + goto out_release; + } } lock_4_inodes(old_dir, new_dir, new_inode, whiteout); @@ -1419,16 +1434,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, } if (whiteout) { - struct ubifs_budget_req wht_req = { .dirtied_ino = 1, - .dirtied_ino_d = \ - ALIGN(ubifs_inode(whiteout)->data_len, 8) }; - - err = ubifs_budget_space(c, &wht_req); - if (err) { - iput(whiteout); - goto out_release; - } - inc_nlink(whiteout); mark_inode_dirty(whiteout); -- cgit v1.2.3-59-g8ed1b From 7a8884feec908afd05ebd45db5eba9a03674137e Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:34 +0800 Subject: ubifs: Fix wrong number of inodes locked by ui_mutex in ubifs_inode comment Since 9ec64962afb1702f75b("ubifs: Implement RENAME_EXCHANGE") and 9e0a1fff8db56eaaebb("ubifs: Implement RENAME_WHITEOUT") are applied, ubifs_rename locks and changes 4 ubifs inodes, correct the comment for ui_mutex in ubifs_inode. Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/ubifs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index f55828c0a300..008fa46ef61e 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -381,7 +381,7 @@ struct ubifs_gced_idx_leb { * @ui_mutex exists for two main reasons. At first it prevents inodes from * being written back while UBIFS changing them, being in the middle of an VFS * operation. This way UBIFS makes sure the inode fields are consistent. For - * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * example, in 'ubifs_rename()' we change 4 inodes simultaneously, and * write-back must not write any of them before we have finished. * * The second reason is budgeting - UBIFS has to budget all operations. If an -- cgit v1.2.3-59-g8ed1b From 716b4573026bcbfa7b58ed19fe15554bac66b082 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:35 +0800 Subject: ubifs: Add missing iput if do_tmpfile() failed in rename whiteout whiteout inode should be put when do_tmpfile() failed if inode has been initialized. Otherwise we will get following warning during umount: UBIFS error (ubi0:0 pid 1494): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth == 0, in fs/ubifs/super.c:1930 VFS: Busy inodes after unmount of ubifs. Self-destruct in 5 seconds. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng Suggested-by: Sascha Hauer Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index f4783e1ae775..745e6e3f246e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -432,6 +432,8 @@ out_inode: make_bad_inode(inode); if (!instantiated) iput(inode); + else if (whiteout) + iput(*whiteout); out_budg: ubifs_release_budget(c, &req); if (!instantiated) -- cgit v1.2.3-59-g8ed1b From 278d9a243635f26c05ad95dcf9c5a593b9e04dc6 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:36 +0800 Subject: ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 144 ++++++++++++++++++++++++++++++++++------------------- fs/ubifs/journal.c | 52 ++++++++++++++++--- 2 files changed, 136 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 745e6e3f246e..f2a6b45bfdae 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -349,8 +349,56 @@ out_budg: return err; } -static int do_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode, struct inode **whiteout) +static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err; + umode_t mode = S_IFCHR | WHITEOUT_MODE; + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm; + + /* + * Create an inode('nlink = 1') for whiteout without updating journal, + * let ubifs_jnl_rename() store it on flash to complete rename whiteout + * atomically. + */ + + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + return ERR_PTR(err); + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + /* The dir size is updated by do_rename. */ + insert_inode_hash(inode); + + return inode; + +out_inode: + make_bad_inode(inode); + iput(inode); +out_free: + fscrypt_free_filename(&nm); + ubifs_err(c, "cannot create whiteout file, error %d", err); + return ERR_PTR(err); +} + +static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; @@ -392,25 +440,13 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, } ui = ubifs_inode(inode); - if (whiteout) { - init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); - ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); - } - err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) goto out_inode; mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); - - if (whiteout) { - mark_inode_dirty(inode); - drop_nlink(inode); - *whiteout = inode; - } else { - d_tmpfile(dentry, inode); - } + d_tmpfile(dentry, inode); ubifs_assert(c, ui->dirty); instantiated = 1; @@ -432,8 +468,6 @@ out_inode: make_bad_inode(inode); if (!instantiated) iput(inode); - else if (whiteout) - iput(*whiteout); out_budg: ubifs_release_budget(c, &req); if (!instantiated) @@ -443,12 +477,6 @@ out_budg: return err; } -static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) -{ - return do_tmpfile(dir, dentry, mode, NULL); -} - /** * vfs_dent_type - get VFS directory entry type. * @type: UBIFS directory entry type @@ -1266,17 +1294,19 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct ubifs_budget_req wht_req; struct timespec64 time; unsigned int saved_nlink; struct fscrypt_name old_nm, new_nm; /* - * Budget request settings: deletion direntry, new direntry, removing - * the old inode, and changing old and new parent directory inodes. + * Budget request settings: + * req: deletion direntry, new direntry, removing the old inode, + * and changing old and new parent directory inodes. * - * However, this operation also marks the target inode as dirty and - * does not write it, so we allocate budget for the target inode - * separately. + * wht_req: new whiteout inode for RENAME_WHITEOUT. + * + * ino_req: marks the target inode as dirty and does not write it. */ dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x", @@ -1326,7 +1356,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (flags & RENAME_WHITEOUT) { union ubifs_dev_desc *dev = NULL; - struct ubifs_budget_req wht_req; dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!dev) { @@ -1334,24 +1363,26 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_release; } - err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout); - if (err) { + /* + * The whiteout inode without dentry is pinned in memory, + * umount won't happen during rename process because we + * got parent dentry. + */ + whiteout = create_whiteout(old_dir, old_dentry); + if (IS_ERR(whiteout)) { + err = PTR_ERR(whiteout); kfree(dev); goto out_release; } - spin_lock(&whiteout->i_lock); - whiteout->i_state |= I_LINKABLE; - spin_unlock(&whiteout->i_lock); - whiteout_ui = ubifs_inode(whiteout); whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); - wht_req.dirtied_ino = 1; - wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8); + wht_req.new_ino = 1; + wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8); /* * To avoid deadlock between space budget (holds ui_mutex and * waits wb work) and writeback work(waits ui_mutex), do space @@ -1359,6 +1390,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, */ err = ubifs_budget_space(c, &wht_req); if (err) { + /* + * Whiteout inode can not be written on flash by + * ubifs_jnl_write_inode(), because it's neither + * dirty nor zero-nlink. + */ iput(whiteout); goto out_release; } @@ -1433,17 +1469,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); if (unlink && IS_SYNC(new_inode)) sync = 1; - } - - if (whiteout) { - inc_nlink(whiteout); - mark_inode_dirty(whiteout); - - spin_lock(&whiteout->i_lock); - whiteout->i_state &= ~I_LINKABLE; - spin_unlock(&whiteout->i_lock); - - iput(whiteout); + /* + * S_SYNC flag of whiteout inherits from the old_dir, and we + * have already checked the old dir inode. So there is no need + * to check whiteout. + */ } err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, @@ -1454,6 +1484,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &req); + if (whiteout) { + ubifs_release_budget(c, &wht_req); + iput(whiteout); + } + mutex_lock(&old_inode_ui->ui_mutex); release = old_inode_ui->dirty; mark_inode_dirty_sync(old_inode); @@ -1462,11 +1497,16 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + /* + * Rename finished here. Although old inode cannot be updated + * on flash, old ctime is not a big problem, don't return err + * code to userspace. + */ + old_inode->i_sb->s_op->write_inode(old_inode, NULL); fscrypt_free_filename(&old_nm); fscrypt_free_filename(&new_nm); - return err; + return 0; out_cancel: if (unlink) { @@ -1487,11 +1527,11 @@ out_cancel: inc_nlink(old_dir); } } + unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); if (whiteout) { - drop_nlink(whiteout); + ubifs_release_budget(c, &wht_req); iput(whiteout); } - unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); out_release: ubifs_release_budget(c, &ino_req); ubifs_release_budget(c, &req); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 8ea680dba61e..75dab0ae3939 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1207,9 +1207,9 @@ out_free: * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the re-name operation which may involve writing up - * to 4 inodes and 2 directory entries. It marks the written inodes as clean - * and returns zero on success. In case of failure, a negative error code is - * returned. + * to 4 inodes(new inode, whiteout inode, old and new parent directory inodes) + * and 2 directory entries. It marks the written inodes as clean and returns + * zero on success. In case of failure, a negative error code is returned. */ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *old_inode, @@ -1222,14 +1222,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, void *p; union ubifs_key key; struct ubifs_dent_node *dent, *dent2; - int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0; + int err, dlen1, dlen2, ilen, wlen, lnum, offs, len, orphan_added = 0; int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); - struct ubifs_inode *new_ui; + struct ubifs_inode *new_ui, *whiteout_ui; u8 hash_old_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_inode[UBIFS_HASH_ARR_SZ]; + u8 hash_whiteout_inode[UBIFS_HASH_ARR_SZ]; u8 hash_dent1[UBIFS_HASH_ARR_SZ]; u8 hash_dent2[UBIFS_HASH_ARR_SZ]; @@ -1249,9 +1250,20 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, } else ilen = 0; + if (whiteout) { + whiteout_ui = ubifs_inode(whiteout); + ubifs_assert(c, mutex_is_locked(&whiteout_ui->ui_mutex)); + ubifs_assert(c, whiteout->i_nlink == 1); + ubifs_assert(c, !whiteout_ui->dirty); + wlen = UBIFS_INO_NODE_SZ; + wlen += whiteout_ui->data_len; + } else + wlen = 0; + aligned_dlen1 = ALIGN(dlen1, 8); aligned_dlen2 = ALIGN(dlen2, 8); - len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); + len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + + ALIGN(wlen, 8) + ALIGN(plen, 8); if (move) len += plen; @@ -1313,6 +1325,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, p += ALIGN(ilen, 8); } + if (whiteout) { + pack_inode(c, p, whiteout, 0); + err = ubifs_node_calc_hash(c, p, hash_whiteout_inode); + if (err) + goto out_release; + + p += ALIGN(wlen, 8); + } + if (!move) { pack_inode(c, p, old_dir, 1); err = ubifs_node_calc_hash(c, p, hash_old_dir); @@ -1352,6 +1373,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, if (new_inode) ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, new_inode->i_ino); + if (whiteout) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + whiteout->i_ino); } release_head(c, BASEHD); @@ -1368,8 +1392,6 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm); if (err) goto out_ro; - - ubifs_delete_orphan(c, whiteout->i_ino); } else { err = ubifs_add_dirt(c, lnum, dlen2); if (err) @@ -1390,6 +1412,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, offs += ALIGN(ilen, 8); } + if (whiteout) { + ino_key_init(c, &key, whiteout->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, wlen, + hash_whiteout_inode); + if (err) + goto out_ro; + offs += ALIGN(wlen, 8); + } + ino_key_init(c, &key, old_dir->i_ino); err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir); if (err) @@ -1410,6 +1441,11 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, new_ui->synced_i_size = new_ui->ui_size; spin_unlock(&new_ui->ui_lock); } + /* + * No need to mark whiteout inode clean. + * Whiteout doesn't have non-zero size, no need to update + * synced_i_size for whiteout_ui. + */ mark_inode_clean(c, ubifs_inode(old_dir)); if (move) mark_inode_clean(c, ubifs_inode(new_dir)); -- cgit v1.2.3-59-g8ed1b From 60eb3b9c9f11206996f57cb89521824304b305ad Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:37 +0800 Subject: ubifs: Fix 'ui->dirty' race between do_tmpfile() and writeback work 'ui->dirty' is not protected by 'ui_mutex' in function do_tmpfile() which may race with ubifs_write_inode[wb_workfn] to access/update 'ui->dirty', finally dirty space is released twice. open(O_TMPFILE) wb_workfn do_tmpfile ubifs_budget_space(ino_req = { .dirtied_ino = 1}) d_tmpfile // mark inode(tmpfile) dirty ubifs_jnl_update // without holding tmpfile's ui_mutex mark_inode_clean(ui) if (ui->dirty) ubifs_release_dirty_inode_budget(ui) // release first time ubifs_write_inode mutex_lock(&ui->ui_mutex) ubifs_release_dirty_inode_budget(ui) // release second time mutex_unlock(&ui->ui_mutex) ui->dirty = 0 Run generic/476 can reproduce following message easily (See reproducer in [Link]): UBIFS error (ubi0:0 pid 2578): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth >= 0, in fs/ubifs/budget.c:554 UBIFS warning (ubi0:0 pid 2578): ubifs_ro_mode [ubifs]: switched to read-only mode, error -22 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: ubifs_ro_mode+0x54/0x60 [ubifs] ubifs_assert_failed+0x4b/0x80 [ubifs] ubifs_release_budget+0x468/0x5a0 [ubifs] ubifs_release_dirty_inode_budget+0x53/0x80 [ubifs] ubifs_write_inode+0x121/0x1f0 [ubifs] ... wb_workfn+0x283/0x7b0 Fix it by holding tmpfile ubifs inode lock during ubifs_jnl_update(). Similar problem exists in whiteout renaming, but previous fix("ubifs: Rename whiteout atomically") has solved the problem. Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214765 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 60 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index f2a6b45bfdae..faca567ef06b 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -397,6 +397,32 @@ out_free: return ERR_PTR(err); } +/** + * lock_2_inodes - a wrapper for locking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); +} + +/** + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -404,7 +430,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct ubifs_info *c = dir->i_sb->s_fs_info; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; - struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); + struct ubifs_inode *ui; int err, instantiated = 0; struct fscrypt_name nm; @@ -452,18 +478,18 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, instantiated = 1; mutex_unlock(&ui->ui_mutex); - mutex_lock(&dir_ui->ui_mutex); + lock_2_inodes(dir, inode); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); return 0; out_cancel: - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); out_inode: make_bad_inode(inode); if (!instantiated) @@ -690,32 +716,6 @@ static int ubifs_dir_release(struct inode *dir, struct file *file) return 0; } -/** - * lock_2_inodes - a wrapper for locking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - * - * We do not implement any tricks to guarantee strict lock ordering, because - * VFS has already done it for us on the @i_mutex. So this is just a simple - * wrapper function. - */ -static void lock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); - mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); -} - -/** - * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - */ -static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_unlock(&ubifs_inode(inode2)->ui_mutex); - mutex_unlock(&ubifs_inode(inode1)->ui_mutex); -} - static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { -- cgit v1.2.3-59-g8ed1b From a6dab6607d4681d227905d5198710b575dbdb519 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:38 +0800 Subject: ubifs: Rectify space amount budget for mkdir/tmpfile operations UBIFS should make sure the flash has enough space to store dirty (Data that is newer than disk) data (in memory), space budget is exactly designed to do that. If space budget calculates less data than we need, 'make_reservation()' will do more work(return -ENOSPC if no free space lelf, sometimes we can see "cannot reserve xxx bytes in jhead xxx, error -28" in ubifs error messages) with ubifs inodes locked, which may effect other syscalls. A simple way to decide how much space do we need when make a budget: See how much space is needed by 'make_reservation()' in ubifs_jnl_xxx() function according to corresponding operation. It's better to report ENOSPC in ubifs_budget_space(), as early as we can. Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Fixes: 1e51764a3c2ac05 ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index faca567ef06b..ae082a0be2a3 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -428,15 +428,18 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; struct ubifs_inode *ui; int err, instantiated = 0; struct fscrypt_name nm; /* - * Budget request settings: new dirty inode, new direntry, - * budget for dirtied inode will be released via writeback. + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + * Allocate budget separately for new dirtied inode, the budget will + * be released via writeback. */ dbg_gen("dent '%pd', mode %#hx in dir ino %lu", @@ -979,7 +982,8 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, sz_change; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct fscrypt_name nm; /* -- cgit v1.2.3-59-g8ed1b From 1b83ec057db16b4d0697dc21ef7a9743b6041f72 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:39 +0800 Subject: ubifs: setflags: Make dirtied_ino_d 8 bytes aligned Make 'ui->data_len' aligned with 8 bytes before it is assigned to dirtied_ino_d. Since 8871d84c8f8b0c6b("ubifs: convert to fileattr") applied, 'setflags()' only affects regular files and directories, only xattr inode, symlink inode and special inode(pipe/char_dev/block_dev) have none- zero 'ui->data_len' field, so assertion '!(req->dirtied_ino_d & 7)' cannot fail in ubifs_budget_space(). To avoid assertion fails in future evolution(eg. setflags can operate special inodes), it's better to make dirtied_ino_d 8 bytes aligned, after all aligned size is still zero for regular files. Fixes: 1e51764a3c2ac05a ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index c6a863487780..71bcebe45f9c 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -108,7 +108,7 @@ static int setflags(struct inode *inode, int flags) struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) -- cgit v1.2.3-59-g8ed1b From 4f2262a334641e05f645364d5ade1f565c85f20b Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:40 +0800 Subject: ubifs: Fix read out-of-bounds in ubifs_wbuf_write_nolock() Function ubifs_wbuf_write_nolock() may access buf out of bounds in following process: ubifs_wbuf_write_nolock(): aligned_len = ALIGN(len, 8); // Assume len = 4089, aligned_len = 4096 if (aligned_len <= wbuf->avail) ... // Not satisfy if (wbuf->used) { ubifs_leb_write() // Fill some data in avail wbuf len -= wbuf->avail; // len is still not 8-bytes aligned aligned_len -= wbuf->avail; } n = aligned_len >> c->max_write_shift; if (n) { n <<= c->max_write_shift; err = ubifs_leb_write(c, wbuf->lnum, buf + written, wbuf->offs, n); // n > len, read out of bounds less than 8(n-len) bytes } , which can be catched by KASAN: ========================================================= BUG: KASAN: slab-out-of-bounds in ecc_sw_hamming_calculate+0x1dc/0x7d0 Read of size 4 at addr ffff888105594ff8 by task kworker/u8:4/128 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: kasan_report.cold+0x81/0x165 nand_write_page_swecc+0xa9/0x160 ubifs_leb_write+0xf2/0x1b0 [ubifs] ubifs_wbuf_write_nolock+0x421/0x12c0 [ubifs] write_head+0xdc/0x1c0 [ubifs] ubifs_jnl_write_inode+0x627/0x960 [ubifs] wb_workfn+0x8af/0xb80 Function ubifs_wbuf_write_nolock() accepts that parameter 'len' is not 8 bytes aligned, the 'len' represents the true length of buf (which is allocated in 'ubifs_jnl_xxx', eg. ubifs_jnl_write_inode), so ubifs_wbuf_write_nolock() must handle the length read from 'buf' carefully to write leb safely. Fetch a reproducer in [Link]. Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214785 Reported-by: Chengsong Ke Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/io.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 789a7813f3fa..1607a3c76681 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -854,16 +854,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->max_write_shift; + int m = n - 1; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); - err = ubifs_leb_write(c, wbuf->lnum, buf + written, - wbuf->offs, n); + + if (m) { + /* '(n-1)<max_write_shift < len' is always true. */ + m <<= c->max_write_shift; + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, m); + if (err) + goto out; + wbuf->offs += m; + aligned_len -= m; + len -= m; + written += m; + } + + /* + * The non-written len of buf may be less than 'n' because + * parameter 'len' is not 8 bytes aligned, so here we read + * min(len, n) bytes from buf. + */ + n = 1 << c->max_write_shift; + memcpy(wbuf->buf, buf + written, min(len, n)); + if (n > len) { + ubifs_assert(c, n - len < 8); + ubifs_pad(c, wbuf->buf + len, n - len); + } + + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n); if (err) goto out; wbuf->offs += n; aligned_len -= n; - len -= n; + len -= min(len, n); written += n; } -- cgit v1.2.3-59-g8ed1b From 3b67db8a6ca83e6ff90b756d3da0c966f61cd37b Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 27 Dec 2021 11:22:41 +0800 Subject: ubifs: Fix to add refcount once page is set private MM defined the rule [1] very clearly that once page was set with PG_private flag, we should increment the refcount in that page, also main flows like pageout(), migrate_page() will assume there is one additional page reference count if page_has_private() returns true. Otherwise, we may get a BUG in page migration: page:0000000080d05b9d refcount:-1 mapcount:0 mapping:000000005f4d82a8 index:0xe2 pfn:0x14c12 aops:ubifs_file_address_operations [ubifs] ino:8f1 dentry name:"f30e" flags: 0x1fffff80002405(locked|uptodate|owner_priv_1|private|node=0| zone=1|lastcpupid=0x1fffff) page dumped because: VM_BUG_ON_PAGE(page_count(page) != 0) ------------[ cut here ]------------ kernel BUG at include/linux/page_ref.h:184! invalid opcode: 0000 [#1] SMP CPU: 3 PID: 38 Comm: kcompactd0 Not tainted 5.15.0-rc5 RIP: 0010:migrate_page_move_mapping+0xac3/0xe70 Call Trace: ubifs_migrate_page+0x22/0xc0 [ubifs] move_to_new_page+0xb4/0x600 migrate_pages+0x1523/0x1cc0 compact_zone+0x8c5/0x14b0 kcompactd+0x2bc/0x560 kthread+0x18c/0x1e0 ret_from_fork+0x1f/0x30 Before the time, we should make clean a concept, what does refcount means in page gotten from grab_cache_page_write_begin(). There are 2 situations: Situation 1: refcount is 3, page is created by __page_cache_alloc. TYPE_A - the write process is using this page TYPE_B - page is assigned to one certain mapping by calling __add_to_page_cache_locked() TYPE_C - page is added into pagevec list corresponding current cpu by calling lru_cache_add() Situation 2: refcount is 2, page is gotten from the mapping's tree TYPE_B - page has been assigned to one certain mapping TYPE_A - the write process is using this page (by calling page_cache_get_speculative()) Filesystem releases one refcount by calling put_page() in xxx_write_end(), the released refcount corresponds to TYPE_A (write task is using it). If there are any processes using a page, page migration process will skip the page by judging whether expected_page_refs() equals to page refcount. The BUG is caused by following process: PA(cpu 0) kcompactd(cpu 1) compact_zone ubifs_write_begin page_a = grab_cache_page_write_begin add_to_page_cache_lru lru_cache_add pagevec_add // put page into cpu 0's pagevec (refcnf = 3, for page creation process) ubifs_write_end SetPagePrivate(page_a) // doesn't increase page count ! unlock_page(page_a) put_page(page_a) // refcnt = 2 [...] PB(cpu 0) filemap_read filemap_get_pages add_to_page_cache_lru lru_cache_add __pagevec_lru_add // traverse all pages in cpu 0's pagevec __pagevec_lru_add_fn SetPageLRU(page_a) isolate_migratepages isolate_migratepages_block get_page_unless_zero(page_a) // refcnt = 3 list_add(page_a, from_list) migrate_pages(from_list) __unmap_and_move move_to_new_page ubifs_migrate_page(page_a) migrate_page_move_mapping expected_page_refs get 3 (migration[1] + mapping[1] + private[1]) release_pages put_page_testzero(page_a) // refcnt = 3 page_ref_freeze // refcnt = 0 page_ref_dec_and_test(0 - 1 = -1) page_ref_unfreeze VM_BUG_ON_PAGE(-1 != 0, page) UBIFS doesn't increase the page refcount after setting private flag, which leads to page migration task believes the page is not used by any other processes, so the page is migrated. This causes concurrent accessing on page refcount between put_page() called by other process(eg. read process calls lru_cache_add) and page_ref_unfreeze() called by migration task. Actually zhangjun has tried to fix this problem [2] by recalculating page refcnt in ubifs_migrate_page(). It's better to follow MM rules [1], because just like Kirill suggested in [2], we need to check all users of page_has_private() helper. Like f2fs does in [3], fix it by adding/deleting refcount when setting/clearing private for a page. BTW, according to [4], we set 'page->private' as 1 because ubifs just simply SetPagePrivate(). And, [5] provided a common helper to set/clear page private, ubifs can use this helper following the example of iomap, afs, btrfs, etc. Jump [6] to find a reproducer. [1] https://lore.kernel.org/lkml/2b19b3c4-2bc4-15fa-15cc-27a13e5c7af1@aol.com [2] https://www.spinics.net/lists/linux-mtd/msg04018.html [3] http://lkml.iu.edu/hypermail/linux/kernel/1903.0/03313.html [4] https://lore.kernel.org/linux-f2fs-devel/20210422154705.GO3596236@casper.infradead.org [5] https://lore.kernel.org/all/20200517214718.468-1-guoqing.jiang@cloud.ionos.com [6] https://bugzilla.kernel.org/show_bug.cgi?id=214961 Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5cfa28cd00cd..6b45a037a047 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, } if (!PagePrivate(page)) { - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } @@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len) release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); kunmap(page); @@ -1304,7 +1304,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset, release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); } @@ -1471,8 +1471,8 @@ static int ubifs_migrate_page(struct address_space *mapping, return rc; if (PagePrivate(page)) { - ClearPagePrivate(page); - SetPagePrivate(newpage); + detach_page_private(page); + attach_page_private(newpage, (void *)1); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -1496,7 +1496,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) return 0; ubifs_assert(c, PagePrivate(page)); ubifs_assert(c, 0); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); return 1; } @@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) else { if (!PageChecked(page)) ubifs_convert_page_budget(c); - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } -- cgit v1.2.3-59-g8ed1b From 163b438b510cf749cdd5e57fc5b40bf72fa09edb Mon Sep 17 00:00:00 2001 From: hongnanli Date: Fri, 11 Feb 2022 11:19:28 +0800 Subject: fs/jffs2: fix comments mentioning i_mutex inode->i_mutex has been replaced with inode->i_rwsem long ago. Fix comments still mentioning i_mutex. Signed-off-by: hongnanli Signed-off-by: Richard Weinberger --- fs/jffs2/jffs2_fs_i.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 2e4a86763c07..93a2951538ce 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -18,11 +18,11 @@ #include struct jffs2_inode_info { - /* We need an internal mutex similar to inode->i_mutex. + /* We need an internal mutex similar to inode->i_rwsem. Unfortunately, we can't used the existing one, because either the GC would deadlock, or we'd have to release it before letting GC proceed. Or we'd have to put ugliness - into the GC code so it didn't attempt to obtain the i_mutex + into the GC code so it didn't attempt to obtain the i_rwsem for the inode(s) which are already locked */ struct mutex sem; -- cgit v1.2.3-59-g8ed1b From 4c7c44ee1650677fbe89d86edbad9497b7679b5c Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 28 Dec 2021 20:54:30 +0800 Subject: jffs2: fix use-after-free in jffs2_clear_xattr_subsystem When we mount a jffs2 image, assume that the first few blocks of the image are normal and contain at least one xattr-related inode, but the next block is abnormal. As a result, an error is returned in jffs2_scan_eraseblock(). jffs2_clear_xattr_subsystem() is then called in jffs2_build_filesystem() and then again in jffs2_do_fill_super(). Finally we can observe the following report: ================================================================== BUG: KASAN: use-after-free in jffs2_clear_xattr_subsystem+0x95/0x6ac Read of size 8 at addr ffff8881243384e0 by task mount/719 Call Trace: dump_stack+0x115/0x16b jffs2_clear_xattr_subsystem+0x95/0x6ac jffs2_do_fill_super+0x84f/0xc30 jffs2_fill_super+0x2ea/0x4c0 mtd_get_sb+0x254/0x400 mtd_get_sb_by_nr+0x4f/0xd0 get_tree_mtd+0x498/0x840 jffs2_get_tree+0x25/0x30 vfs_get_tree+0x8d/0x2e0 path_mount+0x50f/0x1e50 do_mount+0x107/0x130 __se_sys_mount+0x1c5/0x2f0 __x64_sys_mount+0xc7/0x160 do_syscall_64+0x45/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Allocated by task 719: kasan_save_stack+0x23/0x60 __kasan_kmalloc.constprop.0+0x10b/0x120 kasan_slab_alloc+0x12/0x20 kmem_cache_alloc+0x1c0/0x870 jffs2_alloc_xattr_ref+0x2f/0xa0 jffs2_scan_medium.cold+0x3713/0x4794 jffs2_do_mount_fs.cold+0xa7/0x2253 jffs2_do_fill_super+0x383/0xc30 jffs2_fill_super+0x2ea/0x4c0 [...] Freed by task 719: kmem_cache_free+0xcc/0x7b0 jffs2_free_xattr_ref+0x78/0x98 jffs2_clear_xattr_subsystem+0xa1/0x6ac jffs2_do_mount_fs.cold+0x5e6/0x2253 jffs2_do_fill_super+0x383/0xc30 jffs2_fill_super+0x2ea/0x4c0 [...] The buggy address belongs to the object at ffff8881243384b8 which belongs to the cache jffs2_xattr_ref of size 48 The buggy address is located 40 bytes inside of 48-byte region [ffff8881243384b8, ffff8881243384e8) [...] ================================================================== The triggering of the BUG is shown in the following stack: ----------------------------------------------------------- jffs2_fill_super jffs2_do_fill_super jffs2_do_mount_fs jffs2_build_filesystem jffs2_scan_medium jffs2_scan_eraseblock <--- ERROR jffs2_clear_xattr_subsystem <--- free jffs2_clear_xattr_subsystem <--- free again ----------------------------------------------------------- An error is returned in jffs2_do_mount_fs(). If the error is returned by jffs2_sum_init(), the jffs2_clear_xattr_subsystem() does not need to be executed. If the error is returned by jffs2_build_filesystem(), the jffs2_clear_xattr_subsystem() also does not need to be executed again. So move jffs2_clear_xattr_subsystem() from 'out_inohash' to 'out_root' to fix this UAF problem. Fixes: aa98d7cf59b5 ("[JFFS2][XATTR] XATTR support on JFFS2 (version. 5)") Cc: stable@vger.kernel.org Reported-by: Hulk Robot Signed-off-by: Baokun Li Signed-off-by: Richard Weinberger --- fs/jffs2/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2ac410477c4f..71f03a5d36ed 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -603,8 +603,8 @@ out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); kvfree(c->blocks); - out_inohash: jffs2_clear_xattr_subsystem(c); + out_inohash: kfree(c->inocache_list); out_wbuf: jffs2_flash_cleanup(c); -- cgit v1.2.3-59-g8ed1b From d051cef784de4d54835f6b6836d98a8f6935772c Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 14 Jan 2022 18:28:53 +0800 Subject: jffs2: fix memory leak in jffs2_do_mount_fs If jffs2_build_filesystem() in jffs2_do_mount_fs() returns an error, we can observe the following kmemleak report: -------------------------------------------- unreferenced object 0xffff88811b25a640 (size 64): comm "mount", pid 691, jiffies 4294957728 (age 71.952s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmem_cache_alloc_trace+0x584/0x880 [] jffs2_sum_init+0x86/0x130 [] jffs2_do_mount_fs+0x798/0xac0 [] jffs2_do_fill_super+0x383/0xc30 [] jffs2_fill_super+0x2ea/0x4c0 [...] unreferenced object 0xffff88812c760000 (size 65536): comm "mount", pid 691, jiffies 4294957728 (age 71.952s) hex dump (first 32 bytes): bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ backtrace: [] __kmalloc+0x6b9/0x910 [] jffs2_sum_init+0xd7/0x130 [] jffs2_do_mount_fs+0x798/0xac0 [] jffs2_do_fill_super+0x383/0xc30 [] jffs2_fill_super+0x2ea/0x4c0 [...] -------------------------------------------- This is because the resources allocated in jffs2_sum_init() are not released. Call jffs2_sum_exit() to release these resources to solve the problem. Fixes: e631ddba5887 ("[JFFS2] Add erase block summary support (mount time improvement)") Cc: stable@vger.kernel.org Signed-off-by: Baokun Li Signed-off-by: Richard Weinberger --- fs/jffs2/build.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index b288c8ae1236..837cd55fd4c5 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -415,13 +415,15 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); ret = -EIO; - goto out_free; + goto out_sum_exit; } jffs2_calc_trigger_levels(c); return 0; + out_sum_exit: + jffs2_sum_exit(c); out_free: kvfree(c->blocks); -- cgit v1.2.3-59-g8ed1b From 9cdd3128874f5fe759e2c4e1360ab7fb96a8d1df Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 14 Jan 2022 18:28:54 +0800 Subject: jffs2: fix memory leak in jffs2_scan_medium If an error is returned in jffs2_scan_eraseblock() and some memory has been added to the jffs2_summary *s, we can observe the following kmemleak report: -------------------------------------------- unreferenced object 0xffff88812b889c40 (size 64): comm "mount", pid 692, jiffies 4294838325 (age 34.288s) hex dump (first 32 bytes): 40 48 b5 14 81 88 ff ff 01 e0 31 00 00 00 50 00 @H........1...P. 00 00 01 00 00 00 01 00 00 00 02 00 00 00 09 08 ................ backtrace: [] __kmalloc+0x613/0x910 [] jffs2_sum_add_dirent_mem+0x5c/0xa0 [] jffs2_scan_medium.cold+0x36e5/0x4794 [] jffs2_do_mount_fs.cold+0xa7/0x2267 [] jffs2_do_fill_super+0x383/0xc30 [] jffs2_fill_super+0x2ea/0x4c0 [] mtd_get_sb+0x254/0x400 [] mtd_get_sb_by_nr+0x4f/0xd0 [] get_tree_mtd+0x498/0x840 [] jffs2_get_tree+0x25/0x30 [] vfs_get_tree+0x8d/0x2e0 [] path_mount+0x50f/0x1e50 [] do_mount+0x107/0x130 [] __se_sys_mount+0x1c5/0x2f0 [] __x64_sys_mount+0xc7/0x160 [] do_syscall_64+0x45/0x70 unreferenced object 0xffff888114b54840 (size 32): comm "mount", pid 692, jiffies 4294838325 (age 34.288s) hex dump (first 32 bytes): c0 75 b5 14 81 88 ff ff 02 e0 02 00 00 00 02 00 .u.............. 00 00 84 00 00 00 44 00 00 00 6b 6b 6b 6b 6b a5 ......D...kkkkk. backtrace: [] kmem_cache_alloc_trace+0x584/0x880 [] jffs2_sum_add_inode_mem+0x54/0x90 [] jffs2_scan_medium.cold+0x4481/0x4794 [...] unreferenced object 0xffff888114b57280 (size 32): comm "mount", pid 692, jiffies 4294838393 (age 34.357s) hex dump (first 32 bytes): 10 d5 6c 11 81 88 ff ff 08 e0 05 00 00 00 01 00 ..l............. 00 00 38 02 00 00 28 00 00 00 6b 6b 6b 6b 6b a5 ..8...(...kkkkk. backtrace: [] kmem_cache_alloc_trace+0x584/0x880 [] jffs2_sum_add_xattr_mem+0x54/0x90 [] jffs2_scan_medium.cold+0x298c/0x4794 [...] unreferenced object 0xffff8881116cd510 (size 16): comm "mount", pid 692, jiffies 4294838395 (age 34.355s) hex dump (first 16 bytes): 00 00 00 00 00 00 00 00 09 e0 60 02 00 00 6b a5 ..........`...k. backtrace: [] kmem_cache_alloc_trace+0x584/0x880 [] jffs2_sum_add_xref_mem+0x54/0x90 [] jffs2_scan_medium.cold+0x3a20/0x4794 [...] -------------------------------------------- Therefore, we should call jffs2_sum_reset_collected(s) on exit to release the memory added in s. In addition, a new tag "out_buf" is added to prevent the NULL pointer reference caused by s being NULL. (thanks to Zhang Yi for this analysis) Fixes: e631ddba5887 ("[JFFS2] Add erase block summary support (mount time improvement)") Cc: stable@vger.kernel.org Co-developed-with: Zhihao Cheng Signed-off-by: Baokun Li Signed-off-by: Richard Weinberger --- fs/jffs2/scan.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index b676056826be..29671e33a171 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -136,7 +136,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) if (!s) { JFFS2_WARNING("Can't allocate memory for summary\n"); ret = -ENOMEM; - goto out; + goto out_buf; } } @@ -275,13 +275,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) } ret = 0; out: + jffs2_sum_reset_collected(s); + kfree(s); + out_buf: if (buf_size) kfree(flashbuf); #ifndef __ECOS else mtd_unpoint(c->mtd, 0, c->mtd->size); #endif - kfree(s); return ret; } -- cgit v1.2.3-59-g8ed1b From 705757274599e2e064dd3054aabc74e8af31a095 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 15 Feb 2022 12:07:36 +0800 Subject: ubifs: rename_whiteout: correct old_dir size computing When renaming the whiteout file, the old whiteout file is not deleted. Therefore, we add the old dentry size to the old dir like XFS. Otherwise, an error may be reported due to `fscki->calc_sz != fscki->size` in check_indes. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Reported-by: Zhihao Cheng Signed-off-by: Baokun Li Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ae082a0be2a3..86151889548e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1402,6 +1402,9 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, iput(whiteout); goto out_release; } + + /* Add the old_dentry size to the old_dir size. */ + old_sz -= CALC_DENT_SIZE(fname_len(&old_nm)); } lock_4_inodes(old_dir, new_dir, new_inode, whiteout); -- cgit v1.2.3-59-g8ed1b