From 834224e81cdc265456f73fed748a349e43e2d8ef Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:00 +0000 Subject: ext4: convert i_fc_lock to spinlock Convert ext4_inode_info->i_fc_lock to spinlock to avoid sleeping in invalid contexts. Reviewed-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://patch.msgid.link/20250508175908.1004880-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 7 +++++-- fs/ext4/fast_commit.c | 19 +++++++++---------- fs/ext4/super.c | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5a20e9cd7184..79dfb57a7046 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1069,8 +1069,11 @@ struct ext4_inode_info { /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; - /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */ - struct mutex i_fc_lock; + /* + * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len + * and inode's EXT4_FC_STATE_COMMITTING state bit. + */ + spinlock_t i_fc_lock; /* * i_disksize keeps track of what the inode size is ON DISK, not diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index da4263a14a20..63859ec6d91d 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -385,7 +385,7 @@ static int ext4_fc_track_template( int ret; tid = handle->h_transaction->t_tid; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { @@ -393,8 +393,7 @@ static int ext4_fc_track_template( ei->i_sync_tid = tid; } ret = __fc_track_fn(handle, inode, args, update); - mutex_unlock(&ei->i_fc_lock); - + spin_unlock(&ei->i_fc_lock); if (!enqueue) return ret; @@ -428,19 +427,19 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -471,7 +470,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } spin_unlock(&sbi->s_fc_lock); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return 0; } @@ -893,15 +892,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) struct ext4_extent *ex; int ret; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 181934499624..ed8166fe2ad0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1415,7 +1415,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_datasync_tid = 0; INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); - mutex_init(&ei->i_fc_lock); + spin_lock_init(&ei->i_fc_lock); return &ei->vfs_inode; } -- cgit v1.2.3-59-g8ed1b From 4d3266463ed06af2916b306bdb0ebd647726ba49 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:01 +0000 Subject: ext4: for committing inode, make ext4_fc_track_inode wait If the inode that's being requested to track using ext4_fc_track_inode is being committed, then wait until the inode finishes the commit. Also, add calls to ext4_fc_track_inode at the right places. With this patch, now calling ext4_reserve_inode_write() results in inode being tracked for next fast commit. This ensures that by the time ext4_reserve_inode_write() returns, it is ready to be modified and won't be committed until the corresponding handle is open. A subtle lock ordering requirement with i_data_sem (which is documented in the code) requires that ext4_fc_track_inode() be called before grabbing i_data_sem. So, this patch also adds explicit ext4_fc_track_inode() calls in places where i_data_sem grabbed. Signed-off-by: Harshad Shirwadkar Link: https://patch.msgid.link/20250508175908.1004880-3-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 35 +++++++++++++++++++++++++++++++++++ fs/ext4/inline.c | 1 + fs/ext4/inode.c | 5 +++++ 3 files changed, 41 insertions(+) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 63859ec6d91d..c4d3c71d5e6c 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -12,6 +12,7 @@ #include "ext4_extents.h" #include "mballoc.h" +#include /* * Ext4 Fast Commits * ----------------- @@ -570,6 +571,8 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg, void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) @@ -587,6 +590,38 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; + if (!list_empty(&ei->i_fc_list)) + return; + + /* + * If we come here, we may sleep while waiting for the inode to + * commit. We shouldn't be holding i_data_sem when we go to sleep since + * the commit path needs to grab the lock while committing the inode. + */ + lockdep_assert_not_held(&ei->i_data_sem); + + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + schedule(); + finish_wait(wq, &wait.wq_entry); + } + + /* + * From this point on, this inode will not be committed either + * by fast or full commit as long as the handle is open. + */ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 2c9b762925c7..fbc1c84b555c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -601,6 +601,7 @@ retry: goto out; } + ext4_fc_track_inode(handle, inode); ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 94c7d2d828a6..d58b99407390 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -696,6 +696,8 @@ found: if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; + + ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take @@ -4072,6 +4074,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (end_lblk > start_lblk) { ext4_lblk_t hole_len = end_lblk - start_lblk; + ext4_fc_track_inode(handle, inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); @@ -4224,6 +4227,7 @@ int ext4_truncate(struct inode *inode) if (err) goto out_stop; + ext4_fc_track_inode(handle, inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); @@ -5895,6 +5899,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, brelse(iloc->bh); iloc->bh = NULL; } + ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; -- cgit v1.2.3-59-g8ed1b From 0b64fd74dd2054834c98867422bc9813e3bba8f4 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:02 +0000 Subject: ext4: mark inode dirty before grabbing i_data_sem in ext4_setattr Mark inode dirty first and then grab i_data_sem in ext4_setattr(). Signed-off-by: Harshad Shirwadkar Link: https://patch.msgid.link/20250508175908.1004880-4-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d58b99407390..3005053e92a7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5568,9 +5568,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code @@ -5581,6 +5579,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; ext4_journal_stop(handle); if (error) goto out_mmap_sem; -- cgit v1.2.3-59-g8ed1b From 857d32f2618166765ce9306a246d0745afc76859 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:03 +0000 Subject: ext4: rework fast commit commit path This patch reworks fast commit's commit path to remove locking the journal for the entire duration of a fast commit. Instead, we only lock the journal while marking all the eligible inodes as "committing". This allows handles to make progress in parallel with the fast commit. Signed-off-by: Harshad Shirwadkar Link: https://patch.msgid.link/20250508175908.1004880-5-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/fast_commit.c | 199 +++++++++++++++++++++++++++++++------------------- fs/jbd2/journal.c | 2 - 3 files changed, 126 insertions(+), 76 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 79dfb57a7046..493d9ac7a577 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1916,6 +1916,7 @@ enum { EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index c4d3c71d5e6c..a2cb4d965dc1 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -287,24 +287,55 @@ void ext4_fc_del(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; + wait_queue_head_t *wq; if (ext4_fc_disabled(inode->i_sb)) return; -restart: spin_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { spin_unlock(&sbi->s_fc_lock); return; } - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; + /* + * Since ext4_fc_del is called from ext4_evict_inode while having a + * handle open, there is no need for us to wait here even if a fast + * commit is going on. That is because, if this inode is being + * committed, ext4_mark_inode_dirty would have waited for inode commit + * operation to finish before we come here. So, by the time we come + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode + * here. + * + * We may come here without any handles open in the "no_delete" case of + * ext4_evict_inode as well. However, if that happens, we first mark the + * file system as fast commit ineligible anyway. So, even in that case, + * it is okay to remove the inode from the fc list. + */ + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); + while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { + spin_unlock(&sbi->s_fc_lock); + schedule(); + spin_lock(&sbi->s_fc_lock); + } + finish_wait(wq, &wait.wq_entry); } - - if (!list_empty(&ei->i_fc_list)) - list_del_init(&ei->i_fc_list); + list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC @@ -325,8 +356,6 @@ restart: release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - - return; } /* @@ -590,9 +619,6 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; - if (!list_empty(&ei->i_fc_list)) - return; - /* * If we come here, we may sleep while waiting for the inode to * commit. We shouldn't be holding i_data_sem when we go to sleep since @@ -988,61 +1014,25 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) } -/* Submit data for all the fast commit inodes */ -static int ext4_fc_submit_inode_data_all(journal_t *journal) +/* Flushes data of all the inodes in the commit queue. */ +static int ext4_fc_flush_data(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; - spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - while (atomic_read(&ei->i_fc_updates)) { - DEFINE_WAIT(wait); - - prepare_to_wait(&ei->i_fc_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&ei->i_fc_updates)) { - spin_unlock(&sbi->s_fc_lock); - schedule(); - spin_lock(&sbi->s_fc_lock); - } - finish_wait(&ei->i_fc_wait, &wait); - } - spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - - return ret; -} - -/* Wait for completion of data for all the fast commit inodes */ -static int ext4_fc_wait_inode_data_all(journal_t *journal) -{ - struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *pos, *n; - int ret = 0; - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - if (!ext4_test_inode_state(&pos->vfs_inode, - EXT4_STATE_FC_COMMITTING)) - continue; - spin_unlock(&sbi->s_fc_lock); - - ret = jbd2_wait_inode_data(journal, pos->jinode); + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ret = jbd2_wait_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); return 0; } @@ -1123,26 +1113,81 @@ static int ext4_fc_perform_commit(journal_t *journal) int ret = 0; u32 crc = 0; - ret = ext4_fc_submit_inode_data_all(journal); - if (ret) - return ret; + /* + * Step 1: Mark all inodes on s_fc_q[MAIN] with + * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being + * freed until the data flush is over. + */ + spin_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); + } + spin_unlock(&sbi->s_fc_lock); + + /* Step 2: Flush data for all the eligible inodes. */ + ret = ext4_fc_flush_data(journal); - ret = ext4_fc_wait_inode_data_all(journal); + /* + * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning + * any error from step 2. This ensures that waiters waiting on + * EXT4_STATE_FC_FLUSHING_DATA can resume. + */ + spin_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_clear_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); +#if (BITS_PER_LONG < 64) + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); +#else + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); +#endif + } + + /* + * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before + * the waiter checks the bit. Pairs with implicit barrier in + * prepare_to_wait() in ext4_fc_del(). + */ + smp_mb(); + spin_unlock(&sbi->s_fc_lock); + + /* + * If we encountered error in Step 2, return it now after clearing + * EXT4_STATE_FC_FLUSHING_DATA bit. + */ if (ret) return ret; + + /* Step 4: Mark all inodes as being committed. */ + jbd2_journal_lock_updates(journal); /* - * If file system device is different from journal device, issue a cache - * flush before we start writing fast commit blocks. + * The journal is now locked. No more handles can start and all the + * previous handles are now drained. We now mark the inodes on the + * commit queue as being committed. + */ + spin_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_COMMITTING); + } + spin_unlock(&sbi->s_fc_lock); + jbd2_journal_unlock_updates(journal); + + /* + * Step 5: If file system device is different from journal device, + * issue a cache flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); + /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* - * Add a head tag only if this is the first fast commit - * in this TID. + * Step 6.1: Add a head tag only if this is the first fast + * commit in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( @@ -1154,6 +1199,7 @@ static int ext4_fc_perform_commit(journal_t *journal) } } + /* Step 6.2: Now write all the dentry updates. */ spin_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) { @@ -1161,6 +1207,7 @@ static int ext4_fc_perform_commit(journal_t *journal) goto out; } + /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) @@ -1173,10 +1220,8 @@ static int ext4_fc_perform_commit(journal_t *journal) ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - + /* Step 6.4: Finally write tail tag to conclude this fast commit. */ ret = ext4_fc_write_tail(sb, crc); out: @@ -1298,7 +1343,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *iter, *iter_n; + struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) @@ -1308,13 +1353,15 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) jbd2_fc_release_bufs(journal); spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], - i_fc_list) { - list_del_init(&iter->i_fc_list); - ext4_clear_inode_state(&iter->vfs_inode, + while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { + ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], + struct ext4_inode_info, + i_fc_list); + list_del_init(&ei->i_fc_list); + ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - if (tid_geq(tid, iter->i_sync_tid)) { - ext4_fc_reset_inode(&iter->vfs_inode); + if (tid_geq(tid, ei->i_sync_tid)) { + ext4_fc_reset_inode(&ei->vfs_inode); } else if (full) { /* * We are called after a full commit, inode has been @@ -1325,15 +1372,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) * time in that case (and tid doesn't increase so * tid check above isn't reliable). */ - list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list, + list_add_tail(&ei->i_fc_list, &sbi->s_fc_q[FC_Q_STAGING]); } - /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with implicit + * barrier in prepare_to_wait() in ext4_fc_track_inode(). + */ smp_mb(); #if (BITS_PER_LONG < 64) - wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else - wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 743a1d7633cd..bfaa14bb1049 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -728,7 +728,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) } journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; write_unlock(&journal->j_state_lock); - jbd2_journal_lock_updates(journal); return 0; } @@ -742,7 +741,6 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { if (journal->j_fc_cleanup_callback) journal->j_fc_cleanup_callback(journal, 0, tid); - jbd2_journal_unlock_updates(journal); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) -- cgit v1.2.3-59-g8ed1b From ed45d331135c317c7f80e8c4e0dad644ca8ca9dc Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:04 +0000 Subject: ext4: drop i_fc_updates from inode fc info The new logic introduced in this series does not require tracking number of active handles open on an inode. So, drop it. Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250508175908.1004880-6-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 ---- fs/ext4/fast_commit.c | 68 --------------------------------------------------- 2 files changed, 73 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 493d9ac7a577..0cb34a06ee4c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1061,9 +1061,6 @@ struct ext4_inode_info { /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; - /* Number of ongoing updates on this inode */ - atomic_t i_fc_updates; - spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* Fast commit wait queue for this inode */ @@ -2926,8 +2923,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); -void ext4_fc_start_update(struct inode *inode); -void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index a2cb4d965dc1..f2e8a5f22260 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -202,32 +202,6 @@ void ext4_fc_init_inode(struct inode *inode) INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); - atomic_set(&ei->i_fc_updates, 0); -} - -/* This function must be called with sbi->s_fc_lock held. */ -static void ext4_fc_wait_committing_inode(struct inode *inode) -__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) -{ - wait_queue_head_t *wq; - struct ext4_inode_info *ei = EXT4_I(inode); - -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); } static bool ext4_fc_disabled(struct super_block *sb) @@ -236,48 +210,6 @@ static bool ext4_fc_disabled(struct super_block *sb) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); } -/* - * Inform Ext4's fast about start of an inode update - * - * This function is called by the high level call VFS callbacks before - * performing any inode update. This function blocks if there's an ongoing - * fast commit on the inode in question. - */ -void ext4_fc_start_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - -restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); - if (list_empty(&ei->i_fc_list)) - goto out; - - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; - } -out: - atomic_inc(&ei->i_fc_updates); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); -} - -/* - * Stop inode update and wake up waiting fast commits if any. - */ -void ext4_fc_stop_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (atomic_dec_and_test(&ei->i_fc_updates)) - wake_up_all(&ei->i_fc_wait); -} - /* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. -- cgit v1.2.3-59-g8ed1b From 69f35ca189300ddba29a16214159beef45bbd984 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:05 +0000 Subject: ext4: update code documentation This patch updates code documentation to reflect the commit path changes made in this series. Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara code docs Link: https://patch.msgid.link/20250508175908.1004880-7-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index f2e8a5f22260..06dda39326b4 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -50,19 +50,27 @@ * that need to be committed during a fast commit in another in memory queue of * inodes. During the commit operation, we commit in the following order: * - * [1] Lock inodes for any further data updates by setting COMMITTING state - * [2] Submit data buffers of all the inodes - * [3] Wait for [2] to complete - * [4] Commit all the directory entry updates in the fast commit space - * [5] Commit all the changed inode structures - * [6] Write tail tag (this tag ensures the atomicity, please read the following + * [1] Prepare all the inodes to write out their data by setting + * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be + * deleted while it is being flushed. + * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA" + * state. + * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that + * all the exsiting handles finish and no new handles can start. + * [4] Mark all the fast commit eligible inodes as undergoing fast commit + * by setting "EXT4_STATE_FC_COMMITTING" state. + * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows + * starting of new handles. If new handles try to start an update on + * any of the inodes that are being committed, ext4_fc_track_inode() + * will block until those inodes have finished the fast commit. + * [6] Commit all the directory entry updates in the fast commit space. + * [7] Commit all the changed inodes in the fast commit space and clear + * "EXT4_STATE_FC_COMMITTING" for these inodes. + * [8] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). - * [7] Wait for [4], [5] and [6] to complete. * - * All the inode updates must call ext4_fc_start_update() before starting an - * update. If such an ongoing update is present, fast commit waits for it to - * complete. The completion of such an update is marked by - * ext4_fc_stop_update(). + * All the inode updates must be enclosed within jbd2_jounrnal_start() + * and jbd2_journal_stop() similar to JBD2 journaling. * * Fast Commit Ineligibility * ------------------------- @@ -143,6 +151,13 @@ * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * + * Locking + * ------- + * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit + * dentry queue. ei->i_fc_lock protects the fast commit related info in a given + * inode. Most of the code avoids acquiring both the locks, but if one must do + * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock. + * * TODOs * ----- * @@ -157,13 +172,12 @@ * fast commit recovery even if that area is invalidated by later full * commits. * - * 1) Fast commit's commit path locks the entire file system during fast - * commit. This has significant performance penalty. Instead of that, we - * should use ext4_fc_start/stop_update functions to start inode level - * updates from ext4_journal_start/stop. Once we do that we can drop file - * system locking during commit path. + * 1) Handle more ineligible cases. * - * 2) Handle more ineligible cases. + * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent + * status tree. This would get rid of the need to call ext4_fc_track_inode() + * before acquiring i_data_sem. To do that we would need to ensure that + * modified extents from the extent status tree are not evicted from memory. */ #include -- cgit v1.2.3-59-g8ed1b From 86e07d4b9b0497afef78af773c74258c8f63030f Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:06 +0000 Subject: ext4: temporarily elevate commit thread priority Unlike JBD2 based full commits, there is no dedicated journal thread for fast commits. Thus to reduce scheduling delays between IO submission and completion, temporarily elevate the committer thread's priority to match the configured priority of the JBD2 journal thread. Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250508175908.1004880-8-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +++- fs/ext4/fast_commit.c | 13 +++++++++++++ fs/ext4/super.c | 5 ++--- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0cb34a06ee4c..3987c5bf2ff9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2296,10 +2296,12 @@ static inline int ext4_emergency_state(struct super_block *sb) #define EXT4_DEFM_NODELALLOC 0x0800 /* - * Default journal batch times + * Default journal batch times and ioprio. */ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ +#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + /* * Default values for superblock update diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 06dda39326b4..5f6a8ec249b9 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1216,6 +1216,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; + int old_ioprio, journal_ioprio; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); @@ -1223,6 +1224,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); + old_ioprio = get_current_ioprio(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); @@ -1253,6 +1255,15 @@ restart_fc: goto fallback; } + /* + * Now that we know that this thread is going to do a fast commit, + * elevate the priority to match that of the journal thread. + */ + if (journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + else + journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; + set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { @@ -1267,6 +1278,7 @@ restart_fc: } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); + set_task_ioprio(current, old_ioprio); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time @@ -1276,6 +1288,7 @@ restart_fc: return ret; fallback: + set_task_ioprio(current, old_ioprio); ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ed8166fe2ad0..356a96269a21 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1809,7 +1809,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { {} }; -#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 @@ -5255,7 +5254,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = @@ -6495,7 +6494,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; } -- cgit v1.2.3-59-g8ed1b From 12e64e7f859ed19c5bb497866284d0318c3194a2 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:07 +0000 Subject: ext4: convert s_fc_lock to mutex type This allows us to hold s_fc_lock during kmem_cache_* functions, which is needed in the following patch. Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250508175908.1004880-9-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/fast_commit.c | 60 +++++++++++++++++++++++++-------------------------- fs/ext4/super.c | 2 +- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3987c5bf2ff9..052d7afeefaf 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1754,7 +1754,7 @@ struct ext4_sb_info { * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. */ - spinlock_t s_fc_lock; + struct mutex s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; tid_t s_fc_ineligible_tid; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 5f6a8ec249b9..eb888e52261f 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -238,9 +238,9 @@ void ext4_fc_del(struct inode *inode) if (ext4_fc_disabled(inode->i_sb)) return; - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } @@ -275,9 +275,9 @@ void ext4_fc_del(struct inode *inode) #endif prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); schedule(); - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); } finish_wait(wq, &wait.wq_entry); } @@ -288,7 +288,7 @@ void ext4_fc_del(struct inode *inode) * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } @@ -298,7 +298,7 @@ void ext4_fc_del(struct inode *inode) list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); @@ -329,12 +329,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -373,14 +373,14 @@ static int ext4_fc_track_template( if (!enqueue) return ret; - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return ret; } @@ -424,7 +424,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, @@ -445,7 +445,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); spin_lock(&ei->i_fc_lock); return 0; @@ -1000,12 +1000,12 @@ __releases(&sbi->s_fc_lock) list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); continue; } /* @@ -1018,7 +1018,7 @@ __releases(&sbi->s_fc_lock) inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); /* * We first write the inode and then the create dirent. This @@ -1040,11 +1040,11 @@ __releases(&sbi->s_fc_lock) goto lock_and_exit; } - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); } return 0; lock_and_exit: - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); return ret; } @@ -1064,12 +1064,12 @@ static int ext4_fc_perform_commit(journal_t *journal) * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being * freed until the data flush is over. */ - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); } - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); /* Step 2: Flush data for all the eligible inodes. */ ret = ext4_fc_flush_data(journal); @@ -1079,7 +1079,7 @@ static int ext4_fc_perform_commit(journal_t *journal) * any error from step 2. This ensures that waiters waiting on * EXT4_STATE_FC_FLUSHING_DATA can resume. */ - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); @@ -1096,7 +1096,7 @@ static int ext4_fc_perform_commit(journal_t *journal) * prepare_to_wait() in ext4_fc_del(). */ smp_mb(); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); /* * If we encountered error in Step 2, return it now after clearing @@ -1113,12 +1113,12 @@ static int ext4_fc_perform_commit(journal_t *journal) * previous handles are now drained. We now mark the inodes on the * commit queue as being committed. */ - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); } - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); jbd2_journal_unlock_updates(journal); /* @@ -1146,10 +1146,10 @@ static int ext4_fc_perform_commit(journal_t *journal) } /* Step 6.2: Now write all the dentry updates. */ - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); goto out; } @@ -1159,7 +1159,7 @@ static int ext4_fc_perform_commit(journal_t *journal) if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; @@ -1311,7 +1311,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], struct ext4_inode_info, @@ -1353,11 +1353,11 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], @@ -1372,7 +1372,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) if (full) sbi->s_fc_bytes = 0; - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 356a96269a21..5bd81dd9751c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4481,7 +4481,7 @@ static void ext4_fast_commit_init(struct super_block *sb) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; - spin_lock_init(&sbi->s_fc_lock); + mutex_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; -- cgit v1.2.3-59-g8ed1b From 6593714d67bab860a733d07895a94404f4ac3039 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 8 May 2025 17:59:08 +0000 Subject: ext4: hold s_fc_lock while during fast commit Leaving s_fc_lock in between during commit in ext4_fc_perform_commit() function leaves room for subtle concurrency bugs where ext4_fc_del() may delete an inode from the fast commit list, leaving list in an inconsistent state. Signed-off-by: Harshad Shirwadkar Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250508175908.1004880-10-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 44 +++++++++++++------------------------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index eb888e52261f..7ac672e35f08 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -424,6 +424,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); + INIT_LIST_HEAD(&node->fcd_list); mutex_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) @@ -985,8 +986,6 @@ static int ext4_fc_flush_data(journal_t *journal) /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) -__acquires(&sbi->s_fc_lock) -__releases(&sbi->s_fc_lock) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1000,26 +999,22 @@ __releases(&sbi->s_fc_lock) list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { - mutex_unlock(&sbi->s_fc_lock); - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - mutex_lock(&sbi->s_fc_lock); + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the - * corresponding inode pointer + * corresponding inode. Also, the corresponding inode could have been + * deleted, in which case, we don't need to do anything. */ - WARN_ON(list_empty(&fc_dentry->fcd_dilist)); + if (list_empty(&fc_dentry->fcd_dilist)) + continue; ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); - mutex_unlock(&sbi->s_fc_lock); - /* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first @@ -1029,23 +1024,14 @@ __releases(&sbi->s_fc_lock) */ ret = ext4_fc_write_inode(inode, crc); if (ret) - goto lock_and_exit; - + return ret; ret = ext4_fc_write_inode_data(inode, crc); if (ret) - goto lock_and_exit; - - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - - mutex_lock(&sbi->s_fc_lock); + return ret; + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; } return 0; -lock_and_exit: - mutex_lock(&sbi->s_fc_lock); - return ret; } static int ext4_fc_perform_commit(journal_t *journal) @@ -1148,10 +1134,8 @@ static int ext4_fc_perform_commit(journal_t *journal) /* Step 6.2: Now write all the dentry updates. */ mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); - if (ret) { - mutex_unlock(&sbi->s_fc_lock); + if (ret) goto out; - } /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { @@ -1159,7 +1143,6 @@ static int ext4_fc_perform_commit(journal_t *journal) if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; - mutex_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; @@ -1171,6 +1154,7 @@ static int ext4_fc_perform_commit(journal_t *journal) ret = ext4_fc_write_tail(sb, crc); out: + mutex_unlock(&sbi->s_fc_lock); blk_finish_plug(&plug); return ret; } @@ -1353,11 +1337,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); - mutex_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - mutex_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], -- cgit v1.2.3-59-g8ed1b From 227cb4ca5a6502164f850d22aec3104d7888b270 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 15 Apr 2025 11:53:04 -0300 Subject: ext4: inline: fix len overflow in ext4_prepare_inline_data When running the following code on an ext4 filesystem with inline_data feature enabled, it will lead to the bug below. fd = open("file1", O_RDWR | O_CREAT | O_TRUNC, 0666); ftruncate(fd, 30); pwrite(fd, "a", 1, (1UL << 40) + 5UL); That happens because write_begin will succeed as when ext4_generic_write_inline_data calls ext4_prepare_inline_data, pos + len will be truncated, leading to ext4_prepare_inline_data parameter to be 6 instead of 0x10000000006. Then, later when write_end is called, we hit: BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); at ext4_write_inline_data. Fix it by using a loff_t type for the len parameter in ext4_prepare_inline_data instead of an unsigned int. [ 44.545164] ------------[ cut here ]------------ [ 44.545530] kernel BUG at fs/ext4/inline.c:240! [ 44.545834] Oops: invalid opcode: 0000 [#1] SMP NOPTI [ 44.546172] CPU: 3 UID: 0 PID: 343 Comm: test Not tainted 6.15.0-rc2-00003-g9080916f4863 #45 PREEMPT(full) 112853fcebfdb93254270a7959841d2c6aa2c8bb [ 44.546523] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 44.546523] RIP: 0010:ext4_write_inline_data+0xfe/0x100 [ 44.546523] Code: 3c 0e 48 83 c7 48 48 89 de 5b 41 5c 41 5d 41 5e 41 5f 5d e9 e4 fa 43 01 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc cc 0f 0b <0f> 0b 0f 1f 44 00 00 55 41 57 41 56 41 55 41 54 53 48 83 ec 20 49 [ 44.546523] RSP: 0018:ffffb342008b79a8 EFLAGS: 00010216 [ 44.546523] RAX: 0000000000000001 RBX: ffff9329c579c000 RCX: 0000010000000006 [ 44.546523] RDX: 000000000000003c RSI: ffffb342008b79f0 RDI: ffff9329c158e738 [ 44.546523] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000 [ 44.546523] R10: 00007ffffffff000 R11: ffffffff9bd0d910 R12: 0000006210000000 [ 44.546523] R13: fffffc7e4015e700 R14: 0000010000000005 R15: ffff9329c158e738 [ 44.546523] FS: 00007f4299934740(0000) GS:ffff932a60179000(0000) knlGS:0000000000000000 [ 44.546523] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 44.546523] CR2: 00007f4299a1ec90 CR3: 0000000002886002 CR4: 0000000000770eb0 [ 44.546523] PKRU: 55555554 [ 44.546523] Call Trace: [ 44.546523] [ 44.546523] ext4_write_inline_data_end+0x126/0x2d0 [ 44.546523] generic_perform_write+0x17e/0x270 [ 44.546523] ext4_buffered_write_iter+0xc8/0x170 [ 44.546523] vfs_write+0x2be/0x3e0 [ 44.546523] __x64_sys_pwrite64+0x6d/0xc0 [ 44.546523] do_syscall_64+0x6a/0xf0 [ 44.546523] ? __wake_up+0x89/0xb0 [ 44.546523] ? xas_find+0x72/0x1c0 [ 44.546523] ? next_uptodate_folio+0x317/0x330 [ 44.546523] ? set_pte_range+0x1a6/0x270 [ 44.546523] ? filemap_map_pages+0x6ee/0x840 [ 44.546523] ? ext4_setattr+0x2fa/0x750 [ 44.546523] ? do_pte_missing+0x128/0xf70 [ 44.546523] ? security_inode_post_setattr+0x3e/0xd0 [ 44.546523] ? ___pte_offset_map+0x19/0x100 [ 44.546523] ? handle_mm_fault+0x721/0xa10 [ 44.546523] ? do_user_addr_fault+0x197/0x730 [ 44.546523] ? do_syscall_64+0x76/0xf0 [ 44.546523] ? arch_exit_to_user_mode_prepare+0x1e/0x60 [ 44.546523] ? irqentry_exit_to_user_mode+0x79/0x90 [ 44.546523] entry_SYSCALL_64_after_hwframe+0x55/0x5d [ 44.546523] RIP: 0033:0x7f42999c6687 [ 44.546523] Code: 48 89 fa 4c 89 df e8 58 b3 00 00 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 1a 5b c3 0f 1f 84 00 00 00 00 00 48 8b 44 24 10 0f 05 <5b> c3 0f 1f 80 00 00 00 00 83 e2 39 83 fa 08 75 de e8 23 ff ff ff [ 44.546523] RSP: 002b:00007ffeae4a7930 EFLAGS: 00000202 ORIG_RAX: 0000000000000012 [ 44.546523] RAX: ffffffffffffffda RBX: 00007f4299934740 RCX: 00007f42999c6687 [ 44.546523] RDX: 0000000000000001 RSI: 000055ea6149200f RDI: 0000000000000003 [ 44.546523] RBP: 00007ffeae4a79a0 R08: 0000000000000000 R09: 0000000000000000 [ 44.546523] R10: 0000010000000005 R11: 0000000000000202 R12: 0000000000000000 [ 44.546523] R13: 00007ffeae4a7ac8 R14: 00007f4299b86000 R15: 000055ea61493dd8 [ 44.546523] [ 44.546523] Modules linked in: [ 44.568501] ---[ end trace 0000000000000000 ]--- [ 44.568889] RIP: 0010:ext4_write_inline_data+0xfe/0x100 [ 44.569328] Code: 3c 0e 48 83 c7 48 48 89 de 5b 41 5c 41 5d 41 5e 41 5f 5d e9 e4 fa 43 01 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc cc 0f 0b <0f> 0b 0f 1f 44 00 00 55 41 57 41 56 41 55 41 54 53 48 83 ec 20 49 [ 44.570931] RSP: 0018:ffffb342008b79a8 EFLAGS: 00010216 [ 44.571356] RAX: 0000000000000001 RBX: ffff9329c579c000 RCX: 0000010000000006 [ 44.571959] RDX: 000000000000003c RSI: ffffb342008b79f0 RDI: ffff9329c158e738 [ 44.572571] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000 [ 44.573148] R10: 00007ffffffff000 R11: ffffffff9bd0d910 R12: 0000006210000000 [ 44.573748] R13: fffffc7e4015e700 R14: 0000010000000005 R15: ffff9329c158e738 [ 44.574335] FS: 00007f4299934740(0000) GS:ffff932a60179000(0000) knlGS:0000000000000000 [ 44.575027] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 44.575520] CR2: 00007f4299a1ec90 CR3: 0000000002886002 CR4: 0000000000770eb0 [ 44.576112] PKRU: 55555554 [ 44.576338] Kernel panic - not syncing: Fatal exception [ 44.576517] Kernel Offset: 0x1a600000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) Reported-by: syzbot+fe2a25dae02a207717a0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=fe2a25dae02a207717a0 Fixes: f19d5870cbf7 ("ext4: add normal write support for inline data") Signed-off-by: Thadeu Lima de Souza Cascardo Cc: stable@vger.kernel.org Reviewed-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://patch.msgid.link/20250415-ext4-prepare-inline-overflow-v1-1-f4c13d900967@igalia.com Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index fbc1c84b555c..a1bbcdf40824 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -397,7 +397,7 @@ out: } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) + loff_t len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); -- cgit v1.2.3-59-g8ed1b From 53ce42accd2002cc490fc86000ac532530507a74 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 23 Apr 2025 16:52:49 +0800 Subject: ext4: ext4: unify EXT4_EX_NOCACHE|NOFAIL flags in ext4_ext_remove_space() When removing space, we should use EXT4_EX_NOCACHE because we don't need to cache extents, and we should also use EXT4_EX_NOFAIL to prevent metadata inconsistencies that may arise from memory allocation failures. While ext4_ext_remove_space() already uses these two flags in most places, they are missing in ext4_ext_search_right() and read_extent_tree_block() calls. Unify the flags to ensure consistent behavior throughout the extent removal process. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250423085257.122685-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c616a16a9f36..d8eac736cc9a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1530,7 +1530,7 @@ static int ext4_ext_search_left(struct inode *inode, static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, - struct ext4_extent *ret_ex) + struct ext4_extent *ret_ex, int flags) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1604,7 +1604,8 @@ got_index: ix++; while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, + flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -1612,7 +1613,7 @@ got_index: put_bh(bh); } - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -2821,6 +2822,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; + int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL; partial.pclu = 0; partial.lblk = 0; @@ -2851,8 +2853,7 @@ again: ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ - path = ext4_find_extent(inode, end, NULL, - EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); + path = ext4_find_extent(inode, end, NULL, flags); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2918,7 +2919,7 @@ again: */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, - NULL); + NULL, flags); if (err < 0) goto out; if (pblk) { @@ -2994,8 +2995,7 @@ again: i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, path[i].p_idx, - depth - i - 1, - EXT4_EX_NOCACHE); + depth - i - 1, flags); if (IS_ERR(bh)) { /* should we reset i_size? */ err = PTR_ERR(bh); @@ -4314,7 +4314,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out; ar.lright = map->m_lblk; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, + &ex2, 0); if (err < 0) goto out; -- cgit v1.2.3-59-g8ed1b From 86b349ce0312a397a6961e457108556e44a3d211 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 23 Apr 2025 16:52:50 +0800 Subject: ext4: generalize EXT4_GET_BLOCKS_IO_SUBMIT flag usage Currently, the EXT4_GET_BLOCKS_IO_SUBMIT flag is only used during data writeback to indicate that in ordered mode, the journal commit thread should skip re-submitting data and simply wait for I/O completion. To prepare for later patches that need to detect I/O submission context in ext4_map_blocks(), generalizes the meaning of EXT4_GET_BLOCKS_IO_SUBMIT. This flag will be set during: 1) data I/O writeback, 2) I/O completion extents conversion, 3) journal performing commit in fast_commit. This change doesn't affect current usage of this flag and provides a clear way to identify I/O submission context. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250423085257.122685-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 13 ++++++++----- fs/ext4/fast_commit.c | 3 ++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 052d7afeefaf..84cbe8024f1e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -706,9 +706,6 @@ enum { #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) - /* Convert extent to initialized after IO complete */ -#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 @@ -720,9 +717,15 @@ enum { #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) - /* Caller will submit data before dropping transaction handle. This - * allows jbd2 to avoid submitting data before commit. */ + /* Caller is in the context of data submission, such as writeback, + * fsync, etc. Especially, in the generic writeback path, caller will + * submit data before dropping transaction handle. This allows jbd2 + * to avoid submitting data before commit. */ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\ + EXT4_GET_BLOCKS_IO_SUBMIT) /* Caller is in the atomic contex, find extent if it has been cached */ #define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 7ac672e35f08..bfe5b3c40078 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -917,7 +917,8 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); + ret = ext4_map_blocks(NULL, inode, &map, + EXT4_GET_BLOCKS_IO_SUBMIT); if (ret < 0) return -ECANCELED; -- cgit v1.2.3-59-g8ed1b From 402e38e6b71f5739119ca3107f375e112d63c7c5 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 23 Apr 2025 16:52:51 +0800 Subject: ext4: prevent stale extent cache entries caused by concurrent I/O writeback Currently, in the I/O writeback path, ext4_map_blocks() may attempt to cache additional unrelated extents in the extent status tree without holding the inode's i_rwsem and the mapping's invalidate_lock. This can lead to stale extent status entries remaining in certain scenarios, potentially causing data corruption. For example, when performing a collapse range in ext4_collapse_range(), it clears the extent cache and dirty pages before removing blocks and shifting extents. It also holds the i_data_sem during these two operations. However, both ext4_ext_remove_space() and ext4_ext_shift_extents() may briefly release the i_data_sem if journal credits are insufficient (ext4_datasem_ensure_credits()). If another writeback process writes dirty pages from other regions during this interval, it may cache extents that are about to be modified. Unless ext4_collapse_range() explicitly clears the extent cache again, these cached entries can become stale and inconsistent with the actual extents. 0 a n b c m | | | | | | [www][wwwwww][wwwwwwww]...[wwwww][wwww]... | | N M Assume that block a is dirty. The collapse range operation is removing data from n to m and drops i_data_sem immediately after removing the extent from b to c. At the same time, a concurrent writeback begins to write back block a; it will reloads the extent from [n, b) into the extent status tree since it does not hold the i_rwsem or the invalidate_lock. After the collapse range operation, it left the stale extent [n, b), which points logical block n to N, but the actual physical block of n should be M. Similarly, both ext4_insert_range() and ext4_truncate() have the same problem. ext4_punch_hole() survived since it re-add a hole extent entry after removing space since commit 9f1118223aa0 ("ext4: add a hole extent entry in cache after punch"). In most cases, during dirty page writeback, the block mapping information is likely to be found in the extent cache, making it less necessary to search for physical extents. Consequently, loading unrelated extent caches during writeback appears to be ineffective. Therefore, fix this by adds EXT4_EX_NOCACHE in the writeback path to prevent caching of unrelated extents, eliminating this potential source of corruption. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250423085257.122685-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 12 +++++++++--- fs/ext4/fast_commit.c | 3 ++- fs/ext4/inode.c | 28 ++++++++++++++++++++-------- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 84cbe8024f1e..3e724def1b06 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -741,6 +741,7 @@ enum { #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 #define EXT4_EX_NOFAIL 0x10000000 +#define EXT4_EX_FILTER 0x70000000 /* * Flags used by ext4_free_blocks diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d8eac736cc9a..8a5724b2dc51 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4202,7 +4202,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_find_extent(inode, map->m_lblk, NULL, 0); + path = ext4_find_extent(inode, map->m_lblk, NULL, flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -4315,7 +4315,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out; ar.lright = map->m_lblk; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, - &ex2, 0); + &ex2, flags); if (err < 0) goto out; @@ -4820,8 +4820,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, break; } } + /* + * Do not cache any unrelated extents, as it does not hold the + * i_rwsem or invalidate_lock, which could corrupt the extent + * status tree. + */ ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_IO_CONVERT_EXT); + EXT4_GET_BLOCKS_IO_CONVERT_EXT | + EXT4_EX_NOCACHE); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index bfe5b3c40078..1392241de5e6 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -918,7 +918,8 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; ret = ext4_map_blocks(NULL, inode, &map, - EXT4_GET_BLOCKS_IO_SUBMIT); + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE); if (ret < 0) return -ECANCELED; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3005053e92a7..8c0d6fa58f26 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -463,15 +463,16 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, #endif /* ES_AGGRESSIVE_TEST */ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map) + struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; + flags &= EXT4_EX_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(handle, inode, map, 0); + retval = ext4_ext_map_blocks(handle, inode, map, flags); else - retval = ext4_ind_map_blocks(handle, inode, map, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval <= 0) return retval; @@ -622,6 +623,13 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; + /* + * Do not allow caching of unrelated ranges of extents during I/O + * submission. + */ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); + /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { @@ -667,7 +675,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - retval = ext4_map_query_blocks(handle, inode, map); + retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: @@ -1807,7 +1815,7 @@ found: if (ext4_has_inline_data(inode)) retval = 0; else - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; @@ -1830,7 +1838,7 @@ add_delayed: goto found; } } else if (!ext4_has_inline_data(inode)) { - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? retval : 0; @@ -2214,11 +2222,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if - * possible. + * possible. In addition, do not cache any unrelated extents, as it + * only holds the folio lock but does not hold the i_rwsem or + * invalidate_lock, which could corrupt the extent status tree. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | - EXT4_GET_BLOCKS_IO_SUBMIT; + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE; + dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; -- cgit v1.2.3-59-g8ed1b From 151ff9325e5e17c97839a00b740726656b04647b Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 23 Apr 2025 16:52:52 +0800 Subject: ext4: prevent stale extent cache entries caused by concurrent fiemap The ext4_fiemap() currently invokes ext4_ext_precache() and iomap_fiemap() to preload the extent cache and query mapping information without holding the inode's i_rwsem. This can result in stale extent cache entries when competing with operations such as ext4_collapse_range() which calls ext4_ext_remove_space() or ext4_ext_shift_extents(). The problem arises when ext4_ext_remove_space() temporarily releases i_data_sem due to insufficient journal credits. During this interval, a concurrent ext4_fiemap() may cache extent entries that are about to be deleted. As a result, these cached entries become stale and inconsistent with the actual extents. Loading the extents cache without holding the inode's i_rwsem or the mapping's invalidate_lock is not permitted besides during the writeback. Fix this by holding the i_rwsem in ext4_fiemap(). Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250423085257.122685-5-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8a5724b2dc51..3adf05fbdd59 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4963,10 +4963,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { int error = 0; + inode_lock_shared(inode); if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) - return error; + goto unlock; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } @@ -4977,15 +4978,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ error = ext4_fiemap_check_ranges(inode, start, &len); if (error) - return error; + goto unlock; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; - return iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_xattr_ops); + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); + } else { + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_report_ops); } - - return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); +unlock: + inode_unlock_shared(inode); + return error; } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, -- cgit v1.2.3-59-g8ed1b From f22a0ef2231a7d8374bb021eb86404d0e9de5a02 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 23 Apr 2025 16:52:53 +0800 Subject: ext4: prevent stale extent cache entries caused by concurrent get es_cache The EXT4_IOC_GET_ES_CACHE and EXT4_IOC_PRECACHE_EXTENTS currently invokes ext4_ext_precache() to preload the extent cache without holding the inode's i_rwsem. This can result in stale extent cache entries when competing with operations such as ext4_collapse_range() which calls ext4_ext_remove_space() or ext4_ext_shift_extents(). The problem arises when ext4_ext_remove_space() temporarily releases i_data_sem due to insufficient journal credits. During this interval, a concurrent EXT4_IOC_GET_ES_CACHE or EXT4_IOC_PRECACHE_EXTENTS may cache extent entries that are about to be deleted. As a result, these cached entries become stale and inconsistent with the actual extents. Loading the extents cache without holding the inode's i_rwsem or the mapping's invalidate_lock is not permitted besides during the writeback. Fix this by holding the i_rwsem during EXT4_IOC_GET_ES_CACHE and EXT4_IOC_PRECACHE_EXTENTS. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250423085257.122685-6-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 2 ++ fs/ext4/ioctl.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3adf05fbdd59..b5eb89ef7ae2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5011,7 +5011,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + inode_lock_shared(inode); error = ext4_ext_precache(inode); + inode_unlock_shared(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d17207386ead..0e240013c84d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1505,8 +1505,14 @@ resizefs_out: return 0; } case EXT4_IOC_PRECACHE_EXTENTS: - return ext4_ext_precache(inode); + { + int ret; + inode_lock_shared(inode); + ret = ext4_ext_precache(inode); + inode_unlock_shared(inode); + return ret; + } case FS_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; -- cgit v1.2.3-59-g8ed1b From 0b8e0bd45007d5740391e658c2581bd614207387 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 23 Apr 2025 16:52:54 +0800 Subject: ext4: factor out is_special_ino() Factor out the helper is_special_ino() to facilitate the checking of special inodes in the subsequent patches. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250423085257.122685-7-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 11 +++++++++++ fs/ext4/inode.c | 7 +------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3e724def1b06..218a1e61547a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3055,6 +3055,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode, extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +static inline bool is_special_ino(struct super_block *sb, unsigned long ino) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || + ino == le32_to_cpu(es->s_usr_quota_inum) || + ino == le32_to_cpu(es->s_grp_quota_inum) || + ino == le32_to_cpu(es->s_prj_quota_inum) || + ino == le32_to_cpu(es->s_orphan_file_inum); +} + /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8c0d6fa58f26..64cadc46e420 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4797,12 +4797,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, gid_t i_gid; projid_t i_projid; - if ((!(flags & EXT4_IGET_SPECIAL) && - ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || - ino == le32_to_cpu(es->s_usr_quota_inum) || - ino == le32_to_cpu(es->s_grp_quota_inum) || - ino == le32_to_cpu(es->s_prj_quota_inum) || - ino == le32_to_cpu(es->s_orphan_file_inum))) || + if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) -- cgit v1.2.3-59-g8ed1b From 7871da20d484d5c7e536bfd52845b6be4488ff30 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 23 Apr 2025 16:52:55 +0800 Subject: ext4: introduce ext4_check_map_extents_env() debug helper Loading and modifying the extents tree and extent status tree without holding the inode's i_rwsem or the mapping's invalidate_lock is not permitted, except during the I/O writeback. Add a new debug helper ext4_check_map_extents_env(), it will verify whether the extent loading/modifying context is safe. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250423085257.122685-8-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 218a1e61547a..3f352f2a6d85 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2978,6 +2978,7 @@ static inline bool ext4_mb_cr_expensive(enum criteria cr) void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); +void ext4_check_map_extents_env(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 64cadc46e420..5b591a47ff84 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -416,6 +416,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, return ret; } +/* + * For generic regular files, when updating the extent tree, Ext4 should + * hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against concurrent page faults, as well as reads and writes. + */ +#ifdef CONFIG_EXT4_DEBUG +void ext4_check_map_extents_env(struct inode *inode) +{ + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + if (!S_ISREG(inode->i_mode) || + IS_NOQUOTA(inode) || IS_VERITY(inode) || + is_special_ino(inode->i_sb, inode->i_ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + ext4_verity_in_progress(inode)) + return; + + WARN_ON_ONCE(!inode_is_locked(inode) && + !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); +} +#else +void ext4_check_map_extents_env(struct inode *inode) {} +#endif + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) -- cgit v1.2.3-59-g8ed1b From 1b4d2a0b794669e54914e3f429c08e49ea40b40c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 23 Apr 2025 16:52:56 +0800 Subject: ext4: check env when mapping and modifying extents Add ext4_check_map_extents_env() to the places where loading extents, mapping blocks, removing blocks, and modifying extents, excluding the I/O writeback context. This function will verify whether the locking mechanisms in place are adequate. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250423085257.122685-9-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 6 ++++++ fs/ext4/inode.c | 13 ++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b5eb89ef7ae2..52bfc042bf4e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -611,6 +611,8 @@ int ext4_ext_precache(struct inode *inode) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ + ext4_check_map_extents_env(inode); + down_read(&ei->i_data_sem); depth = ext_depth(inode); @@ -5342,6 +5344,8 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) start_lblk = offset >> inode->i_blkbits; end_lblk = (offset + len) >> inode->i_blkbits; + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); @@ -5443,6 +5447,8 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) start_lblk = offset >> inode->i_blkbits; len_lblk = len >> inode->i_blkbits; + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5b591a47ff84..9b0bfbbdb50b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -650,11 +650,14 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, return -EFSCORRUPTED; /* - * Do not allow caching of unrelated ranges of extents during I/O - * submission. + * Callers from the context of data submission are the only exceptions + * for regular files that do not hold the i_rwsem or invalidate_lock. + * However, caching unrelated ranges is not permitted. */ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); + else + ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && @@ -1801,6 +1804,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, @@ -4113,6 +4118,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) ext4_lblk_t hole_len = end_lblk - start_lblk; ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); @@ -4266,8 +4272,9 @@ int ext4_truncate(struct inode *inode) goto out_stop; ext4_fc_track_inode(handle, inode); - down_write(&EXT4_I(inode)->i_data_sem); + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) -- cgit v1.2.3-59-g8ed1b From 24b7a2331fcdf6de103ea85e67eede43c0372f77 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 23 Apr 2025 16:52:57 +0800 Subject: ext4: clairfy the rules for modifying extents Add a comment at the beginning of extents_status.c to clarify the rules for loading, mapping, modifying, and removing extents and blocks. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250423085257.122685-10-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d1401d4a5513..31dc0496f8d0 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -120,9 +120,40 @@ * memory. Hence, we will reclaim written/unwritten/hole extents from * the tree under a heavy memory pressure. * + * ========================================================================== + * 3. Assurance of Ext4 extent status tree consistency + * + * When mapping blocks, Ext4 queries the extent status tree first and should + * always trusts that the extent status tree is consistent and up to date. + * Therefore, it is important to adheres to the following rules when createing, + * modifying and removing extents. + * + * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings, + * the extent information should always be processed through the extent + * status tree instead of being organized manually through the on-disk + * extent tree. + * + * 2. When updating the extent tree, Ext4 should acquire the i_data_sem + * exclusively and update the extent status tree atomically. If the extents + * to be modified are large enough to exceed the range that a single + * i_data_sem can process (as ext4_datasem_ensure_credits() may drop + * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole() + * does): + * + * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against page faults, as well as reads and writes that may + * concurrently modify the extent status tree. + * b) Evict all page cache in the affected range and recommend rebuilding + * or dropping the extent status tree after modifying the on-disk + * extent tree. This ensures exclusion against concurrent writebacks + * that do not hold those locks but only holds a folio lock. + * + * 3. Based on the rules above, when querying block mappings, Ext4 should at + * least hold the i_rwsem or invalidate_lock or folio lock(s) for the + * specified querying range. * * ========================================================================== - * 3. Performance analysis + * 4. Performance analysis * * -- overhead * 1. There is a cache extent for write access, so if writes are @@ -134,7 +165,7 @@ * * * ========================================================================== - * 4. TODO list + * 5. TODO list * * -- Refactor delayed space reservation * -- cgit v1.2.3-59-g8ed1b From d612a07931e261c537978aad096e7340b687cd0c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 23 Apr 2025 18:43:49 +0200 Subject: ext4: avoid -Wformat-security warning check_igot_inode() prints a variable string, which causes a harmless warning with 'make W=1': fs/ext4/inode.c:4763:45: error: format string is not a string literal (potentially insecure) [-Werror,-Wformat-security] 4763 | ext4_error_inode(inode, function, line, 0, err_str); Use a trivial "%s" format string instead. Signed-off-by: Arnd Bergmann Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250423164354.2780635-1-arnd@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b0bfbbdb50b..7237b6326722 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4809,7 +4809,7 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, return 0; error: - ext4_error_inode(inode, function, line, 0, err_str); + ext4_error_inode(inode, function, line, 0, "%s", err_str); return -EFSCORRUPTED; } -- cgit v1.2.3-59-g8ed1b From 32a93f5bc9b9812fc710f43a4d8a6830f91e4988 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 29 Apr 2025 19:55:36 +0200 Subject: ext4: fix calculation of credits for extent tree modification Luis and David are reporting that after running generic/750 test for 90+ hours on 2k ext4 filesystem, they are able to trigger a warning in jbd2_journal_dirty_metadata() complaining that there are not enough credits in the running transaction started in ext4_do_writepages(). Indeed the code in ext4_do_writepages() is racy and the extent tree can change between the time we compute credits necessary for extent tree computation and the time we actually modify the extent tree. Thus it may happen that the number of credits actually needed is higher. Modify ext4_ext_index_trans_blocks() to count with the worst case of maximum tree depth. This can reduce the possible number of writers that can operate in the system in parallel (because the credit estimates now won't fit in one transaction) but for reasonably sized journals this shouldn't really be an issue. So just go with a safe and simple fix. Link: https://lore.kernel.org/all/20250415013641.f2ppw6wov4kn4wq2@offworld Reported-by: Davidlohr Bueso Reported-by: Luis Chamberlain Tested-by: kdevops@lists.linux.dev Signed-off-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20250429175535.23125-2-jack@suse.cz Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 52bfc042bf4e..9053fe68ee4c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2399,18 +2399,19 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; - int depth; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; - depth = ext_depth(inode); - + /* + * Extent tree can change between the time we estimate credits and + * the time we actually modify the tree. Assume the worst case. + */ if (extents <= 1) - index = depth * 2; + index = EXT4_MAX_EXTENT_DEPTH * 2; else - index = depth * 3; + index = EXT4_MAX_EXTENT_DEPTH * 3; return index; } -- cgit v1.2.3-59-g8ed1b From e80325ef5cc2d9de13845ab32c50d5012357d42b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 May 2025 11:16:04 +0200 Subject: ext4: use writeback_iter in ext4_journalled_submit_inode_data_buffers Use writeback_iter directly instead of write_cache_pages for a nicer code structure and less indirect calls. Signed-off-by: Christoph Hellwig Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20250505091604.3449879-1-hch@lst.de Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5bd81dd9751c..7c8fdb311bad 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -508,21 +508,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) ext4_maybe_update_superblock(sb); } -/* - * This writepage callback for write_cache_pages() - * takes care of a few cases after page cleaning. - * - * write_cache_pages() already checks for dirty pages - * and calls clear_page_dirty_for_io(), which we want, - * to write protect the pages. - * - * However, we may have to redirty a page (see below.) - */ -static int ext4_journalled_writepage_callback(struct folio *folio, - struct writeback_control *wbc, - void *data) +static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, + struct folio *folio) { - transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; @@ -543,15 +531,12 @@ static int ext4_journalled_writepage_callback(struct folio *folio, */ jh = bh2jh(bh); if (buffer_dirty(bh) || - (jh && (jh->b_transaction != transaction || - jh->b_next_transaction))) { - folio_redirty_for_writepage(wbc, folio); - goto out; - } + (jh && (jh->b_transaction != jinode->i_transaction || + jh->b_next_transaction))) + return true; } while ((bh = bh->b_this_page) != head); -out: - return AOP_WRITEPAGE_ACTIVATE; + return false; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) @@ -563,10 +548,23 @@ static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; + struct folio *folio = NULL; + int error; - return write_cache_pages(mapping, &wbc, - ext4_journalled_writepage_callback, - jinode->i_transaction); + /* + * writeback_iter() already checks for dirty pages and calls + * folio_clear_dirty_for_io(), which we want to write protect the + * folios. + * + * However, we may have to redirty a folio sometimes. + */ + while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { + if (ext4_journalled_writepage_needs_redirty(jinode, folio)) + folio_redirty_for_writepage(&wbc, folio); + folio_unlock(folio); + } + + return error; } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) -- cgit v1.2.3-59-g8ed1b From af98b0157adf6504fade79b3e6cb260c4ff68e37 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Wed, 14 May 2025 22:08:55 +0900 Subject: jbd2: fix data-race and null-ptr-deref in jbd2_journal_dirty_metadata() Since handle->h_transaction may be a NULL pointer, so we should change it to call is_handle_aborted(handle) first before dereferencing it. And the following data-race was reported in my fuzzer: ================================================================== BUG: KCSAN: data-race in jbd2_journal_dirty_metadata / jbd2_journal_dirty_metadata write to 0xffff888011024104 of 4 bytes by task 10881 on cpu 1: jbd2_journal_dirty_metadata+0x2a5/0x770 fs/jbd2/transaction.c:1556 __ext4_handle_dirty_metadata+0xe7/0x4b0 fs/ext4/ext4_jbd2.c:358 ext4_do_update_inode fs/ext4/inode.c:5220 [inline] ext4_mark_iloc_dirty+0x32c/0xd50 fs/ext4/inode.c:5869 __ext4_mark_inode_dirty+0xe1/0x450 fs/ext4/inode.c:6074 ext4_dirty_inode+0x98/0xc0 fs/ext4/inode.c:6103 .... read to 0xffff888011024104 of 4 bytes by task 10880 on cpu 0: jbd2_journal_dirty_metadata+0xf2/0x770 fs/jbd2/transaction.c:1512 __ext4_handle_dirty_metadata+0xe7/0x4b0 fs/ext4/ext4_jbd2.c:358 ext4_do_update_inode fs/ext4/inode.c:5220 [inline] ext4_mark_iloc_dirty+0x32c/0xd50 fs/ext4/inode.c:5869 __ext4_mark_inode_dirty+0xe1/0x450 fs/ext4/inode.c:6074 ext4_dirty_inode+0x98/0xc0 fs/ext4/inode.c:6103 .... value changed: 0x00000000 -> 0x00000001 ================================================================== This issue is caused by missing data-race annotation for jh->b_modified. Therefore, the missing annotation needs to be added. Reported-by: syzbot+de24c3fe3c4091051710@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=de24c3fe3c4091051710 Fixes: 6e06ae88edae ("jbd2: speedup jbd2_journal_dirty_metadata()") Signed-off-by: Jeongjun Park Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250514130855.99010-1-aha310510@gmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/jbd2/transaction.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cbc4785462f5..c7867139af69 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1509,7 +1509,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) jh->b_next_transaction == transaction); spin_unlock(&jh->b_state_lock); } - if (jh->b_modified == 1) { + if (data_race(jh->b_modified == 1)) { /* If it's in our transaction it must be in BJ_Metadata list. */ if (data_race(jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata)) { @@ -1528,7 +1528,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out; } - journal = transaction->t_journal; spin_lock(&jh->b_state_lock); if (is_handle_aborted(handle)) { @@ -1543,6 +1542,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out_unlock_bh; } + journal = transaction->t_journal; + if (jh->b_modified == 0) { /* * This buffer's got modified and becoming part -- cgit v1.2.3-59-g8ed1b From b5e58bcd79625423487fa3ecba8e8411b5396327 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 6 May 2025 09:20:06 +0800 Subject: ext4: fix out of bounds punch offset Punching a hole with a start offset that exceeds max_end is not permitted and will result in a negative length in the truncate_inode_partial_folio() function while truncating the page cache, potentially leading to undesirable consequences. A simple reproducer: truncate -s 9895604649994 /mnt/foo xfs_io -c "pwrite 8796093022208 4096" /mnt/foo xfs_io -c "fpunch 8796093022213 25769803777" /mnt/foo kernel BUG at include/linux/highmem.h:275! Oops: invalid opcode: 0000 [#1] SMP PTI CPU: 3 UID: 0 PID: 710 Comm: xfs_io Not tainted 6.15.0-rc3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014 RIP: 0010:zero_user_segments.constprop.0+0xd7/0x110 RSP: 0018:ffffc90001cf3b38 EFLAGS: 00010287 RAX: 0000000000000005 RBX: ffffea0001485e40 RCX: 0000000000001000 RDX: 000000000040b000 RSI: 0000000000000005 RDI: 000000000040b000 RBP: 000000000040affb R08: ffff888000000000 R09: ffffea0000000000 R10: 0000000000000003 R11: 00000000fffc7fc5 R12: 0000000000000005 R13: 000000000040affb R14: ffffea0001485e40 R15: ffff888031cd3000 FS: 00007f4f63d0b780(0000) GS:ffff8880d337d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000001ae0b038 CR3: 00000000536aa000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: truncate_inode_partial_folio+0x3dd/0x620 truncate_inode_pages_range+0x226/0x720 ? bdev_getblk+0x52/0x3e0 ? ext4_get_group_desc+0x78/0x150 ? crc32c_arch+0xfd/0x180 ? __ext4_get_inode_loc+0x18c/0x840 ? ext4_inode_csum+0x117/0x160 ? jbd2_journal_dirty_metadata+0x61/0x390 ? __ext4_handle_dirty_metadata+0xa0/0x2b0 ? kmem_cache_free+0x90/0x5a0 ? jbd2_journal_stop+0x1d5/0x550 ? __ext4_journal_stop+0x49/0x100 truncate_pagecache_range+0x50/0x80 ext4_truncate_page_cache_block_range+0x57/0x3a0 ext4_punch_hole+0x1fe/0x670 ext4_fallocate+0x792/0x17d0 ? __count_memcg_events+0x175/0x2a0 vfs_fallocate+0x121/0x560 ksys_fallocate+0x51/0xc0 __x64_sys_fallocate+0x24/0x40 x64_sys_call+0x18d2/0x4170 do_syscall_64+0xa7/0x220 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix this by filtering out cases where the punching start offset exceeds max_end. Fixes: 982bf37da09d ("ext4: refactor ext4_punch_hole()") Reported-by: Liebes Wang Closes: https://lore.kernel.org/linux-ext4/ac3a58f6-e686-488b-a9ee-fc041024e43d@huawei.com/ Tested-by: Liebes Wang Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250506012009.3896990-1-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7237b6326722..99f30b9cfe17 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4061,7 +4061,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) WARN_ON_ONCE(!inode_is_locked(inode)); /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) + if (offset >= inode->i_size || offset >= max_end) return 0; /* -- cgit v1.2.3-59-g8ed1b From 29ec9bed2395061350249ae356fb300dd82a78e7 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 6 May 2025 09:20:07 +0800 Subject: ext4: fix incorrect punch max_end For the extents based inodes, the maxbytes should be sb->s_maxbytes instead of sbi->s_bitmap_maxbytes. Additionally, for the calculation of max_end, the -sb->s_blocksize operation is necessary only for indirect-block based inodes. Correct the maxbytes and max_end value to correct the behavior of punch hole. Fixes: 2da376228a24 ("ext4: limit length to bitmap_maxbytes - blocksize in punch_hole") Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250506012009.3896990-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 99f30b9cfe17..01038b4ecee0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4051,7 +4051,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t start_lblk, end_lblk; - loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; + loff_t max_end = sb->s_maxbytes; loff_t end = offset + length; handle_t *handle; unsigned int credits; @@ -4060,14 +4060,20 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length, 0); WARN_ON_ONCE(!inode_is_locked(inode)); + /* + * For indirect-block based inodes, make sure that the hole within + * one block before last range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; + /* No need to punch hole beyond i_size */ if (offset >= inode->i_size || offset >= max_end) return 0; /* * If the hole extends beyond i_size, set the hole to end after - * the page that contains i_size, and also make sure that the hole - * within one block before last range. + * the page that contains i_size. */ if (end > inode->i_size) end = round_up(inode->i_size, PAGE_SIZE); -- cgit v1.2.3-59-g8ed1b From dbe27f06fa38b9bfc598f8864ae1c5d5831d9992 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 6 May 2025 09:20:08 +0800 Subject: ext4: factor out ext4_get_maxbytes() There are several locations that get the correct maxbytes value based on the inode's block type. It would be beneficial to extract a common helper function to make the code more clear. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250506012009.3896990-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 7 +++++++ fs/ext4/extents.c | 7 +------ fs/ext4/file.c | 7 +------ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3f352f2a6d85..f329aa0517b4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3395,6 +3395,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) return 1 << sbi->s_log_groups_per_flex; } +static inline loff_t ext4_get_maxbytes(struct inode *inode) +{ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return inode->i_sb->s_maxbytes; + return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9053fe68ee4c..c42472fd4b63 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4941,12 +4941,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = { static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { - u64 maxbytes; - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - maxbytes = inode->i_sb->s_maxbytes; - else - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + u64 maxbytes = ext4_get_maxbytes(inode); if (*len == 0) return -EINVAL; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index beb078ee4811..b845a25f7932 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -929,12 +929,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - loff_t maxbytes; - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; - else - maxbytes = inode->i_sb->s_maxbytes; + loff_t maxbytes = ext4_get_maxbytes(inode); switch (whence) { default: -- cgit v1.2.3-59-g8ed1b From 1a77a028a392fab66dd637cdfac3f888450d00af Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 6 May 2025 09:20:09 +0800 Subject: ext4: ensure i_size is smaller than maxbytes The inode i_size cannot be larger than maxbytes, check it while loading inode from the disk. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250506012009.3896990-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 01038b4ecee0..ca1f7a0dd8f8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4966,7 +4966,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); - if ((size = i_size_read(inode)) < 0) { + size = i_size_read(inode); + if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; -- cgit v1.2.3-59-g8ed1b From fdbd0df9d4a36c9091d4f7e8cdfbe76c94aab0ab Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 12 May 2025 14:33:12 +0800 Subject: ext4: make ext4_mpage_readpages() support large folios ext4_mpage_readpages() currently assumes that each folio is the size of PAGE_SIZE. Modify it to atomically calculate the number of blocks per folio and iterate through the blocks in each folio, which would allow for support of larger folios. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250512063319.3539411-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/readpage.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 5d3a9dc9a32d..f329daf6e5c7 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -227,24 +227,30 @@ int ext4_mpage_readpages(struct inode *inode, int length; unsigned relative_block = 0; struct ext4_map_blocks map; - unsigned int nr_pages = rac ? readahead_count(rac) : 1; + unsigned int nr_pages, folio_pages; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; - for (; nr_pages; nr_pages--) { + nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio); + for (; nr_pages; nr_pages -= folio_pages) { int fully_mapped = 1; - unsigned first_hole = blocks_per_page; + unsigned int first_hole; + unsigned int blocks_per_folio; if (rac) folio = readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; + blocks_per_folio = folio_size(folio) >> blkbits; + first_hole = blocks_per_folio; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; @@ -270,7 +276,7 @@ int ext4_mpage_readpages(struct inode *inode, map.m_flags &= ~EXT4_MAP_MAPPED; break; } - if (page_block == blocks_per_page) + if (page_block == blocks_per_folio) break; page_block++; block_in_file++; @@ -281,7 +287,7 @@ int ext4_mpage_readpages(struct inode *inode, * Then do more ext4_map_blocks() calls until we are * done with this folio. */ - while (page_block < blocks_per_page) { + while (page_block < blocks_per_folio) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; @@ -296,13 +302,13 @@ int ext4_mpage_readpages(struct inode *inode, } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; - if (first_hole == blocks_per_page) + if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; continue; } - if (first_hole != blocks_per_page) + if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ @@ -315,13 +321,13 @@ int ext4_mpage_readpages(struct inode *inode, /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; - } else if (page_block == blocks_per_page) + } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } } - if (first_hole != blocks_per_page) { + if (first_hole != blocks_per_folio) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { @@ -367,11 +373,11 @@ int ext4_mpage_readpages(struct inode *inode, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || - (first_hole != blocks_per_page)) { + (first_hole != blocks_per_folio)) { submit_bio(bio); bio = NULL; } else - last_block_in_bio = first_block + blocks_per_page - 1; + last_block_in_bio = first_block + blocks_per_folio - 1; continue; confused: if (bio) { -- cgit v1.2.3-59-g8ed1b From 16705e52e6291c6ab4249aaf1cdf0b8e88afe775 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 12 May 2025 14:33:13 +0800 Subject: ext4: make regular file's buffered write path support large folios The current buffered write path in ext4 can only allocate and handle folios of PAGE_SIZE size. To support larger folios, modify ext4_da_write_begin() and ext4_write_begin() to allocate higher-order folios, and trim the write length if it exceeds the folio size. Additionally, in ext4_da_do_write_end(), use offset_in_folio() instead of PAGE_SIZE. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250512063319.3539411-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ca1f7a0dd8f8..fb0de28cefc2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1066,7 +1066,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { - unsigned from = pos & (PAGE_SIZE - 1); + unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; @@ -1080,8 +1080,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_buffers(folio); @@ -1191,6 +1190,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; unsigned from, to; + fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -1203,8 +1203,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, @@ -1223,10 +1221,18 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, * the folio (if needed) without using GFP_NOFS. */ retry_grab: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = __filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); + + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + + from = offset_in_folio(folio, pos); + to = from + len; + /* * The same as page allocation, we prealloc buffer heads before * starting the handle. @@ -2965,6 +2971,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; + fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -2990,11 +2997,15 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = __filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { @@ -3083,7 +3094,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, unsigned long end; i_size_write(inode, new_i_size); - end = (new_i_size - 1) & (PAGE_SIZE - 1); + end = offset_in_folio(folio, new_i_size - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; -- cgit v1.2.3-59-g8ed1b From 2e9466fc5d7c74a0aeeb388c3c2a4c02b7069d58 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 12 May 2025 14:33:14 +0800 Subject: ext4: make __ext4_block_zero_page_range() support large folio The partial block zero range helper __ext4_block_zero_page_range() currently only supports folios of PAGE_SIZE in size. The calculations for the start block and the offset within a folio for the given range are incorrect. Modify the implementation to use offset_in_folio() instead of directly masking PAGE_SIZE - 1, which will be able to support for large folios. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250512063319.3539411-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fb0de28cefc2..3596090f728a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3746,9 +3746,7 @@ void ext4_set_aops(struct inode *inode) static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { - ext4_fsblk_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - unsigned blocksize, pos; + unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3763,13 +3761,14 @@ static int __ext4_block_zero_page_range(handle_t *handle, blocksize = inode->i_sb->s_blocksize; - iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ + offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; -- cgit v1.2.3-59-g8ed1b From d6bf294773a472fe4694b92714af482f5c6aa4c6 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 12 May 2025 14:33:15 +0800 Subject: ext4/jbd2: convert jbd2_journal_blocks_per_page() to support large folio jbd2_journal_blocks_per_page() returns the number of blocks in a single page. Rename it to jbd2_journal_blocks_per_folio() and make it returns the number of blocks in the largest folio, preparing for the calculation of journal credits blocks when allocating blocks within a large folio in the writeback path. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250512063319.3539411-5-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.h | 4 ++-- fs/ext4/inode.c | 6 +++--- fs/jbd2/journal.c | 7 ++++--- include/linux/jbd2.h | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 3221714d9901..63d17c5201b5 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -319,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, revoke_creds, 0); } -static inline int ext4_journal_blocks_per_page(struct inode *inode) +static inline int ext4_journal_blocks_per_folio(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) - return jbd2_journal_blocks_per_page(inode); + return jbd2_journal_blocks_per_folio(inode); return 0; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3596090f728a..3a46f2822cf0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2406,7 +2406,7 @@ update_disksize: */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); @@ -2484,7 +2484,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; - int bpp = ext4_journal_blocks_per_page(mpd->inode); + int bpp = ext4_journal_blocks_per_folio(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; @@ -5883,7 +5883,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_writepage_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); int ret; ret = ext4_meta_trans_blocks(inode, bpp, bpp); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index bfaa14bb1049..9700b5ea6f33 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -83,7 +83,7 @@ EXPORT_SYMBOL(jbd2_log_wait_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); -EXPORT_SYMBOL(jbd2_journal_blocks_per_page); +EXPORT_SYMBOL(jbd2_journal_blocks_per_folio); EXPORT_SYMBOL(jbd2_journal_invalidate_folio); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); @@ -2655,9 +2655,10 @@ void jbd2_journal_ack_err(journal_t *journal) write_unlock(&journal->j_state_lock); } -int jbd2_journal_blocks_per_page(struct inode *inode) +int jbd2_journal_blocks_per_folio(struct inode *inode) { - return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + return 1 << (PAGE_SHIFT + mapping_max_folio_order(inode->i_mapping) - + inode->i_sb->s_blocksize_bits); } /* diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 023e8abdb99a..ebbcdab474d5 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1723,7 +1723,7 @@ static inline int tid_geq(tid_t x, tid_t y) return (difference >= 0); } -extern int jbd2_journal_blocks_per_page(struct inode *inode); +extern int jbd2_journal_blocks_per_folio(struct inode *inode); extern size_t journal_tag_bytes(journal_t *journal); static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) -- cgit v1.2.3-59-g8ed1b From 0e32d86170121c2d43508e923cb171fb58953b42 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 12 May 2025 14:33:16 +0800 Subject: ext4: correct the journal credits calculations of allocating blocks The journal credits calculation in ext4_ext_index_trans_blocks() is currently inadequate. It only multiplies the depth of the extents tree and doesn't account for the blocks that may be required for adding the leaf extents themselves. After enabling large folios, we can easily run out of handle credits, triggering a warning in jbd2_journal_dirty_metadata() on filesystems with a 1KB block size. This occurs because we may need more extents when iterating through each large folio in ext4_do_writepages()->mpage_map_and_submit_extent(). Therefore, we should modify ext4_ext_index_trans_blocks() to include a count of the leaf extents in the worst case as well. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250512063319.3539411-6-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 5 +++-- fs/ext4/inode.c | 10 ++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c42472fd4b63..ddc3a7529961 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2409,9 +2409,10 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int extents) * the time we actually modify the tree. Assume the worst case. */ if (extents <= 1) - index = EXT4_MAX_EXTENT_DEPTH * 2; + index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents; else - index = EXT4_MAX_EXTENT_DEPTH * 3; + index = (EXT4_MAX_EXTENT_DEPTH * 3) + + DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0)); return index; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3a46f2822cf0..18651f39a80b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5844,18 +5844,16 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int ret; /* - * How many index blocks need to touch to map @lblocks logical blocks - * to @pextents physical extents? + * How many index and lead blocks need to touch to map @lblocks + * logical blocks to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); - ret = idxblocks; - /* * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks + pextents; + groups = idxblocks; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -5863,7 +5861,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ - ret += groups + gdpblocks; + ret = idxblocks + groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); -- cgit v1.2.3-59-g8ed1b From cd9f76de6ae9a5a0ca6201b1f06cf116a6a3e3a2 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 12 May 2025 14:33:17 +0800 Subject: ext4: make the writeback path support large folios In mpage_map_and_submit_buffers(), the 'lblk' is now aligned to PAGE_SIZE. Convert it to be aligned to folio size. Additionally, modify the wbc->nr_to_write update to reduce the number of pages in a single folio, ensuring that the entire writeback path can support large folios. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250512063319.3539411-7-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 18651f39a80b..ed88e5844f46 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1983,7 +1983,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); return err; } @@ -2206,7 +2206,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; - lblk = start << bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2217,6 +2216,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; + lblk = folio->index << bpp_bits; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* @@ -2442,7 +2442,7 @@ static int mpage_journal_page_buffers(handle_t *handle, size_t len = folio_size(folio); folio_clear_checked(folio); - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) -- cgit v1.2.3-59-g8ed1b From 01e807e18fd87937f515926b79152dfa4f13b735 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 12 May 2025 14:33:18 +0800 Subject: ext4: make online defragmentation support large folios move_extent_per_page() currently assumes that each folio is the size of PAGE_SIZE and only copies data for one page. ext4_move_extents() should call move_extent_per_page() for each page. To support larger folios, simply modify the calculations for the block start and end offsets within the folio based on the provided range of 'data_offset_in_page' and 'block_len_in_page'. This function will continue to handle PAGE_SIZE of data at a time and will not convert this function to manage an entire folio. Additionally, we use the source folio to copy data, so it doesn't matter if the source and dest folios are different in size. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250512063319.3539411-8-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 48649be64d6a..1f8493a56e8f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -269,7 +269,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, unsigned int tmp_data_size, data_size, replaced_size; int i, err2, jblocks, retries = 0; int replaced_count = 0; - int from = data_offset_in_page << orig_inode->i_blkbits; + int from; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; struct buffer_head *bh = NULL; @@ -323,11 +323,6 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ - - VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); - VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); - VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); - if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to @@ -360,6 +355,8 @@ again: goto unlock_folios; } data_copy: + from = offset_in_folio(folio[0], + orig_blk_offset << orig_inode->i_blkbits); *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; @@ -390,7 +387,7 @@ data_copy: if (!bh) bh = create_empty_buffers(folio[0], 1 << orig_inode->i_blkbits, 0); - for (i = 0; i < data_offset_in_page; i++) + for (i = 0; i < from >> orig_inode->i_blkbits; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); -- cgit v1.2.3-59-g8ed1b From 7ac67301e82f02b77a5c8e7377a1f414ef108b84 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 12 May 2025 14:33:19 +0800 Subject: ext4: enable large folio for regular file Besides fsverity, fscrypt, and the data=journal mode, ext4 now supports large folios for regular files. Enable this feature by default. However, since we cannot change the folio order limitation of mappings on active inodes, setting the journal=data mode via ioctl on an active inode will not take immediate effect in non-delalloc mode. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250512063319.3539411-9-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/ext4_jbd2.c | 3 ++- fs/ext4/ialloc.c | 3 +++ fs/ext4/inode.c | 20 ++++++++++++++++++++ 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f329aa0517b4..cda06ed468ca 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2999,6 +2999,7 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); +bool ext4_should_enable_large_folio(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 135e278c832e..b3e9b7bd7978 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode) ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC))) { + !test_opt(inode->i_sb, DELALLOC) && + !mapping_large_folio_support(inode->i_mapping))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e7ecc7c8a729..4938e78cbadc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1336,6 +1336,9 @@ got: } } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ext4_update_inode_fsync_trans(handle, inode, 1); err = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ed88e5844f46..01e42cce572e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4829,6 +4829,23 @@ error: return -EFSCORRUPTED; } +bool ext4_should_enable_large_folio(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!S_ISREG(inode->i_mode)) + return false; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return false; + if (ext4_has_feature_verity(sb)) + return false; + if (ext4_has_feature_encrypt(sb)) + return false; + + return true; +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -5147,6 +5164,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ret = check_igot_inode(inode, flags, function, line); /* * -ESTALE here means there is nothing inherently wrong with the inode, -- cgit v1.2.3-59-g8ed1b From 6cbab5f95e49ec8a9f21784fae3ff0ee09b2dfbc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 May 2025 22:38:06 -0700 Subject: ext4: remove sbi argument from ext4_chksum() Since ext4_chksum() no longer uses its sbi argument, remove it. Signed-off-by: Eric Biggers Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250513053809.699974-2-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/bitmap.c | 8 ++++---- fs/ext4/ext4.h | 3 +-- fs/ext4/extents.c | 3 +-- fs/ext4/fast_commit.c | 10 +++++----- fs/ext4/ialloc.c | 5 ++--- fs/ext4/inode.c | 19 ++++++++----------- fs/ext4/ioctl.c | 4 ++-- fs/ext4/mmp.c | 2 +- fs/ext4/namei.c | 10 ++++------ fs/ext4/orphan.c | 13 ++++++------- fs/ext4/super.c | 13 ++++++------- fs/ext4/xattr.c | 10 +++++----- 12 files changed, 45 insertions(+), 55 deletions(-) diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index a4dbaccee6e7..87760fabdd2e 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -30,7 +30,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, sz = EXT4_INODES_PER_GROUP(sb) >> 3; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); provided |= (hi << 16); @@ -52,7 +52,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, return; sz = EXT4_INODES_PER_GROUP(sb) >> 3; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); @@ -71,7 +71,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); provided |= (hi << 16); @@ -92,7 +92,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, if (!ext4_has_feature_metadata_csum(sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cda06ed468ca..f414a1a16d9a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2494,8 +2494,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_SIPHASH 6 #define DX_HASH_LAST DX_HASH_SIPHASH -static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, - const void *address, unsigned int length) +static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ddc3a7529961..995be1051cfa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh, EXT4_EXTENT_TAIL_OFFSET(eh)); return cpu_to_le32(csum); } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 1392241de5e6..42bee1d4f9f9 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -734,7 +734,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); - *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); + *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); @@ -781,7 +781,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); - crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, + crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); @@ -2133,13 +2133,13 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); @@ -2166,7 +2166,7 @@ static int ext4_fc_replay_scan(journal_t *journal, break; } state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4938e78cbadc..79aa3df8d019 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1288,10 +1288,9 @@ got: __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 01e42cce572e..6c56735df4cb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -58,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); - csum = ext4_chksum(sbi, csum, (__u8 *)raw + - EXT4_GOOD_OLD_INODE_SIZE, + csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } @@ -4922,10 +4920,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0e240013c84d..bef9a148e433 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -354,8 +354,8 @@ void ext4_reset_inode_seed(struct inode *inode) if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } /* diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 3e26464b1425..51661570cf3b 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -14,7 +14,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) int offset = offsetof(struct mmp_struct, mmp_checksum); __u32 csum; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset); return cpu_to_le32(csum); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e9712e64ec8f..a178ac229489 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -346,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); return cpu_to_le32(csum); } @@ -442,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; @@ -450,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); - csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(csum, (__u8 *)t, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index c66e0cb29bd4..7c7f792ad6ab 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -541,9 +541,9 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb, return 1; ot = ext4_orphan_block_tail(sb, bh); - calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, + calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + calculated = ext4_chksum(calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } @@ -560,10 +560,9 @@ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); - csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, - inodes_per_ob * sizeof(__u32)); + csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7c8fdb311bad..b4e8b17cb908 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -289,11 +289,10 @@ static int ext4_verify_csum_type(struct super_block *sb, __le32 ext4_superblock_csum(struct super_block *sb, struct ext4_super_block *es) { - struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; - csum = ext4_chksum(sbi, ~0, (char *)es, offset); + csum = ext4_chksum(~0, (char *)es, offset); return cpu_to_le32(csum); } @@ -3206,14 +3205,14 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, __u32 csum32; __u16 dummy_csum = 0; - csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, + csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, + csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; @@ -4641,7 +4640,7 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_feature_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) - sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7ab8f2e8e815..8d15acbacc20 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -139,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(csum, (__u8 *)hdr, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, + csum = ext4_chksum(csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); @@ -348,7 +348,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { - return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); + return ext4_chksum(sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) -- cgit v1.2.3-59-g8ed1b From 6017dbb7b67a3ca90d339ca32fe6dde425686435 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 May 2025 22:38:07 -0700 Subject: ext4: remove sb argument from ext4_superblock_csum() Since ext4_superblock_csum() no longer uses its sb argument, remove it. Signed-off-by: Eric Biggers Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250513053809.699974-3-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 +-- fs/ext4/ioctl.c | 4 ++-- fs/ext4/resize.c | 2 +- fs/ext4/super.c | 9 ++++----- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f414a1a16d9a..923580be3cd6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3136,8 +3136,7 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wa extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); -extern __le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es); +extern __le32 ext4_superblock_csum(struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bef9a148e433..5668a17458ae 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -143,7 +143,7 @@ static int ext4_update_backup_sb(struct super_block *sb, es = (struct ext4_super_block *) (bh->b_data + offset); lock_buffer(bh); if (ext4_has_feature_metadata_csum(sb) && - es->s_checksum != ext4_superblock_csum(sb, es)) { + es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " "superblock %llu", sb_block); unlock_buffer(bh); @@ -151,7 +151,7 @@ static int ext4_update_backup_sb(struct super_block *sb, } func(es, arg); if (ext4_has_feature_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); unlock_buffer(bh); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b7ff0d955f0d..050f26168d97 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1119,7 +1119,7 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, es->s_block_group_nr = cpu_to_le16(group); if (ext4_has_feature_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); } /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b4e8b17cb908..9bf38e21dcda 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -286,8 +286,7 @@ static int ext4_verify_csum_type(struct super_block *sb, return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } -__le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es) +__le32 ext4_superblock_csum(struct ext4_super_block *es) { int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; @@ -303,7 +302,7 @@ static int ext4_superblock_csum_verify(struct super_block *sb, if (!ext4_has_feature_metadata_csum(sb)) return 1; - return es->s_checksum == ext4_superblock_csum(sb, es); + return es->s_checksum == ext4_superblock_csum(es); } void ext4_superblock_csum_set(struct super_block *sb) @@ -313,7 +312,7 @@ void ext4_superblock_csum_set(struct super_block *sb) if (!ext4_has_feature_metadata_csum(sb)) return; - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -5912,7 +5911,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && - es->s_checksum != ext4_superblock_csum(sb, es)) { + es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); errno = -EFSCORRUPTED; goto out_bh; -- cgit v1.2.3-59-g8ed1b From 76005718cf8bfdb6b0f818ea75ca6a4b3bee34cd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 May 2025 22:38:08 -0700 Subject: jbd2: remove journal_t argument from jbd2_chksum() Since jbd2_chksum() no longer uses its journal_t argument, remove it. Signed-off-by: Eric Biggers Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250513053809.699974-4-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 6 +++--- fs/jbd2/journal.c | 8 ++++---- fs/jbd2/recovery.c | 10 +++++----- include/linux/jbd2.h | 3 +-- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 1c7c49356878..7203d2d2624d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -99,7 +99,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) h->h_chksum_type = 0; h->h_chksum_size = 0; h->h_chksum[0] = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); h->h_chksum[0] = cpu_to_be32(csum); } @@ -330,8 +330,8 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, seq = cpu_to_be32(sequence); addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, addr, bh->b_size); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, addr, bh->b_size); kunmap_local(addr); if (jbd2_has_feature_csum3(j)) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9700b5ea6f33..883994a46d2b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -122,7 +122,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) old_csum = sb->s_checksum; sb->s_checksum = 0; - csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t)); + csum = jbd2_chksum(~0, (char *)sb, sizeof(journal_superblock_t)); sb->s_checksum = old_csum; return cpu_to_be32(csum); @@ -1000,7 +1000,7 @@ void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); tail->t_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); tail->t_checksum = cpu_to_be32(csum); } @@ -1490,7 +1490,7 @@ static int journal_load_superblock(journal_t *journal) journal->j_total_len = be32_to_cpu(sb->s_maxlen); /* Precompute checksum seed for all metadata */ if (jbd2_journal_has_csum_v2or3(journal)) - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, sizeof(sb->s_uuid)); /* After journal features are set, we can compute transaction limits */ jbd2_journal_init_transaction_limits(journal); @@ -2336,7 +2336,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, sizeof(sb->s_uuid)); } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index c271a050b7e6..cac8c2cd4a92 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -185,7 +185,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); provided = tail->t_checksum; tail->t_checksum = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); tail->t_checksum = provided; return provided == cpu_to_be32(calculated); @@ -440,7 +440,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) h = buf; provided = h->h_chksum[0]; h->h_chksum[0] = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); h->h_chksum[0] = provided; return provided == cpu_to_be32(calculated); @@ -461,7 +461,7 @@ static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf) h = tmpbuf; provided = h->h_chksum[0]; h->h_chksum[0] = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, tmpbuf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, tmpbuf, j->j_blocksize); kfree(tmpbuf); return provided == cpu_to_be32(calculated); @@ -478,8 +478,8 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, return 1; seq = cpu_to_be32(sequence); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, buf, j->j_blocksize); if (jbd2_has_feature_csum3(j)) return tag3->t_checksum == cpu_to_be32(csum32); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ebbcdab474d5..43b9297fe8a7 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1766,8 +1766,7 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal) #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 -static inline u32 jbd2_chksum(journal_t *journal, u32 crc, - const void *address, unsigned int length) +static inline u32 jbd2_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } -- cgit v1.2.3-59-g8ed1b From fff6f35b9b2f0c79c9eb6106311530864d8f1394 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 May 2025 22:38:09 -0700 Subject: jbd2: remove journal_t argument from jbd2_superblock_csum() Since jbd2_superblock_csum() no longer uses its journal_t argument, remove it. Signed-off-by: Eric Biggers Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250513053809.699974-5-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 883994a46d2b..6d5e76848733 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -115,7 +115,7 @@ void __jbd2_debug(int level, const char *file, const char *func, #endif /* Checksumming functions */ -static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) +static __be32 jbd2_superblock_csum(journal_superblock_t *sb) { __u32 csum; __be32 old_csum; @@ -1384,7 +1384,7 @@ static int journal_check_superblock(journal_t *journal) } /* Check superblock checksum */ - if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + if (sb->s_checksum != jbd2_superblock_csum(sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); err = -EFSBADCRC; return err; @@ -1819,7 +1819,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) set_buffer_uptodate(bh); } if (jbd2_journal_has_csum_v2or3(journal)) - sb->s_checksum = jbd2_superblock_csum(journal, sb); + sb->s_checksum = jbd2_superblock_csum(sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); -- cgit v1.2.3-59-g8ed1b From 9fa6121684dad974c8c2b2aceb0df2b27f0627fe Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 16 May 2025 01:20:49 +0530 Subject: ext4: Document an edge case for overwrites ext4_iomap_overwrite_begin() clears the flag for IOMAP_WRITE before calling ext4_iomap_begin(). Document this above ext4_map_blocks() call as it is easy to miss it when focusing on write paths alone. Reviewed-by: Ojaswin Mujoo Acked-by: Darrick J. Wong Signed-off-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/fd50ba05440042dff77d555e463a620a79f8d0e9.1747337952.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6c56735df4cb..9f3b3db14eda 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3490,6 +3490,10 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, } ret = ext4_iomap_alloc(inode, &map, flags); } else { + /* + * This can be called for overwrites path from + * ext4_iomap_overwrite_begin(). + */ ret = ext4_map_blocks(NULL, inode, &map, 0); } -- cgit v1.2.3-59-g8ed1b From 1c972b1d13dde34c9897d991283e2c54209b44e9 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 16 May 2025 01:20:50 +0530 Subject: ext4: Check if inode uses extents in ext4_inode_can_atomic_write() EXT4 only supports doing atomic write on inodes which uses extents, so add a check in ext4_inode_can_atomic_write() which gets called during open. Reviewed-by: Ojaswin Mujoo Acked-by: Darrick J. Wong Signed-off-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/86bb502c979398a736ab371d8f35f6866a477f6c.1747337952.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 923580be3cd6..bc2deb5bfe77 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3870,7 +3870,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) static inline bool ext4_inode_can_atomic_write(struct inode *inode) { - return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0; + return S_ISREG(inode->i_mode) && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + EXT4_SB(inode->i_sb)->s_awu_min > 0; } extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, -- cgit v1.2.3-59-g8ed1b From 255e7bc2127cbd3a718a55d2da5b2d3f015adcd7 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 16 May 2025 01:20:51 +0530 Subject: ext4: Make ext4_meta_trans_blocks() non-static for later use Let's make ext4_meta_trans_blocks() non-static for use in later functions during ->end_io conversion for atomic writes. We will need this function to estimate journal credits for a special case. Instead of adding another wrapper around it, let's make this non-static. Reviewed-by: Ojaswin Mujoo Acked-by: Darrick J. Wong Signed-off-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/23ce80d4286f792831ce99d13558182ee228fedb.1747337952.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/inode.c | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bc2deb5bfe77..8fda9cdfc13d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3045,6 +3045,8 @@ extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9f3b3db14eda..72dc1314925f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -140,9 +140,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); - /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. @@ -5856,8 +5853,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents) +int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; -- cgit v1.2.3-59-g8ed1b From 5bb12b1837c0bf7ddc84e27812f1693a126fe27a Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 16 May 2025 01:20:52 +0530 Subject: ext4: Add support for EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS There can be a case where there are contiguous extents on the adjacent leaf nodes of on-disk extent trees. So when someone tries to write to this contiguous range, ext4_map_blocks() call will split by returning 1 extent at a time if this is not already cached in extent_status tree cache (where if these extents when cached can get merged since they are contiguous). This is fine for a normal write however in case of atomic writes, it can't afford to break the write into two. Now this is also something that will only happen in the slow write case where we call ext4_map_blocks() for each of these extents spread across different leaf nodes. However, there is no guarantee that these extent status cache cannot be reclaimed before the last call to ext4_map_blocks() in ext4_map_blocks_atomic_write_slow(). Hence this patch adds support of EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS. This flag checks if the requested range can be fully found in extent status cache and return. If not, it looks up in on-disk extent tree via ext4_map_query_blocks(). If the found extent is the last entry in the leaf node, then it goes and queries the next lblk to see if there is an adjacent contiguous extent in the adjacent leaf node of the on-disk extent tree. Even though there can be a case where there are multiple adjacent extent entries spread across multiple leaf nodes. But we only read an adjacent leaf block i.e. in total of 2 extent entries spread across 2 leaf nodes. The reason for this is that we are mostly only going to support atomic writes with upto 64KB or maybe max upto 1MB of atomic write support. Acked-by: Darrick J. Wong Co-developed-by: Ojaswin Mujoo Signed-off-by: Ojaswin Mujoo Signed-off-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/6bb563e661f5fbd80e266a9e6ce6e29178f555f6.1747337952.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 18 ++++++++++- fs/ext4/extents.c | 12 ++++++++ fs/ext4/inode.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 112 insertions(+), 8 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8fda9cdfc13d..635a95a7f715 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -256,9 +256,19 @@ struct ext4_allocation_request { #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_DELAYED BIT(BH_Delay) +/* + * This is for use in ext4_map_query_blocks() for a special case where we can + * have a physically and logically contiguous blocks split across two leaf + * nodes instead of a single extent. This is required in case of atomic writes + * to know whether the returned extent is last in leaf. If yes, then lookup for + * next in leaf block in ext4_map_query_blocks_next_in_leaf(). + * - This is never going to be added to any buffer head state. + * - We use the next available bit after BH_BITMAP_UPTODATE. + */ +#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_DELAYED) + EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -728,6 +738,12 @@ enum { EXT4_GET_BLOCKS_IO_SUBMIT) /* Caller is in the atomic contex, find extent if it has been cached */ #define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 +/* + * Atomic write caller needs this to query in the slow path of mixed mapping + * case, when a contiguous extent can be split across two adjacent leaf nodes. + * Look EXT4_MAP_QUERY_LAST_IN_LEAF. + */ +#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000 /* * The bit position of these flags must not overlap with any of the diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 995be1051cfa..ba91ad3dfcbc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4437,6 +4437,18 @@ got_allocated_blocks: allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: + /* + * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag. + * So we know that the depth used here is correct, since there was no + * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set. + * If tomorrow we start using this QUERY flag with CREATE, then we will + * need to re-calculate the depth as it might have changed due to block + * allocation. + */ + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) + if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr))) + map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF; + ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 72dc1314925f..14c19a38cd4f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -483,15 +483,73 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map, + unsigned int orig_mlen) +{ + struct ext4_map_blocks map2; + unsigned int status, status2; + int retval; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); + WARN_ON_ONCE(orig_mlen <= map->m_len); + + /* Prepare map2 for lookup in next leaf block */ + map2.m_lblk = map->m_lblk + map->m_len; + map2.m_len = orig_mlen - map->m_len; + map2.m_flags = 0; + retval = ext4_ext_map_blocks(handle, inode, &map2, 0); + + if (retval <= 0) { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return map->m_len; + } + + if (unlikely(retval != map2.m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map2.m_len); + WARN_ON(1); + } + + status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + /* + * If map2 is contiguous with map, then let's insert it as a single + * extent in es cache and return the combined length of both the maps. + */ + if (map->m_pblk + map->m_len == map2.m_pblk && + status == status2) { + ext4_es_insert_extent(inode, map->m_lblk, + map->m_len + map2.m_len, map->m_pblk, + status, false); + map->m_len += map2.m_len; + } else { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + } + + return map->m_len; +} + static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; + unsigned int orig_mlen = map->m_len; + unsigned int query_flags = flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF; flags &= EXT4_EX_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(handle, inode, map, flags); + retval = ext4_ext_map_blocks(handle, inode, map, + flags | query_flags); else retval = ext4_ind_map_blocks(handle, inode, map, flags); @@ -506,11 +564,23 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, WARN_ON(1); } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); - return retval; + /* + * No need to query next in leaf: + * - if returned extent is not last in leaf or + * - if the last in leaf is the full requested range + */ + if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || + ((map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) && + (map->m_len == orig_mlen))) { + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return retval; + } + + return ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, @@ -624,6 +694,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct extent_status es; int retval; int ret = 0; + unsigned int orig_mlen = map->m_len; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -685,7 +756,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif - goto found; + if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || + orig_mlen == map->m_len) + goto found; + + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) + map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we -- cgit v1.2.3-59-g8ed1b From b86629c2b2998338b4a715058b402e47d0b36206 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 16 May 2025 01:20:53 +0530 Subject: ext4: Add multi-fsblock atomic write support with bigalloc EXT4 supports bigalloc feature which allows the FS to work in size of clusters (group of blocks) rather than individual blocks. This patch adds atomic write support for bigalloc so that systems with bs = ps can also create FS using - mkfs.ext4 -F -O bigalloc -b 4096 -C 16384 With bigalloc ext4 can support multi-fsblock atomic writes. We will have to adjust ext4's atomic write unit max value to cluster size. This can then support atomic write of size anywhere between [blocksize, clustersize]. This patch adds the required changes to enable multi-fsblock atomic write support using bigalloc in the next patch. In this patch for block allocation: we first query the underlying region of the requested range by calling ext4_map_blocks() call. Here are the various cases which we then handle depending upon the underlying mapping type: 1. If the underlying region for the entire requested range is a mapped extent, then we don't call ext4_map_blocks() to allocate anything. We don't need to even start the jbd2 txn in this case. 2. For an append write case, we create a mapped extent. 3. If the underlying region is entirely a hole, then we create an unwritten extent for the requested range. 4. If the underlying region is a large unwritten extent, then we split the extent into 2 unwritten extent of required size. 5. If the underlying region has any type of mixed mapping, then we call ext4_map_blocks() in a loop to zero out the unwritten and the hole regions within the requested range. This then provide a single mapped extent type mapping for the requested range. Note: We invoke ext4_map_blocks() in a loop with the EXT4_GET_BLOCKS_ZERO flag only when the underlying extent mapping of the requested range is not entirely a hole, an unwritten extent, or a fully mapped extent. That is, if the underlying region contains a mix of hole(s), unwritten extent(s), and mapped extent(s), we use this loop to ensure that all the short mappings are zeroed out. This guarantees that the entire requested range becomes a single, uniformly mapped extent. It is ok to do so because we know this is being done on a bigalloc enabled filesystem where the block bitmap represents the entire cluster unit. Note having a single contiguous underlying region of type mapped, unwrittn or hole is not a problem. But the reason to avoid writing on top of mixed mapping region is because, atomic writes requires all or nothing should get written for the userspace pwritev2 request. So if at any point in time during the write if a crash or a sudden poweroff occurs, the region undergoing atomic write should read either complete old data or complete new data. But it should never have a mix of both old and new data. So, we first convert any mixed mapping region to a single contiguous mapped extent before any data gets written to it. This is because normally FS will only convert unwritten extents to written at the end of the write in ->end_io() call. And if we allow the writes over a mixed mapping and if a sudden power off happens in between, we will end up reading mix of new data (over mapped extents) and old data (over unwritten extents), because unwritten to written conversion never went through. So to avoid this and to avoid writes getting torned due to mixed mapping, we first allocate a single contiguous block mapping and then do the write. Acked-by: Darrick J. Wong Co-developed-by: Ojaswin Mujoo Signed-off-by: Ojaswin Mujoo Signed-off-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/c4965ac3407cbc773f0bc954d0966d9696f5038a.1747337952.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 + fs/ext4/extents.c | 87 +++++++++++++++++++++++ fs/ext4/file.c | 7 +- fs/ext4/inode.c | 208 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 299 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 635a95a7f715..201afaaa508a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3751,6 +3751,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_extents_atomic(handle_t *handle, + struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ba91ad3dfcbc..74c16736ae34 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4796,6 +4796,93 @@ out_inode_lock: return ret; } +/* + * This function converts a range of blocks to written extents. The caller of + * this function will pass the start offset and the size. all unwritten extents + * within this range will be converted to written extents. + * + * This function is called from the direct IO end io call back function for + * atomic writes, to convert the unwritten extents after IO is completed. + * + * Note that the requirement for atomic writes is that all conversion should + * happen atomically in a single fs journal transaction. We mainly only allocate + * unwritten extents either on a hole on a pre-exiting unwritten extent range in + * ext4_map_blocks_atomic_write(). The only case where we can have multiple + * unwritten extents in a range [offset, offset+len) is when there is a split + * unwritten extent between two leaf nodes which was cached in extent status + * cache during ext4_iomap_alloc() time. That will allow + * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going + * into the slow path. That means we might need a loop for conversion of this + * unwritten extent split across leaf block within a single journal transaction. + * Split extents across leaf nodes is a rare case, but let's still handle that + * to meet the requirements of multi-fsblock atomic writes. + * + * Returns 0 on success. + */ +int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) +{ + unsigned int max_blocks; + int ret = 0, ret2 = 0, ret3 = 0; + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; + int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT; + + map.m_lblk = offset >> blkbits; + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); + + if (!handle) { + /* + * TODO: An optimization can be added later by having an extent + * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that + * it can tell if the extent in the cache is a split extent. + * But for now let's assume pextents as 2 always. + */ + credits = ext4_meta_trans_blocks(inode, max_blocks, 2); + } + + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + } + + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret != max_blocks) + ext4_msg(inode->i_sb, KERN_INFO, + "inode #%lu: block %u: len %u: " + "split block mapping found for atomic write, " + "ret = %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + if (ret <= 0) + break; + } + + ret2 = ext4_mark_inode_dirty(handle, inode); + + if (credits) { + ret3 = ext4_journal_stop(handle); + if (unlikely(ret3)) + ret2 = ret3; + } + + if (ret <= 0 || ret2) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "returned %d or %d", + inode->i_ino, map.m_lblk, + map.m_len, ret, ret2); + + return ret > 0 ? ret2 : ret; +} + /* * This function convert a range of blocks to written extents * The caller of this function will pass the start offset and the size. diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b845a25f7932..21df81347147 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -377,7 +377,12 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); - if (!error && size && flags & IOMAP_DIO_UNWRITTEN) + + if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && + (iocb->ki_flags & IOCB_ATOMIC)) + error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, + size); + else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 14c19a38cd4f..2f59036df800 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3467,12 +3467,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, } } +static int ext4_map_blocks_atomic_write_slow(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + unsigned int mapped_len = 0, m_flags = 0; + ext4_fsblk_t next_pblk; + bool check_next_pblk = false; + int ret = 0; + + WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); + + /* + * This is a slow path in case of mixed mapping. We use + * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single + * contiguous mapped mapping. This will ensure any unwritten or hole + * regions within the requested range is zeroed out and we return + * a single contiguous mapped extent. + */ + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + + do { + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 && ret != -ENOSPC) + goto out_err; + /* + * This should never happen, but let's return an error code to + * avoid an infinite loop in here. + */ + if (ret == 0) { + ret = -EFSCORRUPTED; + ext4_warning_inode(inode, + "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", + m_flags, ret); + goto out_err; + } + /* + * With bigalloc we should never get ENOSPC nor discontiguous + * physical extents. + */ + if ((check_next_pblk && next_pblk != map->m_pblk) || + ret == -ENOSPC) { + ext4_warning_inode(inode, + "Non-contiguous allocation detected: expected %llu, got %llu, " + "or ext4_map_blocks() returned out of space ret: %d", + next_pblk, map->m_pblk, ret); + ret = -EFSCORRUPTED; + goto out_err; + } + next_pblk = map->m_pblk + map->m_len; + check_next_pblk = true; + + mapped_len += map->m_len; + map->m_lblk += map->m_len; + map->m_len = m_len - mapped_len; + } while (mapped_len < m_len); + + /* + * We might have done some work in above loop, so we need to query the + * start of the physical extent, based on the origin m_lblk and m_len. + * Let's also ensure we were able to allocate the required range for + * mixed mapping case. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + ret = ext4_map_blocks(handle, inode, map, + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); + if (ret != m_len) { + ext4_warning_inode(inode, + "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", + m_lblk, m_len, ret); + ret = -EINVAL; + } + return ret; + +out_err: + /* reset map before returning an error */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + return ret; +} + +/* + * ext4_map_blocks_atomic: Helper routine to ensure the entire requested + * range in @map [lblk, lblk + len) is one single contiguous extent with no + * mixed mappings. + * + * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). + * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying + * physical extent for the requested range does not have a single contiguous + * mapping type i.e. (Hole, Mapped, or Unwritten) throughout. + * In that case we will loop over the requested range to allocate and zero out + * the unwritten / holes in between, to get a single mapped extent from + * [m_lblk, m_lblk + m_len). Note that this is only possible because we know + * this can be called only with bigalloc enabled filesystem where the underlying + * cluster is already allocated. This avoids allocating discontiguous extents + * in the slow path due to multiple calls to ext4_map_blocks(). + * The slow path is mostly non-performance critical path, so it should be ok to + * loop using ext4_map_blocks() with appropriate flags to allocate & zero the + * underlying short holes/unwritten extents within the requested range. + */ +static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int m_flags, + bool *force_commit) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + int ret = 0; + + WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); + + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 || ret == m_len) + goto out; + /* + * This is a mixed mapping case where we were not able to allocate + * a single contiguous extent. In that case let's reset requested + * mapping and call the slow path. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + /* + * slow path means we have mixed mapping, that means we will need + * to force txn commit. + */ + *force_commit = true; + return ext4_map_blocks_atomic_write_slow(handle, inode, map); +out: + return ret; +} + static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; + bool force_commit = false; /* * Trim the mapping request to the maximum value that we can map at @@ -3480,7 +3617,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + + /* + * journal credits estimation for atomic writes. We call + * ext4_map_blocks(), to find if there could be a mixed mapping. If yes, + * then let's assume the no. of pextents required can be m_len i.e. + * every alternate block can be unwritten and hole. + */ + if (flags & IOMAP_ATOMIC) { + unsigned int orig_mlen = map->m_len; + + ret = ext4_map_blocks(NULL, inode, map, 0); + if (ret < 0) + return ret; + if (map->m_len < orig_mlen) { + map->m_len = orig_mlen; + dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, + map->m_len); + } else { + dio_credits = ext4_chunk_trans_blocks(inode, + map->m_len); + } + } else { + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + } retry: /* @@ -3511,7 +3671,11 @@ retry: else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; - ret = ext4_map_blocks(handle, inode, map, m_flags); + if (flags & IOMAP_ATOMIC) + ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, + &force_commit); + else + ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could @@ -3525,6 +3689,22 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + /* + * Force commit the current transaction if the allocation spans a mixed + * mapping range. This ensures any pending metadata updates (like + * unwritten to written extents conversion) in this range are in + * consistent state with the file data blocks, before performing the + * actual write I/O. If the commit fails, the whole I/O must be aborted + * to prevent any possible torn writes. + */ + if (ret > 0 && force_commit) { + int ret2; + + ret2 = ext4_force_commit(inode->i_sb); + if (ret2) + return ret2; + } + return ret; } @@ -3535,6 +3715,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; + unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; @@ -3548,6 +3729,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* @@ -3558,8 +3740,16 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) - goto out; + /* + * For atomic writes the entire requested length should + * be mapped. + */ + if (map.m_flags & EXT4_MAP_MAPPED) { + if ((!(flags & IOMAP_ATOMIC) && ret > 0) || + (flags & IOMAP_ATOMIC && ret >= orig_mlen)) + goto out; + } + map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { @@ -3580,6 +3770,16 @@ out: */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + /* + * Before returning to iomap, let's ensure the allocated mapping + * covers the entire requested length for atomic writes. + */ + if (flags & IOMAP_ATOMIC) { + if (map.m_len < (length >> blkbits)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; -- cgit v1.2.3-59-g8ed1b From 642e0dc73c5def8270ff7c55d750ff36a6ea5d10 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 16 May 2025 01:20:54 +0530 Subject: ext4: Enable support for ext4 multi-fsblock atomic write using bigalloc Last couple of patches added the needed support for multi-fsblock atomic writes using bigalloc. This patch ensures that filesystem advertizes the needed atomic write unit min and max values for enabling multi-fsblock atomic write support with bigalloc. Acked-by: Darrick J. Wong Co-developed-by: Ojaswin Mujoo Signed-off-by: Ojaswin Mujoo Signed-off-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/5e45d7ed24499024b9079436ba6698dae5298e29.1747337952.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9bf38e21dcda..a7f80ca01174 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4436,13 +4436,16 @@ static int ext4_handle_clustersize(struct super_block *sb) /* * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. + * With non-bigalloc filesystem awu will be based upon filesystem blocksize + * & bdev awu units. + * With bigalloc it will be based upon bigalloc cluster size & bdev awu units. * @sb: super block - * TODO: Later add support for bigalloc */ static void ext4_atomic_write_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct block_device *bdev = sb->s_bdev; + unsigned int clustersize = EXT4_CLUSTER_SIZE(sb); if (!bdev_can_atomic_write(bdev)) return; @@ -4452,7 +4455,7 @@ static void ext4_atomic_write_init(struct super_block *sb) sbi->s_awu_min = max(sb->s_blocksize, bdev_atomic_write_unit_min_bytes(bdev)); - sbi->s_awu_max = min(sb->s_blocksize, + sbi->s_awu_max = min(clustersize, bdev_atomic_write_unit_max_bytes(bdev)); if (sbi->s_awu_min && sbi->s_awu_max && sbi->s_awu_min <= sbi->s_awu_max) { -- cgit v1.2.3-59-g8ed1b From 0bf1f51e34c4e260095fc7306c564c7fe01503e3 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 16 May 2025 01:20:55 +0530 Subject: ext4: Add atomic block write documentation Add an initial documentation around atomic writes support in ext4. Acked-by: Darrick J. Wong Signed-off-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/d3893b9f5ad70317abae72046e81e4c180af91bf.1747337952.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4/atomic_writes.rst | 225 +++++++++++++++++++++++ Documentation/filesystems/ext4/overview.rst | 1 + 2 files changed, 226 insertions(+) create mode 100644 Documentation/filesystems/ext4/atomic_writes.rst diff --git a/Documentation/filesystems/ext4/atomic_writes.rst b/Documentation/filesystems/ext4/atomic_writes.rst new file mode 100644 index 000000000000..f65767df3620 --- /dev/null +++ b/Documentation/filesystems/ext4/atomic_writes.rst @@ -0,0 +1,225 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. _atomic_writes: + +Atomic Block Writes +------------------------- + +Introduction +~~~~~~~~~~~~ + +Atomic (untorn) block writes ensure that either the entire write is committed +to disk or none of it is. This prevents "torn writes" during power loss or +system crashes. The ext4 filesystem supports atomic writes (only with Direct +I/O) on regular files with extents, provided the underlying storage device +supports hardware atomic writes. This is supported in the following two ways: + +1. **Single-fsblock Atomic Writes**: + EXT4's supports atomic write operations with a single filesystem block since + v6.13. In this the atomic write unit minimum and maximum sizes are both set + to filesystem blocksize. + e.g. doing atomic write of 16KB with 16KB filesystem blocksize on 64KB + pagesize system is possible. + +2. **Multi-fsblock Atomic Writes with Bigalloc**: + EXT4 now also supports atomic writes spanning multiple filesystem blocks + using a feature known as bigalloc. The atomic write unit's minimum and + maximum sizes are determined by the filesystem block size and cluster size, + based on the underlying device’s supported atomic write unit limits. + +Requirements +~~~~~~~~~~~~ + +Basic requirements for atomic writes in ext4: + + 1. The extents feature must be enabled (default for ext4) + 2. The underlying block device must support atomic writes + 3. For single-fsblock atomic writes: + + 1. A filesystem with appropriate block size (up to the page size) + 4. For multi-fsblock atomic writes: + + 1. The bigalloc feature must be enabled + 2. The cluster size must be appropriately configured + +NOTE: EXT4 does not support software or COW based atomic write, which means +atomic writes on ext4 are only supported if underlying storage device supports +it. + +Multi-fsblock Implementation Details +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The bigalloc feature changes ext4 to allocate in units of multiple filesystem +blocks, also known as clusters. With bigalloc each bit within block bitmap +represents cluster (power of 2 number of blocks) rather than individual +filesystem blocks. +EXT4 supports multi-fsblock atomic writes with bigalloc, subject to the +following constraints. The minimum atomic write size is the larger of the fs +block size and the minimum hardware atomic write unit; and the maximum atomic +write size is smaller of the bigalloc cluster size and the maximum hardware +atomic write unit. Bigalloc ensures that all allocations are aligned to the +cluster size, which satisfies the LBA alignment requirements of the hardware +device if the start of the partition/logical volume is itself aligned correctly. + +Here is the block allocation strategy in bigalloc for atomic writes: + + * For regions with fully mapped extents, no additional work is needed + * For append writes, a new mapped extent is allocated + * For regions that are entirely holes, unwritten extent is created + * For large unwritten extents, the extent gets split into two unwritten + extents of appropriate requested size + * For mixed mapping regions (combinations of holes, unwritten extents, or + mapped extents), ext4_map_blocks() is called in a loop with + EXT4_GET_BLOCKS_ZERO flag to convert the region into a single contiguous + mapped extent by writing zeroes to it and converting any unwritten extents to + written, if found within the range. + +Note: Writing on a single contiguous underlying extent, whether mapped or +unwritten, is not inherently problematic. However, writing to a mixed mapping +region (i.e. one containing a combination of mapped and unwritten extents) +must be avoided when performing atomic writes. + +The reason is that, atomic writes when issued via pwritev2() with the RWF_ATOMIC +flag, requires that either all data is written or none at all. In the event of +a system crash or unexpected power loss during the write operation, the affected +region (when later read) must reflect either the complete old data or the +complete new data, but never a mix of both. + +To enforce this guarantee, we ensure that the write target is backed by +a single, contiguous extent before any data is written. This is critical because +ext4 defers the conversion of unwritten extents to written extents until the I/O +completion path (typically in ->end_io()). If a write is allowed to proceed over +a mixed mapping region (with mapped and unwritten extents) and a failure occurs +mid-write, the system could observe partially updated regions after reboot, i.e. +new data over mapped areas, and stale (old) data over unwritten extents that +were never marked written. This violates the atomicity and/or torn write +prevention guarantee. + +To prevent such torn writes, ext4 proactively allocates a single contiguous +extent for the entire requested region in ``ext4_iomap_alloc`` via +``ext4_map_blocks_atomic()``. EXT4 also force commits the current journalling +transaction in case if allocation is done over mixed mapping. This ensures any +pending metadata updates (like unwritten to written extents conversion) in this +range are in consistent state with the file data blocks, before performing the +actual write I/O. If the commit fails, the whole I/O must be aborted to prevent +from any possible torn writes. +Only after this step, the actual data write operation is performed by the iomap. + +Handling Split Extents Across Leaf Blocks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There can be a special edge case where we have logically and physically +contiguous extents stored in separate leaf nodes of the on-disk extent tree. +This occurs because on-disk extent tree merges only happens within the leaf +blocks except for a case where we have 2-level tree which can get merged and +collapsed entirely into the inode. +If such a layout exists and, in the worst case, the extent status cache entries +are reclaimed due to memory pressure, ``ext4_map_blocks()`` may never return +a single contiguous extent for these split leaf extents. + +To address this edge case, a new get block flag +``EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS flag`` is added to enhance the +``ext4_map_query_blocks()`` lookup behavior. + +This new get block flag allows ``ext4_map_blocks()`` to first check if there is +an entry in the extent status cache for the full range. +If not present, it consults the on-disk extent tree using +``ext4_map_query_blocks()``. +If the located extent is at the end of a leaf node, it probes the next logical +block (lblk) to detect a contiguous extent in the adjacent leaf. + +For now only one additional leaf block is queried to maintain efficiency, as +atomic writes are typically constrained to small sizes +(e.g. [blocksize, clustersize]). + + +Handling Journal transactions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To support multi-fsblock atomic writes, we ensure enough journal credits are +reserved during: + + 1. Block allocation time in ``ext4_iomap_alloc()``. We first query if there + could be a mixed mapping for the underlying requested range. If yes, then we + reserve credits of up to ``m_len``, assuming every alternate block can be + an unwritten extent followed by a hole. + + 2. During ``->end_io()`` call, we make sure a single transaction is started for + doing unwritten-to-written conversion. The loop for conversion is mainly + only required to handle a split extent across leaf blocks. + +How to +------ + +Creating Filesystems with Atomic Write Support +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +First check the atomic write units supported by block device. +See :ref:`atomic_write_bdev_support` for more details. + +For single-fsblock atomic writes with a larger block size +(on systems with block size < page size): + +.. code-block:: bash + + # Create an ext4 filesystem with a 16KB block size + # (requires page size >= 16KB) + mkfs.ext4 -b 16384 /dev/device + +For multi-fsblock atomic writes with bigalloc: + +.. code-block:: bash + + # Create an ext4 filesystem with bigalloc and 64KB cluster size + mkfs.ext4 -F -O bigalloc -b 4096 -C 65536 /dev/device + +Where ``-b`` specifies the block size, ``-C`` specifies the cluster size in bytes, +and ``-O bigalloc`` enables the bigalloc feature. + +Application Interface +~~~~~~~~~~~~~~~~~~~~~ + +Applications can use the ``pwritev2()`` system call with the ``RWF_ATOMIC`` flag +to perform atomic writes: + +.. code-block:: c + + pwritev2(fd, iov, iovcnt, offset, RWF_ATOMIC); + +The write must be aligned to the filesystem's block size and not exceed the +filesystem's maximum atomic write unit size. +See ``generic_atomic_write_valid()`` for more details. + +``statx()`` system call with ``STATX_WRITE_ATOMIC`` flag can provides following +details: + + * ``stx_atomic_write_unit_min``: Minimum size of an atomic write request. + * ``stx_atomic_write_unit_max``: Maximum size of an atomic write request. + * ``stx_atomic_write_segments_max``: Upper limit for segments. The number of + separate memory buffers that can be gathered into a write operation + (e.g., the iovcnt parameter for IOV_ITER). Currently, this is always set to one. + +The STATX_ATTR_WRITE_ATOMIC flag in ``statx->attributes`` is set if atomic +writes are supported. + +.. _atomic_write_bdev_support: + +Hardware Support +---------------- + +The underlying storage device must support atomic write operations. +Modern NVMe and SCSI devices often provide this capability. +The Linux kernel exposes this information through sysfs: + +* ``/sys/block//queue/atomic_write_unit_min`` - Minimum atomic write size +* ``/sys/block//queue/atomic_write_unit_max`` - Maximum atomic write size + +Nonzero values for these attributes indicate that the device supports +atomic writes. + +See Also +-------- + +* :doc:`bigalloc` - Documentation on the bigalloc feature +* :doc:`allocators` - Documentation on block allocation in ext4 +* Support for atomic block writes in 6.13: + https://lwn.net/Articles/1009298/ diff --git a/Documentation/filesystems/ext4/overview.rst b/Documentation/filesystems/ext4/overview.rst index 0fad6eda6e15..9d4054c17ecb 100644 --- a/Documentation/filesystems/ext4/overview.rst +++ b/Documentation/filesystems/ext4/overview.rst @@ -25,3 +25,4 @@ order. .. include:: inlinedata.rst .. include:: eainode.rst .. include:: verity.rst +.. include:: atomic_writes.rst -- cgit v1.2.3-59-g8ed1b From e26268ff1dcae5662c1b96c35f18cfa6ab73d9de Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 16 May 2025 13:38:00 -0400 Subject: ext4: only dirty folios when data journaling regular files fstest generic/388 occasionally reproduces a crash that looks as follows: BUG: kernel NULL pointer dereference, address: 0000000000000000 ... Call Trace: ext4_block_zero_page_range+0x30c/0x380 [ext4] ext4_truncate+0x436/0x440 [ext4] ext4_process_orphan+0x5d/0x110 [ext4] ext4_orphan_cleanup+0x124/0x4f0 [ext4] ext4_fill_super+0x262d/0x3110 [ext4] get_tree_bdev_flags+0x132/0x1d0 vfs_get_tree+0x26/0xd0 vfs_cmd_create+0x59/0xe0 __do_sys_fsconfig+0x4ed/0x6b0 do_syscall_64+0x82/0x170 ... This occurs when processing a symlink inode from the orphan list. The partial block zeroing code in the truncate path calls ext4_dirty_journalled_data() -> folio_mark_dirty(). The latter calls mapping->a_ops->dirty_folio(), but symlink inodes are not assigned an a_ops vector in ext4, hence the crash. To avoid this problem, update the ext4_dirty_journalled_data() helper to only mark the folio dirty on regular files (for which a_ops is assigned). This also matches the journaling logic in the ext4_symlink() creation path, where ext4_handle_dirty_metadata() is called directly. Fixes: d84c9ebdac1e ("ext4: Mark pages with journalled data dirty") Signed-off-by: Brian Foster Link: https://patch.msgid.link/20250516173800.175577-1-bfoster@redhat.com Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@kernel.org --- fs/ext4/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2f59036df800..ce0632094c50 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1119,7 +1119,12 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { - folio_mark_dirty(bh->b_folio); + struct folio *folio = bh->b_folio; + struct inode *inode = folio->mapping->host; + + /* only regular files have a_ops */ + if (S_ISREG(inode->i_mode)) + folio_mark_dirty(folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } -- cgit v1.2.3-59-g8ed1b From 3be4bb7e71477c0b9708bd6f1975db0ffc72cce4 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 19 May 2025 23:49:26 +0530 Subject: ext4: Unwritten to written conversion requires EXT4_EX_NOCACHE This fixes the atomic write patch series after it was rebased on top of extent status cache cleanup series i.e. 'commit 402e38e6b71f57 ("ext4: prevent stale extent cache entries caused by concurrent I/O writeback")' After the above series, EXT4_GET_BLOCKS_IO_CONVERT_EXT flag which has EXT4_GET_BLOCKS_IO_SUBMIT flag set, requires that the io submit context of any kind should pass EXT4_EX_NOCACHE to avoid caching unncecessary extents in the extent status cache. This patch fixes that by adding the EXT4_EX_NOCACHE flag in ext4_convert_unwritten_extents_atomic() for unwritten to written conversion calls to ext4_map_blocks(). Fixes: b86629c2b299 ("ext4: Add multi-fsblock atomic write support with bigalloc") Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/ea0ad9378ff6d31d73f4e53f87548e3a20817689.1747677758.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74c16736ae34..c1de66c23dd2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4827,7 +4827,7 @@ int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; unsigned int credits = 0; - int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT; + int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); -- cgit v1.2.3-59-g8ed1b From 797ac3ca24ca1af396e84d16c79c523cfefe1737 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 19 May 2025 23:49:27 +0530 Subject: ext4: Simplify last in leaf check in ext4_map_query_blocks This simplifies the check for last in leaf in ext4_map_query_blocks() and fixes this cocci warning. cocci warnings: (new ones prefixed by >>) >> fs/ext4/inode.c:573:49-51: WARNING !A || A && B is equivalent to !A || B Fixes: 5bb12b1837c0 ("ext4: Add support for EXT4_GET_BLOCKS_QUERY_LEAF_BLOCKS") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505191524.auftmOwK-lkp@intel.com/ Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/5fd5c806218c83f603c578c95997cf7f6da29d74.1747677758.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ce0632094c50..459ffc6af1d3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -570,8 +570,7 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, * - if the last in leaf is the full requested range */ if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || - ((map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) && - (map->m_len == orig_mlen))) { + map->m_len == orig_mlen) { status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, -- cgit v1.2.3-59-g8ed1b From 9597376bdb6e5830448ba40aacd3ebd705fe35cc Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 19 May 2025 23:49:28 +0530 Subject: ext4: Rename and document EXT4_EX_FILTER to EXT4_EX_QUERY_FILTER Rename EXT4_EX_FILTER to EXT4_EX_QUERY_FILTER to better describe its purpose as a filter mask used specifically in ext4_map_query_blocks(). Add a comment explaining that this macro is used to filter flags needed when querying the on-disk extent tree. We will later use EXT4_EX_QUERY_FILTER mask to add another EXT4_GET_BLOCKS_QUERY needed to lookup in on-disk extent tree. Signed-off-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/51f05d0ba286372eb8693af95bd4b10194b53141.1747677758.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 7 ++++++- fs/ext4/inode.c | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 201afaaa508a..c0489220d3c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -757,7 +757,12 @@ enum { #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 #define EXT4_EX_NOFAIL 0x10000000 -#define EXT4_EX_FILTER 0x70000000 +/* + * ext4_map_query_blocks() uses this filter mask to filter the flags needed to + * pass while lookup/querying of on disk extent tree. + */ +#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\ + EXT4_EX_NOFAIL) /* * Flags used by ext4_free_blocks diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 459ffc6af1d3..d662ff486a82 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -546,7 +546,7 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, unsigned int orig_mlen = map->m_len; unsigned int query_flags = flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF; - flags &= EXT4_EX_FILTER; + flags &= EXT4_EX_QUERY_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(handle, inode, map, flags | query_flags); -- cgit v1.2.3-59-g8ed1b From 618320daa9673ce2b6adae5ad6fbf16e878ff6c9 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 19 May 2025 23:49:29 +0530 Subject: ext4: Simplify flags in ext4_map_query_blocks() Now that we have EXT4_EX_QUERY_FILTER mask, let's use that to simplify the filtering of flags for passing to ext4_ext_map_blocks() in ext4_map_query_blocks() function. This allows us to kill the query_flags local variable which is not needed anymore. Signed-off-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/4ae735e83e6f43341e53e2d289e59156a8360134.1747677758.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 ++- fs/ext4/inode.c | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c0489220d3c4..18373de980f2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -762,7 +762,8 @@ enum { * pass while lookup/querying of on disk extent tree. */ #define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\ - EXT4_EX_NOFAIL) + EXT4_EX_NOFAIL |\ + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) /* * Flags used by ext4_free_blocks diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d662ff486a82..5ddb65d6f8fb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -544,12 +544,10 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, unsigned int status; int retval; unsigned int orig_mlen = map->m_len; - unsigned int query_flags = flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF; flags &= EXT4_EX_QUERY_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(handle, inode, map, - flags | query_flags); + retval = ext4_ext_map_blocks(handle, inode, map, flags); else retval = ext4_ind_map_blocks(handle, inode, map, flags); -- cgit v1.2.3-59-g8ed1b From 7acd1b315cdcc03b11a3aa1f9c9c85d99ddb4f0e Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 19 May 2025 23:49:30 +0530 Subject: ext4: Add a WARN_ON_ONCE for querying LAST_IN_LEAF instead We added the documentation in ext4_map_blocks() for usage of EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF flag. But It's better to add a WARN_ON_ONCE in case if anyone tries using this flag with CREATE to avoid a random issue later. Since depth can change with CREATE and it needs to be re-calculated before using it in there. Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/ee6e82a224c50b432df9ce1ce3333c50182d8473.1747677758.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c1de66c23dd2..b543a46fc809 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4445,9 +4445,11 @@ out: * need to re-calculate the depth as it might have changed due to block * allocation. */ - if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) { + WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE); if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr))) map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF; + } ext4_free_ext_path(path); -- cgit v1.2.3-59-g8ed1b